Diffstat (limited to 'fs')
-rw-r--r--  fs/Kconfig | 27
-rw-r--r--  fs/Makefile | 1
-rw-r--r--  fs/afs/netdevices.c | 3
-rw-r--r--  fs/befs/debug.c | 1
-rw-r--r--  fs/befs/super.c | 1
-rw-r--r--  fs/buffer.c | 35
-rw-r--r--  fs/direct-io.c | 2
-rw-r--r--  fs/ext3/Kconfig | 19
-rw-r--r--  fs/ext3/inode.c | 23
-rw-r--r--  fs/ext3/super.c | 8
-rw-r--r--  fs/ext4/extents.c | 2
-rw-r--r--  fs/ext4/inode.c | 8
-rw-r--r--  fs/ext4/super.c | 9
-rw-r--r--  fs/jbd/commit.c | 7
-rw-r--r--  fs/jbd2/commit.c | 13
-rw-r--r--  fs/jffs2/acl.c | 4
-rw-r--r--  fs/jffs2/malloc.c | 6
-rw-r--r--  fs/libfs.c | 16
-rw-r--r--  fs/lockd/svclock.c | 13
-rw-r--r--  fs/nfs/file.c | 2
-rw-r--r--  fs/nfs/super.c | 2
-rw-r--r--  fs/nfsd/Kconfig | 1
-rw-r--r--  fs/nfsd/nfs3proc.c | 10
-rw-r--r--  fs/nfsd/nfs4callback.c | 47
-rw-r--r--  fs/nfsd/nfs4proc.c | 246
-rw-r--r--  fs/nfsd/nfs4recover.c | 74
-rw-r--r--  fs/nfsd/nfs4state.c | 1196
-rw-r--r--  fs/nfsd/nfs4xdr.c | 633
-rw-r--r--  fs/nfsd/nfsctl.c | 38
-rw-r--r--  fs/nfsd/nfsproc.c | 3
-rw-r--r--  fs/nfsd/nfssvc.c | 88
-rw-r--r--  fs/nfsd/vfs.c | 37
-rw-r--r--  fs/nilfs2/Makefile | 5
-rw-r--r--  fs/nilfs2/alloc.c | 504
-rw-r--r--  fs/nilfs2/alloc.h | 72
-rw-r--r--  fs/nilfs2/bmap.c | 783
-rw-r--r--  fs/nilfs2/bmap.h | 244
-rw-r--r--  fs/nilfs2/bmap_union.h | 42
-rw-r--r--  fs/nilfs2/btnode.c | 316
-rw-r--r--  fs/nilfs2/btnode.h | 58
-rw-r--r--  fs/nilfs2/btree.c | 2269
-rw-r--r--  fs/nilfs2/btree.h | 117
-rw-r--r--  fs/nilfs2/cpfile.c | 925
-rw-r--r--  fs/nilfs2/cpfile.h | 45
-rw-r--r--  fs/nilfs2/dat.c | 430
-rw-r--r--  fs/nilfs2/dat.h | 52
-rw-r--r--  fs/nilfs2/dir.c | 711
-rw-r--r--  fs/nilfs2/direct.c | 436
-rw-r--r--  fs/nilfs2/direct.h | 78
-rw-r--r--  fs/nilfs2/file.c | 160
-rw-r--r--  fs/nilfs2/gcdat.c | 84
-rw-r--r--  fs/nilfs2/gcinode.c | 288
-rw-r--r--  fs/nilfs2/ifile.c | 150
-rw-r--r--  fs/nilfs2/ifile.h | 53
-rw-r--r--  fs/nilfs2/inode.c | 785
-rw-r--r--  fs/nilfs2/ioctl.c | 654
-rw-r--r--  fs/nilfs2/mdt.c | 563
-rw-r--r--  fs/nilfs2/mdt.h | 125
-rw-r--r--  fs/nilfs2/namei.c | 474
-rw-r--r--  fs/nilfs2/nilfs.h | 318
-rw-r--r--  fs/nilfs2/page.c | 540
-rw-r--r--  fs/nilfs2/page.h | 76
-rw-r--r--  fs/nilfs2/recovery.c | 929
-rw-r--r--  fs/nilfs2/sb.h | 102
-rw-r--r--  fs/nilfs2/segbuf.c | 439
-rw-r--r--  fs/nilfs2/segbuf.h | 201
-rw-r--r--  fs/nilfs2/seglist.h | 85
-rw-r--r--  fs/nilfs2/segment.c | 2977
-rw-r--r--  fs/nilfs2/segment.h | 243
-rw-r--r--  fs/nilfs2/sufile.c | 640
-rw-r--r--  fs/nilfs2/sufile.h | 54
-rw-r--r--  fs/nilfs2/super.c | 1323
-rw-r--r--  fs/nilfs2/the_nilfs.c | 637
-rw-r--r--  fs/nilfs2/the_nilfs.h | 298
-rw-r--r--  fs/ocfs2/file.c | 8
-rw-r--r--  fs/proc/task_mmu.c | 4
-rw-r--r--  fs/proc/task_nommu.c | 4
-rw-r--r--  fs/ramfs/inode.c | 19
-rw-r--r--  fs/romfs/Kconfig | 48
-rw-r--r--  fs/romfs/Makefile | 9
-rw-r--r--  fs/romfs/inode.c | 665
-rw-r--r--  fs/romfs/internal.h | 47
-rw-r--r--  fs/romfs/mmap-nommu.c | 75
-rw-r--r--  fs/romfs/storage.c | 261
-rw-r--r--  fs/romfs/super.c | 653
-rw-r--r--  fs/splice.c | 25
-rw-r--r--  fs/squashfs/export.c | 1
-rw-r--r--  fs/super.c | 40
-rw-r--r--  fs/ubifs/budget.c | 37
-rw-r--r--  fs/ubifs/debug.c | 6
-rw-r--r--  fs/ubifs/file.c | 16
-rw-r--r--  fs/ubifs/find.c | 12
-rw-r--r--  fs/ubifs/gc.c | 428
-rw-r--r--  fs/ubifs/journal.c | 7
-rw-r--r--  fs/ubifs/key.h | 6
-rw-r--r--  fs/ubifs/log.c | 5
-rw-r--r--  fs/ubifs/lpt_commit.c | 34
-rw-r--r--  fs/ubifs/recovery.c | 70
-rw-r--r--  fs/ubifs/replay.c | 2
-rw-r--r--  fs/ubifs/sb.c | 36
-rw-r--r--  fs/ubifs/super.c | 37
-rw-r--r--  fs/ubifs/tnc.c | 2
-rw-r--r--  fs/ubifs/ubifs-media.h | 30
-rw-r--r--  fs/ubifs/ubifs.h | 13
105 files changed, 23102 insertions(+), 1364 deletions(-)
diff --git a/fs/Kconfig b/fs/Kconfig
index 86b203fc3c56..9f7270f36b2a 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -175,9 +175,34 @@ source "fs/qnx4/Kconfig"
 source "fs/romfs/Kconfig"
 source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
-
 source "fs/exofs/Kconfig"
 
+config NILFS2_FS
+	tristate "NILFS2 file system support (EXPERIMENTAL)"
+	depends on BLOCK && EXPERIMENTAL
+	select CRC32
+	help
+	  NILFS2 is a log-structured file system (LFS) supporting continuous
+	  snapshotting.  In addition to versioning capability of the entire
+	  file system, users can even restore files mistakenly overwritten or
+	  destroyed just a few seconds ago.  Since this file system can keep
+	  consistency like conventional LFS, it achieves quick recovery after
+	  system crashes.
+
+	  NILFS2 creates a number of checkpoints every few seconds or per
+	  synchronous write basis (unless there is no change).  Users can
+	  select significant versions among continuously created checkpoints,
+	  and can change them into snapshots which will be preserved for long
+	  periods until they are changed back to checkpoints.  Each
+	  snapshot is mountable as a read-only file system concurrently with
+	  its writable mount, and this feature is convenient for online backup.
+
+	  Some features including atime, extended attributes, and POSIX ACLs
+	  are not supported yet.
+
+	  To compile this file system support as a module, choose M here: the
+	  module will be called nilfs2.  If unsure, say N.
+
 endif # MISC_FILESYSTEMS
 
 menuconfig NETWORK_FILESYSTEMS
diff --git a/fs/Makefile b/fs/Makefile
index 70b2aed87133..af6d04700d9c 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -114,6 +114,7 @@ obj-$(CONFIG_JFS_FS) += jfs/
 obj-$(CONFIG_XFS_FS)		+= xfs/
 obj-$(CONFIG_9P_FS)		+= 9p/
 obj-$(CONFIG_AFS_FS)		+= afs/
+obj-$(CONFIG_NILFS2_FS)	+= nilfs2/
 obj-$(CONFIG_BEFS_FS)		+= befs/
 obj-$(CONFIG_HOSTFS)		+= hostfs/
 obj-$(CONFIG_HPPFS)		+= hppfs/
diff --git a/fs/afs/netdevices.c b/fs/afs/netdevices.c
index 49f189423063..7ad36506c256 100644
--- a/fs/afs/netdevices.c
+++ b/fs/afs/netdevices.c
@@ -20,8 +20,7 @@ int afs_get_MAC_address(u8 *mac, size_t maclen)
 	struct net_device *dev;
 	int ret = -ENODEV;
 
-	if (maclen != ETH_ALEN)
-		BUG();
+	BUG_ON(maclen != ETH_ALEN);
 
 	rtnl_lock();
 	dev = __dev_getfirstbyhwtype(&init_net, ARPHRD_ETHER);
diff --git a/fs/befs/debug.c b/fs/befs/debug.c
index b8e304a0661e..622e73775c83 100644
--- a/fs/befs/debug.c
+++ b/fs/befs/debug.c
@@ -17,6 +17,7 @@
 #include <linux/spinlock.h>
 #include <linux/kernel.h>
 #include <linux/fs.h>
+#include <linux/slab.h>
 
 #endif				/* __KERNEL__ */
 
diff --git a/fs/befs/super.c b/fs/befs/super.c
index 41f2b4d0093e..ca40f828f64d 100644
--- a/fs/befs/super.c
+++ b/fs/befs/super.c
@@ -8,6 +8,7 @@
  */
 
 #include <linux/fs.h>
+#include <asm/page.h>		/* for PAGE_SIZE */
 
 #include "befs.h"
 #include "super.h"
diff --git a/fs/buffer.c b/fs/buffer.c
index 5d55a896ff78..13edf7ad3ff1 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -737,7 +737,7 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 {
 	struct buffer_head *bh;
 	struct list_head tmp;
-	struct address_space *mapping;
+	struct address_space *mapping, *prev_mapping = NULL;
 	int err = 0, err2;
 
 	INIT_LIST_HEAD(&tmp);
@@ -762,7 +762,18 @@ static int fsync_buffers_list(spinlock_t *lock, struct list_head *list)
 				 * contents - it is a noop if I/O is still in
 				 * flight on potentially older contents.
 				 */
-				ll_rw_block(SWRITE_SYNC, 1, &bh);
+				ll_rw_block(SWRITE_SYNC_PLUG, 1, &bh);
+
+				/*
+				 * Kick off IO for the previous mapping. Note
+				 * that we will not run the very last mapping,
+				 * wait_on_buffer() will do that for us
+				 * through sync_buffer().
+				 */
+				if (prev_mapping && prev_mapping != mapping)
+					blk_run_address_space(prev_mapping);
+				prev_mapping = mapping;
+
 				brelse(bh);
 				spin_lock(lock);
 			}
@@ -1585,6 +1596,16 @@ EXPORT_SYMBOL(unmap_underlying_metadata);
  * locked buffer.  This only can happen if someone has written the buffer
  * directly, with submit_bh().  At the address_space level PageWriteback
  * prevents this contention from occurring.
+ *
+ * If block_write_full_page() is called with wbc->sync_mode ==
+ * WB_SYNC_ALL, the writes are posted using WRITE_SYNC_PLUG; this
+ * causes the writes to be flagged as synchronous writes, but the
+ * block device queue will NOT be unplugged, since usually many pages
+ * will be pushed out before the higher-level caller actually
+ * waits for the writes to be completed.  The various wait functions,
+ * such as wait_on_writeback_range() will ultimately call sync_page()
+ * which will ultimately call blk_run_backing_dev(), which will end up
+ * unplugging the device queue.
  */
 static int __block_write_full_page(struct inode *inode, struct page *page,
 			get_block_t *get_block, struct writeback_control *wbc)
@@ -1595,7 +1616,8 @@ static int __block_write_full_page(struct inode *inode, struct page *page,
 	struct buffer_head *bh, *head;
 	const unsigned blocksize = 1 << inode->i_blkbits;
 	int nr_underway = 0;
-	int write_op = (wbc->sync_mode == WB_SYNC_ALL ? WRITE_SYNC : WRITE);
+	int write_op = (wbc->sync_mode == WB_SYNC_ALL ?
+			WRITE_SYNC_PLUG : WRITE);
 
 	BUG_ON(!PageLocked(page));
 
@@ -2957,12 +2979,13 @@ void ll_rw_block(int rw, int nr, struct buffer_head *bhs[])
 	for (i = 0; i < nr; i++) {
 		struct buffer_head *bh = bhs[i];
 
-		if (rw == SWRITE || rw == SWRITE_SYNC)
+		if (rw == SWRITE || rw == SWRITE_SYNC || rw == SWRITE_SYNC_PLUG)
 			lock_buffer(bh);
 		else if (!trylock_buffer(bh))
 			continue;
 
-		if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC) {
+		if (rw == WRITE || rw == SWRITE || rw == SWRITE_SYNC ||
+		    rw == SWRITE_SYNC_PLUG) {
 			if (test_clear_buffer_dirty(bh)) {
 				bh->b_end_io = end_buffer_write_sync;
 				get_bh(bh);
@@ -2998,7 +3021,7 @@ int sync_dirty_buffer(struct buffer_head *bh)
 	if (test_clear_buffer_dirty(bh)) {
 		get_bh(bh);
 		bh->b_end_io = end_buffer_write_sync;
-		ret = submit_bh(WRITE, bh);
+		ret = submit_bh(WRITE_SYNC, bh);
 		wait_on_buffer(bh);
 		if (buffer_eopnotsupp(bh)) {
 			clear_buffer_eopnotsupp(bh);
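
An aside on the plugging idea behind the buffer.c changes above: a plugged queue batches submissions and dispatches them in one go when something unplugs it, which is what sync_buffer()/blk_run_address_space() do for the real block layer. The sketch below is plain standalone C with made-up names -- a toy model of the batching behaviour, not kernel code.

#include <stdio.h>

/* Toy "device queue": requests accumulate while plugged and go out in
 * one batch when the queue is unplugged. */
struct toy_queue {
	int pending;	/* requests queued but not yet dispatched */
	int plugged;	/* 1 = hold requests back */
};

static void toy_submit(struct toy_queue *q, int block)
{
	q->pending++;
	if (!q->plugged) {	/* WRITE_SYNC-like: kick after every request */
		printf("dispatch %d request(s) after block %d\n",
		       q->pending, block);
		q->pending = 0;
	}
}

static void toy_unplug(struct toy_queue *q)	/* the blk_run_*()-style kick */
{
	if (q->pending)
		printf("dispatch %d batched request(s)\n", q->pending);
	q->pending = 0;
}

int main(void)
{
	struct toy_queue q = { 0, 1 };	/* plugged, as with WRITE_SYNC_PLUG */
	int b;

	for (b = 0; b < 4; b++)
		toy_submit(&q, b);	/* nothing dispatched yet */
	toy_unplug(&q);			/* one batch, as the eventual waiter would */
	return 0;
}

Submitting all four requests before the single unplug is exactly what fsync_buffers_list() and the journal commit paths are after: fewer queue kicks and better request merging.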
diff --git a/fs/direct-io.c b/fs/direct-io.c
index b6d43908ff7a..da258e7249cc 100644
--- a/fs/direct-io.c
+++ b/fs/direct-io.c
@@ -1126,7 +1126,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode,
 	int acquire_i_mutex = 0;
 
 	if (rw & WRITE)
-		rw = WRITE_SYNC;
+		rw = WRITE_ODIRECT;
 
 	if (bdev)
 		bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev));
diff --git a/fs/ext3/Kconfig b/fs/ext3/Kconfig
index 8e0cfe44b0fc..fb3c1a21b135 100644
--- a/fs/ext3/Kconfig
+++ b/fs/ext3/Kconfig
@@ -28,6 +28,25 @@ config EXT3_FS
 	  To compile this file system support as a module, choose M here: the
 	  module will be called ext3.
 
+config EXT3_DEFAULTS_TO_ORDERED
+	bool "Default to 'data=ordered' in ext3 (legacy option)"
+	depends on EXT3_FS
+	help
+	  If a filesystem does not explicitly specify a data ordering
+	  mode, and the journal capability allowed it, ext3 used to
+	  historically default to 'data=ordered'.
+
+	  That was a rather unfortunate choice, because it leads to all
+	  kinds of latency problems, and the 'data=writeback' mode is more
+	  appropriate these days.
+
+	  You should probably always answer 'n' here, and if you really
+	  want to use 'data=ordered' mode, set it in the filesystem itself
+	  with 'tune2fs -o journal_data_ordered'.
+
+	  But if you really want to enable the legacy default, you can do
+	  so by answering 'y' to this question.
+
 config EXT3_FS_XATTR
 	bool "Ext3 extended attributes"
 	depends on EXT3_FS
diff --git a/fs/ext3/inode.c b/fs/ext3/inode.c
index 466a332e0bd1..fcfa24361856 100644
--- a/fs/ext3/inode.c
+++ b/fs/ext3/inode.c
@@ -1521,12 +1521,16 @@ static int ext3_ordered_writepage(struct page *page,
 	if (!page_has_buffers(page)) {
 		create_empty_buffers(page, inode->i_sb->s_blocksize,
 				(1 << BH_Dirty)|(1 << BH_Uptodate));
-	} else if (!walk_page_buffers(NULL, page_buffers(page), 0, PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
-		/* Provide NULL instead of get_block so that we catch bugs if buffers weren't really mapped */
-		return block_write_full_page(page, NULL, wbc);
+		page_bufs = page_buffers(page);
+	} else {
+		page_bufs = page_buffers(page);
+		if (!walk_page_buffers(NULL, page_bufs, 0, PAGE_CACHE_SIZE,
+				       NULL, buffer_unmapped)) {
+			/* Provide NULL get_block() to catch bugs if buffers
+			 * weren't really mapped */
+			return block_write_full_page(page, NULL, wbc);
+		}
 	}
-	page_bufs = page_buffers(page);
-
 	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
 
 	if (IS_ERR(handle)) {
@@ -1581,6 +1585,15 @@ static int ext3_writeback_writepage(struct page *page,
 	if (ext3_journal_current_handle())
 		goto out_fail;
 
+	if (page_has_buffers(page)) {
+		if (!walk_page_buffers(NULL, page_buffers(page), 0,
+				       PAGE_CACHE_SIZE, NULL, buffer_unmapped)) {
+			/* Provide NULL get_block() to catch bugs if buffers
+			 * weren't really mapped */
+			return block_write_full_page(page, NULL, wbc);
+		}
+	}
+
 	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
 	if (IS_ERR(handle)) {
 		ret = PTR_ERR(handle);
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 9e5b8e387e1e..599dbfe504c3 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -44,6 +44,12 @@
 #include "acl.h"
 #include "namei.h"
 
+#ifdef CONFIG_EXT3_DEFAULTS_TO_ORDERED
+  #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_ORDERED_DATA
+#else
+  #define EXT3_MOUNT_DEFAULT_DATA_MODE EXT3_MOUNT_WRITEBACK_DATA
+#endif
+
 static int ext3_load_journal(struct super_block *, struct ext3_super_block *,
 			     unsigned long journal_devnum);
 static int ext3_create_journal(struct super_block *, struct ext3_super_block *,
@@ -1919,7 +1925,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent)
 			   cope, else JOURNAL_DATA */
 			if (journal_check_available_features
 			    (sbi->s_journal, 0, 0, JFS_FEATURE_INCOMPAT_REVOKE))
-				set_opt(sbi->s_mount_opt, ORDERED_DATA);
+				set_opt(sbi->s_mount_opt, DEFAULT_DATA_MODE);
 			else
 				set_opt(sbi->s_mount_opt, JOURNAL_DATA);
 			break;
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index ac77d8b8251d..6132353dcf62 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -342,7 +342,7 @@ static int ext4_valid_extent_idx(struct inode *inode,
 	ext4_fsblk_t block = idx_pblock(ext_idx);
 	struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
 	if (unlikely(block < le32_to_cpu(es->s_first_data_block) ||
-		     (block > ext4_blocks_count(es))))
+		     (block >= ext4_blocks_count(es))))
 		return 0;
 	else
 		return 1;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index a2e7952bc5f9..c6bd6ced3bb7 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -372,16 +372,16 @@ static int ext4_block_to_path(struct inode *inode,
 }
 
 static int __ext4_check_blockref(const char *function, struct inode *inode,
-				 unsigned int *p, unsigned int max) {
+				 __le32 *p, unsigned int max) {
 
 	unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es);
-	unsigned int *bref = p;
+	__le32 *bref = p;
 	while (bref < p+max) {
-		if (unlikely(*bref >= maxblocks)) {
+		if (unlikely(le32_to_cpu(*bref) >= maxblocks)) {
 			ext4_error(inode->i_sb, function,
 				   "block reference %u >= max (%u) "
 				   "in inode #%lu, offset=%d",
-				   *bref, maxblocks,
+				   le32_to_cpu(*bref), maxblocks,
 				   inode->i_ino, (int)(bref-p));
 			return -EIO;
 		}
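
The type change from unsigned int * to __le32 * in __ext4_check_blockref above is not cosmetic: on-disk ext4 block references are little-endian, so the raw word must go through le32_to_cpu() before any comparison. A standalone sketch of the hazard follows; the helper is a stand-in for the kernel's le32_to_cpu() and the values are made up for illustration.

#include <stdint.h>
#include <stdio.h>

/* Stand-in for le32_to_cpu(): build the host-order value byte by byte,
 * so the result is correct on any host endianness. */
static uint32_t toy_le32_to_cpu(const uint8_t b[4])
{
	return (uint32_t)b[0] | (uint32_t)b[1] << 8 |
	       (uint32_t)b[2] << 16 | (uint32_t)b[3] << 24;
}

int main(void)
{
	/* Block reference 0x00000102 as stored on disk (little-endian). */
	uint8_t raw[4] = { 0x02, 0x01, 0x00, 0x00 };
	uint32_t maxblocks = 0x200;
	uint32_t ref = toy_le32_to_cpu(raw);

	/* Reinterpreting the raw bytes as a native uint32_t on a
	 * big-endian host would read 0x02010000 and wrongly report the
	 * reference as out of range (-EIO). */
	printf("ref=0x%x, out of range: %s\n", ref,
	       ref >= maxblocks ? "yes" : "no");
	return 0;
}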
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9987bba99db3..2958f4e6f222 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -2508,6 +2508,15 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 	if (EXT4_BLOCKS_PER_GROUP(sb) == 0)
 		goto cantfind_ext4;
 
+	/* check blocks count against device size */
+	blocks_count = sb->s_bdev->bd_inode->i_size >> sb->s_blocksize_bits;
+	if (blocks_count && ext4_blocks_count(es) > blocks_count) {
+		printk(KERN_WARNING "EXT4-fs: bad geometry: block count %llu "
+		       "exceeds size of device (%llu blocks)\n",
+		       ext4_blocks_count(es), blocks_count);
+		goto failed_mount;
+	}
+
 	/*
 	 * It makes no sense for the first data block to be beyond the end
 	 * of the filesystem.
diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c
index f8077b9c8981..a8e8513a78a9 100644
--- a/fs/jbd/commit.c
+++ b/fs/jbd/commit.c
@@ -351,8 +351,13 @@ void journal_commit_transaction(journal_t *journal)
 	spin_lock(&journal->j_state_lock);
 	commit_transaction->t_state = T_LOCKED;
 
+	/*
+	 * Use plugged writes here, since we want to submit several before
+	 * we unplug the device. We don't do explicit unplugging in here,
+	 * instead we rely on sync_buffer() doing the unplug for us.
+	 */
 	if (commit_transaction->t_synchronous_commit)
-		write_op = WRITE_SYNC;
+		write_op = WRITE_SYNC_PLUG;
 	spin_lock(&commit_transaction->t_handle_lock);
 	while (commit_transaction->t_updates) {
 		DEFINE_WAIT(wait);
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 4ea72377c7a2..073c8c3df7cd 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -138,7 +138,7 @@ static int journal_submit_commit_record(journal_t *journal,
 		set_buffer_ordered(bh);
 		barrier_done = 1;
 	}
-	ret = submit_bh(WRITE_SYNC, bh);
+	ret = submit_bh(WRITE_SYNC_PLUG, bh);
 	if (barrier_done)
 		clear_buffer_ordered(bh);
 
@@ -159,7 +159,7 @@ static int journal_submit_commit_record(journal_t *journal,
 		lock_buffer(bh);
 		set_buffer_uptodate(bh);
 		clear_buffer_dirty(bh);
-		ret = submit_bh(WRITE_SYNC, bh);
+		ret = submit_bh(WRITE_SYNC_PLUG, bh);
 	}
 	*cbh = bh;
 	return ret;
@@ -190,7 +190,7 @@ retry:
 	set_buffer_uptodate(bh);
 	bh->b_end_io = journal_end_buffer_io_sync;
 
-	ret = submit_bh(WRITE_SYNC, bh);
+	ret = submit_bh(WRITE_SYNC_PLUG, bh);
 	if (ret) {
 		unlock_buffer(bh);
 		return ret;
@@ -402,8 +402,13 @@ void jbd2_journal_commit_transaction(journal_t *journal)
 	spin_lock(&journal->j_state_lock);
 	commit_transaction->t_state = T_LOCKED;
 
+	/*
+	 * Use plugged writes here, since we want to submit several before
+	 * we unplug the device. We don't do explicit unplugging in here,
+	 * instead we rely on sync_buffer() doing the unplug for us.
+	 */
 	if (commit_transaction->t_synchronous_commit)
-		write_op = WRITE_SYNC;
+		write_op = WRITE_SYNC_PLUG;
 	stats.u.run.rs_wait = commit_transaction->t_max_wait;
 	stats.u.run.rs_locked = jiffies;
 	stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
diff --git a/fs/jffs2/acl.c b/fs/jffs2/acl.c
index 77ccf8cb0823..043740dde20c 100644
--- a/fs/jffs2/acl.c
+++ b/fs/jffs2/acl.c
@@ -38,12 +38,12 @@ static int jffs2_acl_count(size_t size)
 	size_t s;
 
 	size -= sizeof(struct jffs2_acl_header);
-	s = size - 4 * sizeof(struct jffs2_acl_entry_short);
-	if (s < 0) {
+	if (size < 4 * sizeof(struct jffs2_acl_entry_short)) {
 		if (size % sizeof(struct jffs2_acl_entry_short))
 			return -1;
 		return size / sizeof(struct jffs2_acl_entry_short);
 	} else {
+		s = size - 4 * sizeof(struct jffs2_acl_entry_short);
 		if (s % sizeof(struct jffs2_acl_entry))
 			return -1;
 		return s / sizeof(struct jffs2_acl_entry) + 4;
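
The jffs2_acl_count() fix above is a classic unsigned-underflow repair: a size_t can never be negative, so the old "s = size - 4 * sizeof(...); if (s < 0)" test was dead code and the subtraction silently wrapped. A minimal standalone demonstration in plain C, with hypothetical sizes:

#include <stdio.h>
#include <stddef.h>

int main(void)
{
	size_t size = 8;		/* fewer bytes than... */
	size_t four_short = 16;		/* ...four short ACL entries */
	size_t s = size - four_short;	/* wraps to a huge positive value */

	/* (s < 0) can never be true for an unsigned type, so the old
	 * small-ACL branch was unreachable; the fix compares before
	 * subtracting, exactly as the patched code does. */
	if (size < four_short)
		printf("small-ACL path taken (correct check)\n");
	printf("s wrapped to %zu\n", s);
	return 0;
}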
diff --git a/fs/jffs2/malloc.c b/fs/jffs2/malloc.c
index f9211252b5f1..9eff2bdae8a7 100644
--- a/fs/jffs2/malloc.c
+++ b/fs/jffs2/malloc.c
@@ -284,10 +284,9 @@ void jffs2_free_inode_cache(struct jffs2_inode_cache *x)
 struct jffs2_xattr_datum *jffs2_alloc_xattr_datum(void)
 {
 	struct jffs2_xattr_datum *xd;
-	xd = kmem_cache_alloc(xattr_datum_cache, GFP_KERNEL);
+	xd = kmem_cache_zalloc(xattr_datum_cache, GFP_KERNEL);
 	dbg_memalloc("%p\n", xd);
 
-	memset(xd, 0, sizeof(struct jffs2_xattr_datum));
 	xd->class = RAWNODE_CLASS_XATTR_DATUM;
 	xd->node = (void *)xd;
 	INIT_LIST_HEAD(&xd->xindex);
@@ -303,10 +302,9 @@ void jffs2_free_xattr_datum(struct jffs2_xattr_datum *xd)
 struct jffs2_xattr_ref *jffs2_alloc_xattr_ref(void)
 {
 	struct jffs2_xattr_ref *ref;
-	ref = kmem_cache_alloc(xattr_ref_cache, GFP_KERNEL);
+	ref = kmem_cache_zalloc(xattr_ref_cache, GFP_KERNEL);
 	dbg_memalloc("%p\n", ref);
 
-	memset(ref, 0, sizeof(struct jffs2_xattr_ref));
 	ref->class = RAWNODE_CLASS_XATTR_REF;
 	ref->node = (void *)ref;
 	return ref;
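
kmem_cache_zalloc() simply folds the allocate-then-zero idiom into one call; note also that the old code's unconditional memset() would have dereferenced a NULL pointer on allocation failure. A rough user-space stand-in for what the helper amounts to (malloc here plays the slab allocator; all names are invented):

#include <stdlib.h>
#include <string.h>

/* Stand-in for kmem_cache_zalloc(): allocate and zero in one step, so
 * callers never memset() through an unchecked NULL pointer. */
static void *toy_cache_zalloc(size_t objsize)
{
	void *obj = malloc(objsize);	/* plays kmem_cache_alloc() */

	if (obj)
		memset(obj, 0, objsize);
	return obj;
}

int main(void)
{
	struct { int class; void *node; } *xd = toy_cache_zalloc(sizeof(*xd));

	free(xd);
	return 0;
}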
diff --git a/fs/libfs.c b/fs/libfs.c
index 4910a36f516e..cd223190c4e9 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -575,6 +575,21 @@ ssize_t memory_read_from_buffer(void *to, size_t count, loff_t *ppos,
  * possibly a read which collects the result - which is stored in a
  * file-local buffer.
  */
+
+void simple_transaction_set(struct file *file, size_t n)
+{
+	struct simple_transaction_argresp *ar = file->private_data;
+
+	BUG_ON(n > SIMPLE_TRANSACTION_LIMIT);
+
+	/*
+	 * The barrier ensures that ar->size will really remain zero until
+	 * ar->data is ready for reading.
+	 */
+	smp_mb();
+	ar->size = n;
+}
+
 char *simple_transaction_get(struct file *file, const char __user *buf, size_t size)
 {
 	struct simple_transaction_argresp *ar;
@@ -820,6 +835,7 @@ EXPORT_SYMBOL(simple_sync_file);
 EXPORT_SYMBOL(simple_unlink);
 EXPORT_SYMBOL(simple_read_from_buffer);
 EXPORT_SYMBOL(memory_read_from_buffer);
+EXPORT_SYMBOL(simple_transaction_set);
 EXPORT_SYMBOL(simple_transaction_get);
 EXPORT_SYMBOL(simple_transaction_read);
 EXPORT_SYMBOL(simple_transaction_release);
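
The smp_mb() in the new simple_transaction_set() implements a publish pattern: fill ar->data completely, then make ar->size nonzero, so any reader that observes a nonzero size is guaranteed to see the finished payload. Below is a rough user-space analogue using C11 release/acquire atomics -- toy names only; the kernel code uses a full barrier plus a plain store rather than C11 atomics.

#include <stdatomic.h>
#include <string.h>

struct toy_argresp {
	char data[64];
	atomic_size_t size;	/* stays 0 until data is complete */
};

/* Writer: payload first, then publish the length with release order --
 * the analogue of the smp_mb() + "ar->size = n" pairing in libfs. */
static void toy_transaction_set(struct toy_argresp *ar, const char *msg)
{
	size_t n = strlen(msg);

	memcpy(ar->data, msg, n);
	atomic_store_explicit(&ar->size, n, memory_order_release);
}

/* Reader: acquiring a nonzero size guarantees the matching data. */
static size_t toy_transaction_read(struct toy_argresp *ar, char *out)
{
	size_t n = atomic_load_explicit(&ar->size, memory_order_acquire);

	if (n)
		memcpy(out, ar->data, n);
	return n;
}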
diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c
index 763b78a6e9de..83ee34203bd7 100644
--- a/fs/lockd/svclock.c
+++ b/fs/lockd/svclock.c
@@ -426,8 +426,15 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 		ret = nlm_granted;
 		goto out;
 	case -EAGAIN:
+		/*
+		 * If this is a blocking request for an
+		 * already pending lock request then we need
+		 * to put it back on lockd's block list
+		 */
+		if (wait)
+			break;
 		ret = nlm_lck_denied;
-		break;
+		goto out;
 	case FILE_LOCK_DEFERRED:
 		if (wait)
 			break;
@@ -443,10 +450,6 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file,
 		goto out;
 	}
 
-	ret = nlm_lck_denied;
-	if (!wait)
-		goto out;
-
 	ret = nlm_lck_blocked;
 
 	/* Append to list of blocked */
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 3523b895eb4b..5a97bcfe03e5 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -516,8 +516,6 @@ static int nfs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
 		goto out_unlock;
 
 	ret = nfs_updatepage(filp, page, 0, pagelen);
-	if (ret == 0)
-		ret = pagelen;
 out_unlock:
 	unlock_page(page);
 	if (ret)
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 82eaadbff408..6717200923fe 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1228,7 +1228,6 @@ static int nfs_parse_mount_options(char *raw,
 				goto out_nomem;
 			token = match_token(string,
 					    nfs_xprt_protocol_tokens, args);
-			kfree(string);
 
 			switch (token) {
 			case Opt_xprt_udp:
@@ -1258,6 +1257,7 @@ static int nfs_parse_mount_options(char *raw,
 				goto out_nomem;
 			token = match_token(string,
 					    nfs_xprt_protocol_tokens, args);
+			kfree(string);
 
 			switch (token) {
 			case Opt_xprt_udp:
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 44d7d04dab95..503b9da159a3 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -1,6 +1,7 @@
 config NFSD
 	tristate "NFS server support"
 	depends on INET
+	depends on FILE_LOCKING
 	select LOCKD
 	select SUNRPC
 	select EXPORTFS
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index 9dbd2eb91281..7c9fe838f038 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -18,6 +18,7 @@
 #include <linux/unistd.h>
 #include <linux/slab.h>
 #include <linux/major.h>
+#include <linux/magic.h>
 
 #include <linux/sunrpc/svc.h>
 #include <linux/nfsd/nfsd.h>
@@ -202,6 +203,7 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
 					struct nfsd3_writeres  *resp)
 {
 	__be32	nfserr;
+	unsigned long cnt = argp->len;
 
 	dprintk("nfsd: WRITE(3)    %s %d bytes at %ld%s\n",
 				SVCFH_fmt(&argp->fh),
@@ -214,9 +216,9 @@ nfsd3_proc_write(struct svc_rqst *rqstp, struct nfsd3_writeargs *argp,
 	nfserr = nfsd_write(rqstp, &resp->fh, NULL,
 				   argp->offset,
 				   rqstp->rq_vec, argp->vlen,
-				   argp->len,
+				   &cnt,
 				   &resp->committed);
-	resp->count = argp->count;
+	resp->count = cnt;
 	RETURN_STATUS(nfserr);
 }
 
@@ -569,7 +571,7 @@ nfsd3_proc_fsinfo(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
 	struct super_block *sb = argp->fh.fh_dentry->d_inode->i_sb;
 
 	/* Note that we don't care for remote fs's here */
-	if (sb->s_magic == 0x4d44 /* MSDOS_SUPER_MAGIC */) {
+	if (sb->s_magic == MSDOS_SUPER_MAGIC) {
 		resp->f_properties = NFS3_FSF_BILLYBOY;
 	}
 	resp->f_maxfilesize = sb->s_maxbytes;
@@ -610,7 +612,7 @@ nfsd3_proc_pathconf(struct svc_rqst * rqstp, struct nfsd_fhandle *argp,
 		resp->p_link_max = EXT2_LINK_MAX;
 		resp->p_name_max = EXT2_NAME_LEN;
 		break;
-	case 0x4d44:	/* MSDOS_SUPER_MAGIC */
+	case MSDOS_SUPER_MAGIC:
 		resp->p_case_insensitive = 1;
 		resp->p_case_preserving = 0;
 		break;
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index c464181b5994..290289bd44f7 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -218,7 +218,7 @@ static int
 encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
 {
 	__be32 *p;
-	int len = cb_rec->cbr_fhlen;
+	int len = cb_rec->cbr_fh.fh_size;
 
 	RESERVE_SPACE(12+sizeof(cb_rec->cbr_stateid) + len);
 	WRITE32(OP_CB_RECALL);
@@ -226,7 +226,7 @@ encode_cb_recall(struct xdr_stream *xdr, struct nfs4_cb_recall *cb_rec)
 	WRITEMEM(&cb_rec->cbr_stateid.si_opaque, sizeof(stateid_opaque_t));
 	WRITE32(cb_rec->cbr_trunc);
 	WRITE32(len);
-	WRITEMEM(cb_rec->cbr_fhval, len);
+	WRITEMEM(&cb_rec->cbr_fh.fh_base, len);
 	return 0;
 }
 
@@ -361,9 +361,8 @@ static struct rpc_program cb_program = {
 /* Reference counting, callback cleanup, etc., all look racy as heck.
  * And why is cb_set an atomic? */
 
-static int do_probe_callback(void *data)
+static struct rpc_clnt *setup_callback_client(struct nfs4_client *clp)
 {
-	struct nfs4_client *clp = data;
 	struct sockaddr_in	addr;
 	struct nfs4_callback    *cb = &clp->cl_callback;
 	struct rpc_timeout	timeparms = {
@@ -384,17 +383,10 @@ static int do_probe_callback(void *data)
 		.flags		= (RPC_CLNT_CREATE_NOPING | RPC_CLNT_CREATE_QUIET),
 		.client_name    = clp->cl_principal,
 	};
-	struct rpc_message msg = {
-		.rpc_proc       = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
-		.rpc_argp       = clp,
-	};
 	struct rpc_clnt *client;
-	int status;
 
-	if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5)) {
-		status = nfserr_cb_path_down;
-		goto out_err;
-	}
+	if (!clp->cl_principal && (clp->cl_flavor >= RPC_AUTH_GSS_KRB5))
+		return ERR_PTR(-EINVAL);
 
 	/* Initialize address */
 	memset(&addr, 0, sizeof(addr));
@@ -404,9 +396,29 @@ static int do_probe_callback(void *data)
 
 	/* Create RPC client */
 	client = rpc_create(&args);
+	if (IS_ERR(client))
+		dprintk("NFSD: couldn't create callback client: %ld\n",
+			PTR_ERR(client));
+	return client;
+
+}
+
+static int do_probe_callback(void *data)
+{
+	struct nfs4_client *clp = data;
+	struct nfs4_callback *cb = &clp->cl_callback;
+	struct rpc_message msg = {
+		.rpc_proc       = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL],
+		.rpc_argp       = clp,
+	};
+	struct rpc_clnt *client;
+	int status;
+
+	client = setup_callback_client(clp);
 	if (IS_ERR(client)) {
-		dprintk("NFSD: couldn't create callback client\n");
 		status = PTR_ERR(client);
+		dprintk("NFSD: couldn't create callback client: %d\n",
+			status);
 		goto out_err;
 	}
 
@@ -422,10 +434,10 @@ static int do_probe_callback(void *data)
 out_release_client:
 	rpc_shutdown_client(client);
 out_err:
-	dprintk("NFSD: warning: no callback path to client %.*s\n",
-		(int)clp->cl_name.len, clp->cl_name.data);
+	dprintk("NFSD: warning: no callback path to client %.*s: error %d\n",
+		(int)clp->cl_name.len, clp->cl_name.data, status);
 	put_nfs4_client(clp);
-	return status;
+	return 0;
 }
 
 /*
@@ -451,7 +463,6 @@ nfsd4_probe_callback(struct nfs4_client *clp)
 
 /*
  * called with dp->dl_count inc'ed.
- * nfs4_lock_state() may or may not have been called.
  */
 void
 nfsd4_cb_recall(struct nfs4_delegation *dp)
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 9fa60a3ad48c..b2883e9c6381 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -93,6 +93,21 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
 	open->op_truncate = 0;
 
 	if (open->op_create) {
+		/* FIXME: check session persistence and pnfs flags.
+		 * The nfsv4.1 spec requires the following semantics:
+		 *
+		 * Persistent   | pNFS   | Server REQUIRED | Client Allowed
+		 * Reply Cache  | server |                 |
+		 * -------------+--------+-----------------+--------------------
+		 * no           | no     | EXCLUSIVE4_1    | EXCLUSIVE4_1
+		 *              |        |                 | (SHOULD)
+		 *              |        | and EXCLUSIVE4  | or EXCLUSIVE4
+		 *              |        |                 | (SHOULD NOT)
+		 * no           | yes    | EXCLUSIVE4_1    | EXCLUSIVE4_1
+		 * yes          | no     | GUARDED4        | GUARDED4
+		 * yes          | yes    | GUARDED4        | GUARDED4
+		 */
+
 		/*
 		 * Note: create modes (UNCHECKED,GUARDED...) are the same
 		 * in NFSv4 as in v3.
@@ -103,11 +118,13 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
 						(u32 *)open->op_verf.data,
 						&open->op_truncate, &created);
 
-		/* If we ever decide to use different attrs to store the
-		 * verifier in nfsd_create_v3, then we'll need to change this
+		/*
+		 * Following rfc 3530 14.2.16, use the returned bitmask
+		 * to indicate which attributes we used to store the
+		 * verifier:
 		 */
 		if (open->op_createmode == NFS4_CREATE_EXCLUSIVE && status == 0)
-			open->op_bmval[1] |= (FATTR4_WORD1_TIME_ACCESS |
+			open->op_bmval[1] = (FATTR4_WORD1_TIME_ACCESS |
 						FATTR4_WORD1_TIME_MODIFY);
 	} else {
 		status = nfsd_lookup(rqstp, current_fh,
@@ -118,13 +135,11 @@ do_open_lookup(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_o
 		goto out;
 
 	set_change_info(&open->op_cinfo, current_fh);
-
-	/* set reply cache */
 	fh_dup2(current_fh, &resfh);
-	open->op_stateowner->so_replay.rp_openfh_len = resfh.fh_handle.fh_size;
-	memcpy(open->op_stateowner->so_replay.rp_openfh,
-			&resfh.fh_handle.fh_base, resfh.fh_handle.fh_size);
 
+	/* set reply cache */
+	fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
+			&resfh.fh_handle);
 	if (!created)
 		status = do_open_permission(rqstp, current_fh, open,
 					    NFSD_MAY_NOP);
@@ -150,10 +165,8 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
 	memset(&open->op_cinfo, 0, sizeof(struct nfsd4_change_info));
 
 	/* set replay cache */
-	open->op_stateowner->so_replay.rp_openfh_len = current_fh->fh_handle.fh_size;
-	memcpy(open->op_stateowner->so_replay.rp_openfh,
-		&current_fh->fh_handle.fh_base,
-		current_fh->fh_handle.fh_size);
+	fh_copy_shallow(&open->op_stateowner->so_replay.rp_openfh,
+			&current_fh->fh_handle);
 
 	open->op_truncate = (open->op_iattr.ia_valid & ATTR_SIZE) &&
 		(open->op_iattr.ia_size == 0);
@@ -164,12 +177,23 @@ do_open_fhandle(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_
 	return status;
 }
 
+static void
+copy_clientid(clientid_t *clid, struct nfsd4_session *session)
+{
+	struct nfsd4_sessionid *sid =
+			(struct nfsd4_sessionid *)session->se_sessionid.data;
+
+	clid->cl_boot = sid->clientid.cl_boot;
+	clid->cl_id = sid->clientid.cl_id;
+}
 
 static __be32
 nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	   struct nfsd4_open *open)
 {
 	__be32 status;
+	struct nfsd4_compoundres *resp;
+
 	dprintk("NFSD: nfsd4_open filename %.*s op_stateowner %p\n",
 		(int)open->op_fname.len, open->op_fname.data,
 		open->op_stateowner);
@@ -178,16 +202,19 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (open->op_create && open->op_claim_type != NFS4_OPEN_CLAIM_NULL)
 		return nfserr_inval;
 
+	if (nfsd4_has_session(cstate))
+		copy_clientid(&open->op_clientid, cstate->session);
+
 	nfs4_lock_state();
 
 	/* check seqid for replay. set nfs4_owner */
-	status = nfsd4_process_open1(open);
+	resp = rqstp->rq_resp;
+	status = nfsd4_process_open1(&resp->cstate, open);
 	if (status == nfserr_replay_me) {
 		struct nfs4_replay *rp = &open->op_stateowner->so_replay;
 		fh_put(&cstate->current_fh);
-		cstate->current_fh.fh_handle.fh_size = rp->rp_openfh_len;
-		memcpy(&cstate->current_fh.fh_handle.fh_base, rp->rp_openfh,
-				rp->rp_openfh_len);
+		fh_copy_shallow(&cstate->current_fh.fh_handle,
+				&rp->rp_openfh);
 		status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
 		if (status)
 			dprintk("nfsd4_open: replay failed"
@@ -209,10 +236,6 @@ nfsd4_open(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	switch (open->op_claim_type) {
 		case NFS4_OPEN_CLAIM_DELEGATE_CUR:
-			status = nfserr_inval;
-			if (open->op_create)
-				goto out;
-			/* fall through */
 		case NFS4_OPEN_CLAIM_NULL:
 			/*
 			 * (1) set CURRENT_FH to the file being opened,
@@ -455,8 +478,9 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (getattr->ga_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
 		return nfserr_inval;
 
-	getattr->ga_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0;
-	getattr->ga_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1;
+	getattr->ga_bmval[0] &= nfsd_suppattrs0(cstate->minorversion);
+	getattr->ga_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
+	getattr->ga_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
 
 	getattr->ga_fhp = &cstate->current_fh;
 	return nfs_ok;
@@ -520,9 +544,8 @@ nfsd4_read(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	nfs4_lock_state();
 	/* check stateid */
-	if ((status = nfs4_preprocess_stateid_op(&cstate->current_fh,
-				&read->rd_stateid,
-				CHECK_FH | RD_STATE, &read->rd_filp))) {
+	if ((status = nfs4_preprocess_stateid_op(cstate, &read->rd_stateid,
+				RD_STATE, &read->rd_filp))) {
 		dprintk("NFSD: nfsd4_read: couldn't process stateid!\n");
 		goto out;
 	}
@@ -548,8 +571,9 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
 		return nfserr_inval;
 
-	readdir->rd_bmval[0] &= NFSD_SUPPORTED_ATTRS_WORD0;
-	readdir->rd_bmval[1] &= NFSD_SUPPORTED_ATTRS_WORD1;
+	readdir->rd_bmval[0] &= nfsd_suppattrs0(cstate->minorversion);
+	readdir->rd_bmval[1] &= nfsd_suppattrs1(cstate->minorversion);
+	readdir->rd_bmval[2] &= nfsd_suppattrs2(cstate->minorversion);
 
 	if ((cookie > ~(u32)0) || (cookie == 1) || (cookie == 2) ||
 	    (cookie == 0 && memcmp(readdir->rd_verf.data, zeroverf.data, NFS4_VERIFIER_SIZE)))
@@ -653,8 +677,8 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	if (setattr->sa_iattr.ia_valid & ATTR_SIZE) {
 		nfs4_lock_state();
-		status = nfs4_preprocess_stateid_op(&cstate->current_fh,
-			&setattr->sa_stateid, CHECK_FH | WR_STATE, NULL);
+		status = nfs4_preprocess_stateid_op(cstate,
+			&setattr->sa_stateid, WR_STATE, NULL);
 		nfs4_unlock_state();
 		if (status) {
 			dprintk("NFSD: nfsd4_setattr: couldn't process stateid!\n");
@@ -685,6 +709,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct file *filp = NULL;
 	u32 *p;
 	__be32 status = nfs_ok;
+	unsigned long cnt;
 
 	/* no need to check permission - this will be done in nfsd_write() */
 
@@ -692,8 +717,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		return nfserr_inval;
 
 	nfs4_lock_state();
-	status = nfs4_preprocess_stateid_op(&cstate->current_fh, stateid,
-					CHECK_FH | WR_STATE, &filp);
+	status = nfs4_preprocess_stateid_op(cstate, stateid, WR_STATE, &filp);
 	if (filp)
 		get_file(filp);
 	nfs4_unlock_state();
@@ -703,7 +727,7 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		return status;
 	}
 
-	write->wr_bytes_written = write->wr_buflen;
+	cnt = write->wr_buflen;
 	write->wr_how_written = write->wr_stable_how;
 	p = (u32 *)write->wr_verifier.data;
 	*p++ = nfssvc_boot.tv_sec;
@@ -711,10 +735,12 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 
 	status = nfsd_write(rqstp, &cstate->current_fh, filp,
 			    write->wr_offset, rqstp->rq_vec, write->wr_vlen,
-			    write->wr_buflen, &write->wr_how_written);
+			    &cnt, &write->wr_how_written);
 	if (filp)
 		fput(filp);
 
+	write->wr_bytes_written = cnt;
+
 	if (status == nfserr_symlink)
 		status = nfserr_inval;
 	return status;
@@ -737,8 +763,9 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (status)
 		return status;
 
-	if ((verify->ve_bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0)
-	    || (verify->ve_bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1))
+	if ((verify->ve_bmval[0] & ~nfsd_suppattrs0(cstate->minorversion))
+	    || (verify->ve_bmval[1] & ~nfsd_suppattrs1(cstate->minorversion))
+	    || (verify->ve_bmval[2] & ~nfsd_suppattrs2(cstate->minorversion)))
 		return nfserr_attrnotsupp;
 	if ((verify->ve_bmval[0] & FATTR4_WORD0_RDATTR_ERROR)
 	    || (verify->ve_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1))
@@ -766,7 +793,8 @@ _nfsd4_verify(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (status)
 		goto out_kfree;
 
-	p = buf + 3;
+	/* skip bitmap */
+	p = buf + 1 + ntohl(buf[0]);
 	status = nfserr_not_same;
 	if (ntohl(*p++) != verify->ve_attrlen)
 		goto out_kfree;
@@ -813,39 +841,17 @@ static inline void nfsd4_increment_op_stats(u32 opnum)
 	nfsdstats.nfs4_opcount[opnum]++;
 }
 
-static void cstate_free(struct nfsd4_compound_state *cstate)
-{
-	if (cstate == NULL)
-		return;
-	fh_put(&cstate->current_fh);
-	fh_put(&cstate->save_fh);
-	BUG_ON(cstate->replay_owner);
-	kfree(cstate);
-}
-
-static struct nfsd4_compound_state *cstate_alloc(void)
-{
-	struct nfsd4_compound_state *cstate;
-
-	cstate = kmalloc(sizeof(struct nfsd4_compound_state), GFP_KERNEL);
-	if (cstate == NULL)
-		return NULL;
-	fh_init(&cstate->current_fh, NFS4_FHSIZE);
-	fh_init(&cstate->save_fh, NFS4_FHSIZE);
-	cstate->replay_owner = NULL;
-	return cstate;
-}
-
 typedef __be32(*nfsd4op_func)(struct svc_rqst *, struct nfsd4_compound_state *,
 			      void *);
+enum nfsd4_op_flags {
+	ALLOWED_WITHOUT_FH = 1 << 0,	/* No current filehandle required */
+	ALLOWED_ON_ABSENT_FS = 2 << 0,	/* ops processed on absent fs */
+	ALLOWED_AS_FIRST_OP = 3 << 0,	/* ops required first in compound */
+};
 
 struct nfsd4_operation {
 	nfsd4op_func op_func;
 	u32 op_flags;
-/* Most ops require a valid current filehandle; a few don't: */
-#define ALLOWED_WITHOUT_FH 1
-/* GETATTR and ops not listed as returning NFS4ERR_MOVED: */
-#define ALLOWED_ON_ABSENT_FS 2
 	char *op_name;
 };
@@ -854,6 +860,51 @@ static struct nfsd4_operation nfsd4_ops[];
 static struct nfsd4_operation nfsd4_ops[];
 
 static const char *nfsd4_op_name(unsigned opnum);
 
 /*
+ * This is a replay of a compound for which no cache entry pages
+ * were used. Encode the sequence operation, and if cachethis is FALSE
+ * encode the uncache rep error on the next operation.
+ */
+static __be32
+nfsd4_enc_uncached_replay(struct nfsd4_compoundargs *args,
+			  struct nfsd4_compoundres *resp)
+{
+	struct nfsd4_op *op;
+
+	dprintk("--> %s resp->opcnt %d ce_cachethis %u \n", __func__,
+		resp->opcnt, resp->cstate.slot->sl_cache_entry.ce_cachethis);
+
+	/* Encode the replayed sequence operation */
+	BUG_ON(resp->opcnt != 1);
+	op = &args->ops[resp->opcnt - 1];
+	nfsd4_encode_operation(resp, op);
+
+	/* return nfserr_retry_uncached_rep in next operation. */
+	if (resp->cstate.slot->sl_cache_entry.ce_cachethis == 0) {
+		op = &args->ops[resp->opcnt++];
+		op->status = nfserr_retry_uncached_rep;
+		nfsd4_encode_operation(resp, op);
+	}
+	return op->status;
+}
+
+/*
+ * Enforce NFSv4.1 COMPOUND ordering rules.
+ *
+ * TODO:
+ * - enforce NFS4ERR_NOT_ONLY_OP,
+ * - DESTROY_SESSION MUST be the final operation in the COMPOUND request.
+ */
+static bool nfs41_op_ordering_ok(struct nfsd4_compoundargs *args)
+{
+	if (args->minorversion && args->opcnt > 0) {
+		struct nfsd4_op *op = &args->ops[0];
+		return (op->status == nfserr_op_illegal) ||
+		       (nfsd4_ops[op->opnum].op_flags & ALLOWED_AS_FIRST_OP);
+	}
+	return true;
+}
+
+/*
  * COMPOUND call.
  */
 static __be32
@@ -863,12 +914,13 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 {
 	struct nfsd4_op	*op;
 	struct nfsd4_operation *opdesc;
-	struct nfsd4_compound_state *cstate = NULL;
+	struct nfsd4_compound_state *cstate = &resp->cstate;
 	int		slack_bytes;
 	__be32		status;
 
 	resp->xbuf = &rqstp->rq_res;
-	resp->p = rqstp->rq_res.head[0].iov_base + rqstp->rq_res.head[0].iov_len;
+	resp->p = rqstp->rq_res.head[0].iov_base +
+						rqstp->rq_res.head[0].iov_len;
 	resp->tagp = resp->p;
 	/* reserve space for: taglen, tag, and opcnt */
 	resp->p += 2 + XDR_QUADLEN(args->taglen);
@@ -877,18 +929,25 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 	resp->tag = args->tag;
 	resp->opcnt = 0;
 	resp->rqstp = rqstp;
+	resp->cstate.minorversion = args->minorversion;
+	resp->cstate.replay_owner = NULL;
+	fh_init(&resp->cstate.current_fh, NFS4_FHSIZE);
+	fh_init(&resp->cstate.save_fh, NFS4_FHSIZE);
+	/* Use the deferral mechanism only for NFSv4.0 compounds */
+	rqstp->rq_usedeferral = (args->minorversion == 0);
 
 	/*
 	 * According to RFC3010, this takes precedence over all other errors.
 	 */
 	status = nfserr_minor_vers_mismatch;
-	if (args->minorversion > NFSD_SUPPORTED_MINOR_VERSION)
+	if (args->minorversion > nfsd_supported_minorversion)
 		goto out;
 
-	status = nfserr_resource;
-	cstate = cstate_alloc();
-	if (cstate == NULL)
-		goto out;
+	if (!nfs41_op_ordering_ok(args)) {
+		op = &args->ops[0];
+		op->status = nfserr_sequence_pos;
+		goto encode_op;
+	}
 
 	status = nfs_ok;
 	while (!status && resp->opcnt < args->opcnt) {
@@ -897,7 +956,6 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 		dprintk("nfsv4 compound op #%d/%d: %d (%s)\n",
 			resp->opcnt, args->opcnt, op->opnum,
 			nfsd4_op_name(op->opnum));
-
 		/*
 		 * The XDR decode routines may have pre-set op->status;
 		 * for example, if there is a miscellaneous XDR error
@@ -938,6 +996,15 @@ nfsd4_proc_compound(struct svc_rqst *rqstp,
 		BUG_ON(op->status == nfs_ok);
 
 encode_op:
+		/* Only from SEQUENCE or CREATE_SESSION */
+		if (resp->cstate.status == nfserr_replay_cache) {
+			dprintk("%s NFS4.1 replay from cache\n", __func__);
+			if (nfsd4_not_cached(resp))
+				status = nfsd4_enc_uncached_replay(args, resp);
+			else
+				status = op->status;
+			goto out;
+		}
 		if (op->status == nfserr_replay_me) {
 			op->replay = &cstate->replay_owner->so_replay;
 			nfsd4_encode_replay(resp, op);
@@ -961,15 +1028,24 @@ encode_op:
 
 		nfsd4_increment_op_stats(op->opnum);
 	}
+	if (!rqstp->rq_usedeferral && status == nfserr_dropit) {
+		dprintk("%s Dropit - send NFS4ERR_DELAY\n", __func__);
+		status = nfserr_jukebox;
+	}
 
-	cstate_free(cstate);
+	resp->cstate.status = status;
+	fh_put(&resp->cstate.current_fh);
+	fh_put(&resp->cstate.save_fh);
+	BUG_ON(resp->cstate.replay_owner);
 out:
 	nfsd4_release_compoundargs(args);
+	/* Reset deferral mechanism for RPC deferrals */
+	rqstp->rq_usedeferral = 1;
 	dprintk("nfsv4 compound returned %d\n", ntohl(status));
 	return status;
 }
 
-static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
+static struct nfsd4_operation nfsd4_ops[] = {
 	[OP_ACCESS] = {
 		.op_func = (nfsd4op_func)nfsd4_access,
 		.op_name = "OP_ACCESS",
@@ -1045,7 +1121,7 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
1045 .op_name = "OP_PUTFH", 1121 .op_name = "OP_PUTFH",
1046 }, 1122 },
1047 [OP_PUTPUBFH] = { 1123 [OP_PUTPUBFH] = {
1048 /* unsupported, just for future reference: */ 1124 .op_func = (nfsd4op_func)nfsd4_putrootfh,
1049 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, 1125 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
1050 .op_name = "OP_PUTPUBFH", 1126 .op_name = "OP_PUTPUBFH",
1051 }, 1127 },
@@ -1119,6 +1195,28 @@ static struct nfsd4_operation nfsd4_ops[OP_RELEASE_LOCKOWNER+1] = {
1119 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS, 1195 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_ON_ABSENT_FS,
1120 .op_name = "OP_RELEASE_LOCKOWNER", 1196 .op_name = "OP_RELEASE_LOCKOWNER",
1121 }, 1197 },
1198
1199 /* NFSv4.1 operations */
1200 [OP_EXCHANGE_ID] = {
1201 .op_func = (nfsd4op_func)nfsd4_exchange_id,
1202 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1203 .op_name = "OP_EXCHANGE_ID",
1204 },
1205 [OP_CREATE_SESSION] = {
1206 .op_func = (nfsd4op_func)nfsd4_create_session,
1207 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1208 .op_name = "OP_CREATE_SESSION",
1209 },
1210 [OP_DESTROY_SESSION] = {
1211 .op_func = (nfsd4op_func)nfsd4_destroy_session,
1212 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1213 .op_name = "OP_DESTROY_SESSION",
1214 },
1215 [OP_SEQUENCE] = {
1216 .op_func = (nfsd4op_func)nfsd4_sequence,
1217 .op_flags = ALLOWED_WITHOUT_FH | ALLOWED_AS_FIRST_OP,
1218 .op_name = "OP_SEQUENCE",
1219 },
1122}; 1220};
1123 1221
1124static const char *nfsd4_op_name(unsigned opnum) 1222static const char *nfsd4_op_name(unsigned opnum)
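The table above drops the explicit OP_RELEASE_LOCKOWNER+1 bound so the compiler sizes nfsd4_ops[] from the highest designated index, letting the NFSv4.1 entries extend the array past the v4.0 opcodes. A minimal userspace sketch of the same designated-initializer dispatch pattern (the opcode values and names below are illustrative, not copied from the kernel headers):

#include <stdio.h>

enum { OP_ACCESS = 3, OP_SEQUENCE = 53 };	/* illustrative values */

struct op_entry {
	int (*op_func)(void);
	const char *op_name;
};

static int do_access(void)   { return 0; }
static int do_sequence(void) { return 0; }

static const struct op_entry ops[] = {
	[OP_ACCESS]   = { .op_func = do_access,   .op_name = "OP_ACCESS" },
	[OP_SEQUENCE] = { .op_func = do_sequence, .op_name = "OP_SEQUENCE" },
};

int main(void)
{
	unsigned opnum = OP_SEQUENCE;

	/* gaps in the array are zero-filled: check before dispatching */
	if (opnum < sizeof(ops) / sizeof(ops[0]) && ops[opnum].op_func)
		printf("%s -> %d\n", ops[opnum].op_name, ops[opnum].op_func());
	return 0;
}

Unassigned indices are zero-filled, which is why a dispatcher over such a table must check the function pointer before calling through it.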
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index 74f7b67567fd..3444c0052a87 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -182,36 +182,26 @@ out_unlock:
182 182
183typedef int (recdir_func)(struct dentry *, struct dentry *); 183typedef int (recdir_func)(struct dentry *, struct dentry *);
184 184
185struct dentry_list { 185struct name_list {
186 struct dentry *dentry; 186 char name[HEXDIR_LEN];
187 struct list_head list; 187 struct list_head list;
188}; 188};
189 189
190struct dentry_list_arg {
191 struct list_head dentries;
192 struct dentry *parent;
193};
194
195static int 190static int
196nfsd4_build_dentrylist(void *arg, const char *name, int namlen, 191nfsd4_build_namelist(void *arg, const char *name, int namlen,
197 loff_t offset, u64 ino, unsigned int d_type) 192 loff_t offset, u64 ino, unsigned int d_type)
198{ 193{
199 struct dentry_list_arg *dla = arg; 194 struct list_head *names = arg;
200 struct list_head *dentries = &dla->dentries; 195 struct name_list *entry;
201 struct dentry *parent = dla->parent;
202 struct dentry *dentry;
203 struct dentry_list *child;
204 196
205 if (name && isdotent(name, namlen)) 197 if (namlen != HEXDIR_LEN - 1)
206 return 0; 198 return 0;
207 dentry = lookup_one_len(name, parent, namlen); 199 entry = kmalloc(sizeof(struct name_list), GFP_KERNEL);
208 if (IS_ERR(dentry)) 200 if (entry == NULL)
209 return PTR_ERR(dentry);
210 child = kmalloc(sizeof(*child), GFP_KERNEL);
211 if (child == NULL)
212 return -ENOMEM; 201 return -ENOMEM;
213 child->dentry = dentry; 202 memcpy(entry->name, name, HEXDIR_LEN - 1);
214 list_add(&child->list, dentries); 203 entry->name[HEXDIR_LEN - 1] = '\0';
204 list_add(&entry->list, names);
215 return 0; 205 return 0;
216} 206}
217 207
@@ -220,11 +210,9 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
220{ 210{
221 const struct cred *original_cred; 211 const struct cred *original_cred;
222 struct file *filp; 212 struct file *filp;
223 struct dentry_list_arg dla = { 213 LIST_HEAD(names);
224 .parent = dir, 214 struct name_list *entry;
225 }; 215 struct dentry *dentry;
226 struct list_head *dentries = &dla.dentries;
227 struct dentry_list *child;
228 int status; 216 int status;
229 217
230 if (!rec_dir_init) 218 if (!rec_dir_init)
@@ -233,31 +221,34 @@ nfsd4_list_rec_dir(struct dentry *dir, recdir_func *f)
233 status = nfs4_save_creds(&original_cred); 221 status = nfs4_save_creds(&original_cred);
234 if (status < 0) 222 if (status < 0)
235 return status; 223 return status;
236 INIT_LIST_HEAD(dentries);
237 224
238 filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY, 225 filp = dentry_open(dget(dir), mntget(rec_dir.mnt), O_RDONLY,
239 current_cred()); 226 current_cred());
240 status = PTR_ERR(filp); 227 status = PTR_ERR(filp);
241 if (IS_ERR(filp)) 228 if (IS_ERR(filp))
242 goto out; 229 goto out;
243 INIT_LIST_HEAD(dentries); 230 status = vfs_readdir(filp, nfsd4_build_namelist, &names);
244 status = vfs_readdir(filp, nfsd4_build_dentrylist, &dla);
245 fput(filp); 231 fput(filp);
246 while (!list_empty(dentries)) { 232 while (!list_empty(&names)) {
247 child = list_entry(dentries->next, struct dentry_list, list); 233 entry = list_entry(names.next, struct name_list, list);
248 status = f(dir, child->dentry); 234
235 dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1);
236 if (IS_ERR(dentry)) {
237 status = PTR_ERR(dentry);
238 goto out;
239 }
240 status = f(dir, dentry);
241 dput(dentry);
249 if (status) 242 if (status)
250 goto out; 243 goto out;
251 list_del(&child->list); 244 list_del(&entry->list);
252 dput(child->dentry); 245 kfree(entry);
253 kfree(child);
254 } 246 }
255out: 247out:
256 while (!list_empty(dentries)) { 248 while (!list_empty(&names)) {
257 child = list_entry(dentries->next, struct dentry_list, list); 249 entry = list_entry(names.next, struct name_list, list);
258 list_del(&child->list); 250 list_del(&entry->list);
259 dput(child->dentry); 251 kfree(entry);
260 kfree(child);
261 } 252 }
262 nfs4_reset_creds(original_cred); 253 nfs4_reset_creds(original_cred);
263 return status; 254 return status;
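The rewrite above splits the recovery-directory scan into two phases: the vfs_readdir() callback only copies fixed-length names onto a list, and lookup_one_len() runs afterwards, outside the iteration, so no dentry references have to be held across the directory walk. A standalone sketch of the two-phase pattern, with hypothetical names:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define NAME_LEN 33			/* stands in for HEXDIR_LEN */

struct name_entry {
	char name[NAME_LEN];
	struct name_entry *next;
};

/* Phase 1: the iteration callback only records names of the right length. */
static int record_name(struct name_entry **head, const char *name, int namlen)
{
	struct name_entry *entry;

	if (namlen != NAME_LEN - 1)
		return 0;	/* skips ".", ".." and junk in one test */
	entry = malloc(sizeof(*entry));
	if (!entry)
		return -1;
	memcpy(entry->name, name, NAME_LEN - 1);
	entry->name[NAME_LEN - 1] = '\0';
	entry->next = *head;
	*head = entry;
	return 0;
}

int main(void)
{
	struct name_entry *head = NULL, *entry;

	record_name(&head, "0123456789abcdef0123456789abcdef", 32);

	/* Phase 2: look up and process each name outside the iteration. */
	while ((entry = head) != NULL) {
		head = entry->next;
		printf("would look up %s\n", entry->name);
		free(entry);
	}
	return 0;
}

Filtering on the exact name length mirrors the namlen != HEXDIR_LEN - 1 check above, which also rejects "." and ".." without a separate isdotent() test.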
@@ -353,7 +344,8 @@ purge_old(struct dentry *parent, struct dentry *child)
353{ 344{
354 int status; 345 int status;
355 346
356 if (nfs4_has_reclaimed_state(child->d_name.name)) 347 /* note: we currently use this path only for minorversion 0 */
348 if (nfs4_has_reclaimed_state(child->d_name.name, false))
357 return 0; 349 return 0;
358 350
359 status = nfsd4_clear_clid_dir(parent, child); 351 status = nfsd4_clear_clid_dir(parent, child);
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index b6f60f48e94b..c65a27b76a9d 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -68,6 +68,7 @@ static u32 current_delegid = 1;
68static u32 nfs4_init; 68static u32 nfs4_init;
69static stateid_t zerostateid; /* bits all 0 */ 69static stateid_t zerostateid; /* bits all 0 */
70static stateid_t onestateid; /* bits all 1 */ 70static stateid_t onestateid; /* bits all 1 */
71static u64 current_sessionid = 1;
71 72
72#define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t))) 73#define ZERO_STATEID(stateid) (!memcmp((stateid), &zerostateid, sizeof(stateid_t)))
73#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t))) 74#define ONE_STATEID(stateid) (!memcmp((stateid), &onestateid, sizeof(stateid_t)))
@@ -75,18 +76,21 @@ static stateid_t onestateid; /* bits all 1 */
75/* forward declarations */ 76/* forward declarations */
76static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags); 77static struct nfs4_stateid * find_stateid(stateid_t *stid, int flags);
77static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid); 78static struct nfs4_delegation * find_delegation_stateid(struct inode *ino, stateid_t *stid);
78static void release_stateid_lockowners(struct nfs4_stateid *open_stp);
79static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery"; 79static char user_recovery_dirname[PATH_MAX] = "/var/lib/nfs/v4recovery";
80static void nfs4_set_recdir(char *recdir); 80static void nfs4_set_recdir(char *recdir);
81 81
82/* Locking: 82/* Locking: */
83 * 83
84 * client_mutex: 84/* Currently used for almost all code touching nfsv4 state: */
85 * protects clientid_hashtbl[], clientstr_hashtbl[],
86 * unconfstr_hashtbl[], uncofid_hashtbl[].
87 */
88static DEFINE_MUTEX(client_mutex); 85static DEFINE_MUTEX(client_mutex);
89 86
87/*
88 * Currently used for the del_recall_lru and file hash table. In an
89 * effort to decrease the scope of the client_mutex, this spinlock may
90 * eventually cover more:
91 */
92static DEFINE_SPINLOCK(recall_lock);
93
90static struct kmem_cache *stateowner_slab = NULL; 94static struct kmem_cache *stateowner_slab = NULL;
91static struct kmem_cache *file_slab = NULL; 95static struct kmem_cache *file_slab = NULL;
92static struct kmem_cache *stateid_slab = NULL; 96static struct kmem_cache *stateid_slab = NULL;
@@ -117,37 +121,23 @@ opaque_hashval(const void *ptr, int nbytes)
117 return x; 121 return x;
118} 122}
119 123
120/* forward declarations */
121static void release_stateowner(struct nfs4_stateowner *sop);
122static void release_stateid(struct nfs4_stateid *stp, int flags);
123
124/*
125 * Delegation state
126 */
127
128/* recall_lock protects the del_recall_lru */
129static DEFINE_SPINLOCK(recall_lock);
130static struct list_head del_recall_lru; 124static struct list_head del_recall_lru;
131 125
132static void
133free_nfs4_file(struct kref *kref)
134{
135 struct nfs4_file *fp = container_of(kref, struct nfs4_file, fi_ref);
136 list_del(&fp->fi_hash);
137 iput(fp->fi_inode);
138 kmem_cache_free(file_slab, fp);
139}
140
141static inline void 126static inline void
142put_nfs4_file(struct nfs4_file *fi) 127put_nfs4_file(struct nfs4_file *fi)
143{ 128{
144 kref_put(&fi->fi_ref, free_nfs4_file); 129 if (atomic_dec_and_lock(&fi->fi_ref, &recall_lock)) {
130 list_del(&fi->fi_hash);
131 spin_unlock(&recall_lock);
132 iput(fi->fi_inode);
133 kmem_cache_free(file_slab, fi);
134 }
145} 135}
146 136
147static inline void 137static inline void
148get_nfs4_file(struct nfs4_file *fi) 138get_nfs4_file(struct nfs4_file *fi)
149{ 139{
150 kref_get(&fi->fi_ref); 140 atomic_inc(&fi->fi_ref);
151} 141}
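put_nfs4_file() now uses atomic_dec_and_lock(), which takes recall_lock only when the reference count is about to hit zero; the file is unhashed under that lock, so a concurrent find_file() either sees a positive refcount or does not see the object at all. A userspace sketch of the idiom, with a pthread mutex standing in for recall_lock (all names hypothetical):

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

static pthread_mutex_t table_lock = PTHREAD_MUTEX_INITIALIZER;

struct obj {
	atomic_int ref;
	/* ... hash linkage and payload would live here ... */
};

/*
 * Drop one reference; if it was the last, return 1 with table_lock held
 * so the caller can unhash the object before anyone else can find it.
 */
static int dec_and_lock(struct obj *o)
{
	int old = atomic_load(&o->ref);

	/* Fast path: not the last reference, no lock needed. */
	while (old > 1)
		if (atomic_compare_exchange_weak(&o->ref, &old, old - 1))
			return 0;

	/* Possibly the last reference: take the lock, then re-check. */
	pthread_mutex_lock(&table_lock);
	if (atomic_fetch_sub(&o->ref, 1) == 1)
		return 1;	/* caller unhashes, frees, then unlocks */
	pthread_mutex_unlock(&table_lock);
	return 0;
}

int main(void)
{
	struct obj o = { .ref = 2 };

	printf("last? %d\n", dec_and_lock(&o));	/* 0 */
	if (dec_and_lock(&o)) {			/* 1: "unhash" here */
		pthread_mutex_unlock(&table_lock);
		printf("freed\n");
	}
	return 0;
}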
152 142
153static int num_delegations; 143static int num_delegations;
@@ -220,9 +210,7 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f
220 dp->dl_stateid.si_stateownerid = current_delegid++; 210 dp->dl_stateid.si_stateownerid = current_delegid++;
221 dp->dl_stateid.si_fileid = 0; 211 dp->dl_stateid.si_fileid = 0;
222 dp->dl_stateid.si_generation = 0; 212 dp->dl_stateid.si_generation = 0;
223 dp->dl_fhlen = current_fh->fh_handle.fh_size; 213 fh_copy_shallow(&dp->dl_fh, &current_fh->fh_handle);
224 memcpy(dp->dl_fhval, &current_fh->fh_handle.fh_base,
225 current_fh->fh_handle.fh_size);
226 dp->dl_time = 0; 214 dp->dl_time = 0;
227 atomic_set(&dp->dl_count, 1); 215 atomic_set(&dp->dl_count, 1);
228 list_add(&dp->dl_perfile, &fp->fi_delegations); 216 list_add(&dp->dl_perfile, &fp->fi_delegations);
@@ -311,6 +299,291 @@ static struct list_head unconf_id_hashtbl[CLIENT_HASH_SIZE];
311static struct list_head client_lru; 299static struct list_head client_lru;
312static struct list_head close_lru; 300static struct list_head close_lru;
313 301
302static void unhash_generic_stateid(struct nfs4_stateid *stp)
303{
304 list_del(&stp->st_hash);
305 list_del(&stp->st_perfile);
306 list_del(&stp->st_perstateowner);
307}
308
309static void free_generic_stateid(struct nfs4_stateid *stp)
310{
311 put_nfs4_file(stp->st_file);
312 kmem_cache_free(stateid_slab, stp);
313}
314
315static void release_lock_stateid(struct nfs4_stateid *stp)
316{
317 unhash_generic_stateid(stp);
318 locks_remove_posix(stp->st_vfs_file, (fl_owner_t)stp->st_stateowner);
319 free_generic_stateid(stp);
320}
321
322static void unhash_lockowner(struct nfs4_stateowner *sop)
323{
324 struct nfs4_stateid *stp;
325
326 list_del(&sop->so_idhash);
327 list_del(&sop->so_strhash);
328 list_del(&sop->so_perstateid);
329 while (!list_empty(&sop->so_stateids)) {
330 stp = list_first_entry(&sop->so_stateids,
331 struct nfs4_stateid, st_perstateowner);
332 release_lock_stateid(stp);
333 }
334}
335
336static void release_lockowner(struct nfs4_stateowner *sop)
337{
338 unhash_lockowner(sop);
339 nfs4_put_stateowner(sop);
340}
341
342static void
343release_stateid_lockowners(struct nfs4_stateid *open_stp)
344{
345 struct nfs4_stateowner *lock_sop;
346
347 while (!list_empty(&open_stp->st_lockowners)) {
348 lock_sop = list_entry(open_stp->st_lockowners.next,
349 struct nfs4_stateowner, so_perstateid);
350 /* list_del(&open_stp->st_lockowners); */
351 BUG_ON(lock_sop->so_is_open_owner);
352 release_lockowner(lock_sop);
353 }
354}
355
356static void release_open_stateid(struct nfs4_stateid *stp)
357{
358 unhash_generic_stateid(stp);
359 release_stateid_lockowners(stp);
360 nfsd_close(stp->st_vfs_file);
361 free_generic_stateid(stp);
362}
363
364static void unhash_openowner(struct nfs4_stateowner *sop)
365{
366 struct nfs4_stateid *stp;
367
368 list_del(&sop->so_idhash);
369 list_del(&sop->so_strhash);
370 list_del(&sop->so_perclient);
371 list_del(&sop->so_perstateid); /* XXX: necessary? */
372 while (!list_empty(&sop->so_stateids)) {
373 stp = list_first_entry(&sop->so_stateids,
374 struct nfs4_stateid, st_perstateowner);
375 release_open_stateid(stp);
376 }
377}
378
379static void release_openowner(struct nfs4_stateowner *sop)
380{
381 unhash_openowner(sop);
382 list_del(&sop->so_close_lru);
383 nfs4_put_stateowner(sop);
384}
385
386static DEFINE_SPINLOCK(sessionid_lock);
387#define SESSION_HASH_SIZE 512
388static struct list_head sessionid_hashtbl[SESSION_HASH_SIZE];
389
390static inline int
391hash_sessionid(struct nfs4_sessionid *sessionid)
392{
393 struct nfsd4_sessionid *sid = (struct nfsd4_sessionid *)sessionid;
394
395 return sid->sequence % SESSION_HASH_SIZE;
396}
397
398static inline void
399dump_sessionid(const char *fn, struct nfs4_sessionid *sessionid)
400{
401 u32 *ptr = (u32 *)(&sessionid->data[0]);
402 dprintk("%s: %u:%u:%u:%u\n", fn, ptr[0], ptr[1], ptr[2], ptr[3]);
403}
404
405static void
406gen_sessionid(struct nfsd4_session *ses)
407{
408 struct nfs4_client *clp = ses->se_client;
409 struct nfsd4_sessionid *sid;
410
411 sid = (struct nfsd4_sessionid *)ses->se_sessionid.data;
412 sid->clientid = clp->cl_clientid;
413 sid->sequence = current_sessionid++;
414 sid->reserved = 0;
415}
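gen_sessionid() fills the 16-byte session id (NFS4_MAX_SESSIONID_LEN) through a cast to struct nfsd4_sessionid, so the implied layout is the clientid, a monotonically increasing sequence number, and a reserved word. A sketch of that layout; the field names and widths are inferred from the code above, not copied from the kernel headers:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* Field names and widths inferred from gen_sessionid(); assumptions. */
struct sessionid_layout {
	uint64_t clientid;	/* cl_boot + cl_id */
	uint32_t sequence;	/* current_sessionid++; also the hash key */
	uint32_t reserved;	/* always zero */
};

static_assert(sizeof(struct sessionid_layout) == 16,
	      "matches NFS4_MAX_SESSIONID_LEN");

int main(void)
{
	struct sessionid_layout sid = { .clientid = 0x1234, .sequence = 7 };

	printf("bucket %u of 512\n", sid.sequence % 512);
	return 0;
}

Because the sequence field is a plain counter, hash_sessionid()'s sequence % SESSION_HASH_SIZE spreads new sessions round-robin across the 512 buckets.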
416
417/*
418 * Give the client the number of slots it requests, bounded by
419 * NFSD_MAX_SLOTS_PER_SESSION and by sv_drc_max_pages.
420 *
421 * If we run out of pages (sv_drc_pages_used == sv_drc_max_pages) we
422 * should (up to a point) re-negotiate active sessions and reduce their
423 * slot usage to make room for new connections. For now we just fail the
424 * create session.
425 */
426static int set_forechannel_maxreqs(struct nfsd4_channel_attrs *fchan)
427{
428 int status = 0, np = fchan->maxreqs * NFSD_PAGES_PER_SLOT;
429
430 spin_lock(&nfsd_serv->sv_lock);
431 if (np + nfsd_serv->sv_drc_pages_used > nfsd_serv->sv_drc_max_pages)
432 np = nfsd_serv->sv_drc_max_pages - nfsd_serv->sv_drc_pages_used;
433 nfsd_serv->sv_drc_pages_used += np;
434 spin_unlock(&nfsd_serv->sv_lock);
435
436 if (np <= 0) {
437 status = nfserr_resource;
438 fchan->maxreqs = 0;
439 } else
440 fchan->maxreqs = np / NFSD_PAGES_PER_SLOT;
441
442 return status;
443}
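set_forechannel_maxreqs() converts the requested slot count into DRC pages, clamps it to the pages still unused, and converts back. A worked example with made-up numbers, taking NFSD_PAGES_PER_SLOT as 1 for the arithmetic:

#include <stdio.h>

#define PAGES_PER_SLOT 1	/* stands in for NFSD_PAGES_PER_SLOT */

int main(void)
{
	int drc_max_pages = 32, drc_pages_used = 30;	/* made-up totals */
	int maxreqs = 8;			/* client asks for 8 slots */
	int np = maxreqs * PAGES_PER_SLOT;	/* = 8 pages wanted */

	if (np + drc_pages_used > drc_max_pages)
		np = drc_max_pages - drc_pages_used;	/* only 2 left */
	drc_pages_used += np;

	if (np <= 0)
		printf("no DRC pages left: fail CREATE_SESSION\n");
	else
		printf("granting %d slots\n", np / PAGES_PER_SLOT);	/* 2 */
	return 0;
}

With 2 of 32 pages left, the 8-slot request is trimmed to 2 slots; only when nothing at all is left does CREATE_SESSION fail with nfserr_resource.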
444
445/*
446 * fchan holds the client values on input, and the server values on output
447 */
448static int init_forechannel_attrs(struct svc_rqst *rqstp,
449 struct nfsd4_session *session,
450 struct nfsd4_channel_attrs *fchan)
451{
452 int status = 0;
453 __u32 maxcount = svc_max_payload(rqstp);
454
455 /* headerpadsz set to zero in encode routine */
456
457 /* Use the client's max request and max response size if possible */
458 if (fchan->maxreq_sz > maxcount)
459 fchan->maxreq_sz = maxcount;
460 session->se_fmaxreq_sz = fchan->maxreq_sz;
461
462 if (fchan->maxresp_sz > maxcount)
463 fchan->maxresp_sz = maxcount;
464 session->se_fmaxresp_sz = fchan->maxresp_sz;
465
466 /* Set the max response cached size to our default, which is
467 * a multiple of PAGE_SIZE and small */
468 session->se_fmaxresp_cached = NFSD_PAGES_PER_SLOT * PAGE_SIZE;
469 fchan->maxresp_cached = session->se_fmaxresp_cached;
470
471 /* Use the client's maxops if possible */
472 if (fchan->maxops > NFSD_MAX_OPS_PER_COMPOUND)
473 fchan->maxops = NFSD_MAX_OPS_PER_COMPOUND;
474 session->se_fmaxops = fchan->maxops;
475
476 /* try to use the client requested number of slots */
477 if (fchan->maxreqs > NFSD_MAX_SLOTS_PER_SESSION)
478 fchan->maxreqs = NFSD_MAX_SLOTS_PER_SESSION;
479
480 /* FIXME: Error means no more DRC pages so the server should
481 * recover pages from existing sessions. For now fail session
482 * creation.
483 */
484 status = set_forechannel_maxreqs(fchan);
485
486 session->se_fnumslots = fchan->maxreqs;
487 return status;
488}
489
490static int
491alloc_init_session(struct svc_rqst *rqstp, struct nfs4_client *clp,
492 struct nfsd4_create_session *cses)
493{
494 struct nfsd4_session *new, tmp;
495 int idx, status = nfserr_resource, slotsize;
496
497 memset(&tmp, 0, sizeof(tmp));
498
499 /* FIXME: For now, we just accept the client back channel attributes. */
500 status = init_forechannel_attrs(rqstp, &tmp, &cses->fore_channel);
501 if (status)
502 goto out;
503
504 /* allocate struct nfsd4_session and slot table in one piece */
505 slotsize = tmp.se_fnumslots * sizeof(struct nfsd4_slot);
506 new = kzalloc(sizeof(*new) + slotsize, GFP_KERNEL);
507 if (!new)
508 goto out;
509
510 memcpy(new, &tmp, sizeof(*new));
511
512 new->se_client = clp;
513 gen_sessionid(new);
514 idx = hash_sessionid(&new->se_sessionid);
515 memcpy(clp->cl_sessionid.data, new->se_sessionid.data,
516 NFS4_MAX_SESSIONID_LEN);
517
518 new->se_flags = cses->flags;
519 kref_init(&new->se_ref);
520 spin_lock(&sessionid_lock);
521 list_add(&new->se_hash, &sessionid_hashtbl[idx]);
522 list_add(&new->se_perclnt, &clp->cl_sessions);
523 spin_unlock(&sessionid_lock);
524
525 status = nfs_ok;
526out:
527 return status;
528}
529
530/* caller must hold sessionid_lock */
531static struct nfsd4_session *
532find_in_sessionid_hashtbl(struct nfs4_sessionid *sessionid)
533{
534 struct nfsd4_session *elem;
535 int idx;
536
537 dump_sessionid(__func__, sessionid);
538 idx = hash_sessionid(sessionid);
539 dprintk("%s: idx is %d\n", __func__, idx);
540 /* Search in the appropriate list */
541 list_for_each_entry(elem, &sessionid_hashtbl[idx], se_hash) {
542 dump_sessionid("list traversal", &elem->se_sessionid);
543 if (!memcmp(elem->se_sessionid.data, sessionid->data,
544 NFS4_MAX_SESSIONID_LEN)) {
545 return elem;
546 }
547 }
548
549 dprintk("%s: session not found\n", __func__);
550 return NULL;
551}
552
553/* caller must hold sessionid_lock */
554static void
555unhash_session(struct nfsd4_session *ses)
556{
557 list_del(&ses->se_hash);
558 list_del(&ses->se_perclnt);
559}
560
561static void
562release_session(struct nfsd4_session *ses)
563{
564 spin_lock(&sessionid_lock);
565 unhash_session(ses);
566 spin_unlock(&sessionid_lock);
567 nfsd4_put_session(ses);
568}
569
570static void nfsd4_release_respages(struct page **respages, short resused);
571
572void
573free_session(struct kref *kref)
574{
575 struct nfsd4_session *ses;
576 int i;
577
578 ses = container_of(kref, struct nfsd4_session, se_ref);
579 for (i = 0; i < ses->se_fnumslots; i++) {
580 struct nfsd4_cache_entry *e = &ses->se_slots[i].sl_cache_entry;
581 nfsd4_release_respages(e->ce_respages, e->ce_resused);
582 }
583 kfree(ses->se_slots);
584 kfree(ses);
585}
586
314static inline void 587static inline void
315renew_client(struct nfs4_client *clp) 588renew_client(struct nfs4_client *clp)
316{ 589{
@@ -330,8 +603,8 @@ STALE_CLIENTID(clientid_t *clid)
330{ 603{
331 if (clid->cl_boot == boot_time) 604 if (clid->cl_boot == boot_time)
332 return 0; 605 return 0;
333 dprintk("NFSD stale clientid (%08x/%08x)\n", 606 dprintk("NFSD stale clientid (%08x/%08x) boot_time %08lx\n",
334 clid->cl_boot, clid->cl_id); 607 clid->cl_boot, clid->cl_id, boot_time);
335 return 1; 608 return 1;
336} 609}
337 610
@@ -376,6 +649,8 @@ static inline void
376free_client(struct nfs4_client *clp) 649free_client(struct nfs4_client *clp)
377{ 650{
378 shutdown_callback_client(clp); 651 shutdown_callback_client(clp);
652 nfsd4_release_respages(clp->cl_slot.sl_cache_entry.ce_respages,
653 clp->cl_slot.sl_cache_entry.ce_resused);
379 if (clp->cl_cred.cr_group_info) 654 if (clp->cl_cred.cr_group_info)
380 put_group_info(clp->cl_cred.cr_group_info); 655 put_group_info(clp->cl_cred.cr_group_info);
381 kfree(clp->cl_principal); 656 kfree(clp->cl_principal);
@@ -420,7 +695,13 @@ expire_client(struct nfs4_client *clp)
420 list_del(&clp->cl_lru); 695 list_del(&clp->cl_lru);
421 while (!list_empty(&clp->cl_openowners)) { 696 while (!list_empty(&clp->cl_openowners)) {
422 sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient); 697 sop = list_entry(clp->cl_openowners.next, struct nfs4_stateowner, so_perclient);
423 release_stateowner(sop); 698 release_openowner(sop);
699 }
700 while (!list_empty(&clp->cl_sessions)) {
701 struct nfsd4_session *ses;
702 ses = list_entry(clp->cl_sessions.next, struct nfsd4_session,
703 se_perclnt);
704 release_session(ses);
424 } 705 }
425 put_nfs4_client(clp); 706 put_nfs4_client(clp);
426} 707}
@@ -439,6 +720,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, char *recdir)
439 INIT_LIST_HEAD(&clp->cl_strhash); 720 INIT_LIST_HEAD(&clp->cl_strhash);
440 INIT_LIST_HEAD(&clp->cl_openowners); 721 INIT_LIST_HEAD(&clp->cl_openowners);
441 INIT_LIST_HEAD(&clp->cl_delegations); 722 INIT_LIST_HEAD(&clp->cl_delegations);
723 INIT_LIST_HEAD(&clp->cl_sessions);
442 INIT_LIST_HEAD(&clp->cl_lru); 724 INIT_LIST_HEAD(&clp->cl_lru);
443 return clp; 725 return clp;
444} 726}
@@ -568,25 +850,45 @@ find_unconfirmed_client(clientid_t *clid)
568 return NULL; 850 return NULL;
569} 851}
570 852
853/*
854 * Return 1 iff clp's clientid establishment method matches the use_exchange_id
855 * parameter. Matching is based on the fact that at least one of the
856 * EXCHGID4_FLAG_USE_{NON_PNFS,PNFS_MDS,PNFS_DS} flags must be set for v4.1
857 *
858 * FIXME: we need to unify the clientid namespaces for nfsv4.x
859 * and correctly deal with client upgrade/downgrade in EXCHANGE_ID
860 * and SET_CLIENTID{,_CONFIRM}
861 */
862static inline int
863match_clientid_establishment(struct nfs4_client *clp, bool use_exchange_id)
864{
865 bool has_exchange_flags = (clp->cl_exchange_flags != 0);
866 return use_exchange_id == has_exchange_flags;
867}
868
571static struct nfs4_client * 869static struct nfs4_client *
572find_confirmed_client_by_str(const char *dname, unsigned int hashval) 870find_confirmed_client_by_str(const char *dname, unsigned int hashval,
871 bool use_exchange_id)
573{ 872{
574 struct nfs4_client *clp; 873 struct nfs4_client *clp;
575 874
576 list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) { 875 list_for_each_entry(clp, &conf_str_hashtbl[hashval], cl_strhash) {
577 if (same_name(clp->cl_recdir, dname)) 876 if (same_name(clp->cl_recdir, dname) &&
877 match_clientid_establishment(clp, use_exchange_id))
578 return clp; 878 return clp;
579 } 879 }
580 return NULL; 880 return NULL;
581} 881}
582 882
583static struct nfs4_client * 883static struct nfs4_client *
584find_unconfirmed_client_by_str(const char *dname, unsigned int hashval) 884find_unconfirmed_client_by_str(const char *dname, unsigned int hashval,
885 bool use_exchange_id)
585{ 886{
586 struct nfs4_client *clp; 887 struct nfs4_client *clp;
587 888
588 list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) { 889 list_for_each_entry(clp, &unconf_str_hashtbl[hashval], cl_strhash) {
589 if (same_name(clp->cl_recdir, dname)) 890 if (same_name(clp->cl_recdir, dname) &&
891 match_clientid_establishment(clp, use_exchange_id))
590 return clp; 892 return clp;
591 } 893 }
592 return NULL; 894 return NULL;
@@ -685,6 +987,534 @@ out_err:
685 return; 987 return;
686} 988}
687 989
990void
991nfsd4_set_statp(struct svc_rqst *rqstp, __be32 *statp)
992{
993 struct nfsd4_compoundres *resp = rqstp->rq_resp;
994
995 resp->cstate.statp = statp;
996}
997
998/*
999 * Dereference the result pages.
1000 */
1001static void
1002nfsd4_release_respages(struct page **respages, short resused)
1003{
1004 int i;
1005
1006 dprintk("--> %s\n", __func__);
1007 for (i = 0; i < resused; i++) {
1008 if (!respages[i])
1009 continue;
1010 put_page(respages[i]);
1011 respages[i] = NULL;
1012 }
1013}
1014
1015static void
1016nfsd4_copy_pages(struct page **topages, struct page **frompages, short count)
1017{
1018 int i;
1019
1020 for (i = 0; i < count; i++) {
1021 topages[i] = frompages[i];
1022 if (!topages[i])
1023 continue;
1024 get_page(topages[i]);
1025 }
1026}
1027
1028/*
1029 * Cache the reply pages up to NFSD_PAGES_PER_SLOT + 1, clearing the previous
1030 * pages. We add a page to NFSD_PAGES_PER_SLOT for the case where the total
1031 * length of the XDR response is less than se_fmaxresp_cached
1032 * (NFSD_PAGES_PER_SLOT * PAGE_SIZE) but the xdr_buf pages are used
1033 * for part of the reply (e.g. readdir).
1034 *
1035 * Store the base and length of the rq_res.head[0] page
1036 * of the NFSv4.1 data, just past the rpc header.
1037 */
1038void
1039nfsd4_store_cache_entry(struct nfsd4_compoundres *resp)
1040{
1041 struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
1042 struct svc_rqst *rqstp = resp->rqstp;
1043 struct nfsd4_compoundargs *args = rqstp->rq_argp;
1044 struct nfsd4_op *op = &args->ops[resp->opcnt];
1045 struct kvec *resv = &rqstp->rq_res.head[0];
1046
1047 dprintk("--> %s entry %p\n", __func__, entry);
1048
1049 /* Don't cache a failed OP_SEQUENCE. */
1050 if (resp->opcnt == 1 && op->opnum == OP_SEQUENCE && resp->cstate.status)
1051 return;
1052
1053 nfsd4_release_respages(entry->ce_respages, entry->ce_resused);
1054 entry->ce_opcnt = resp->opcnt;
1055 entry->ce_status = resp->cstate.status;
1056
1057 /*
1058 * Don't need a page to cache just the sequence operation - the slot
1059 * does this for us!
1060 */
1061
1062 if (nfsd4_not_cached(resp)) {
1063 entry->ce_resused = 0;
1064 entry->ce_rpchdrlen = 0;
1065 dprintk("%s Just cache SEQUENCE. ce_cachethis %d\n", __func__,
1066 resp->cstate.slot->sl_cache_entry.ce_cachethis);
1067 return;
1068 }
1069 entry->ce_resused = rqstp->rq_resused;
1070 if (entry->ce_resused > NFSD_PAGES_PER_SLOT + 1)
1071 entry->ce_resused = NFSD_PAGES_PER_SLOT + 1;
1072 nfsd4_copy_pages(entry->ce_respages, rqstp->rq_respages,
1073 entry->ce_resused);
1074 entry->ce_datav.iov_base = resp->cstate.statp;
1075 entry->ce_datav.iov_len = resv->iov_len - ((char *)resp->cstate.statp -
1076 (char *)page_address(rqstp->rq_respages[0]));
1077 /* Current request rpc header length*/
1078 entry->ce_rpchdrlen = (char *)resp->cstate.statp -
1079 (char *)page_address(rqstp->rq_respages[0]);
1080}
1081
1082/*
1083 * We keep the rpc header, but take the nfs reply from the replycache.
1084 */
1085static int
1086nfsd41_copy_replay_data(struct nfsd4_compoundres *resp,
1087 struct nfsd4_cache_entry *entry)
1088{
1089 struct svc_rqst *rqstp = resp->rqstp;
1090 struct kvec *resv = &resp->rqstp->rq_res.head[0];
1091 int len;
1092
1093 /* Current request rpc header length*/
1094 len = (char *)resp->cstate.statp -
1095 (char *)page_address(rqstp->rq_respages[0]);
1096 if (entry->ce_datav.iov_len + len > PAGE_SIZE) {
1097 dprintk("%s v41 cached reply too large (%Zd).\n", __func__,
1098 entry->ce_datav.iov_len);
1099 return 0;
1100 }
1101 /* copy the cached reply nfsd data past the current rpc header */
1102 memcpy((char *)resv->iov_base + len, entry->ce_datav.iov_base,
1103 entry->ce_datav.iov_len);
1104 resv->iov_len = len + entry->ce_datav.iov_len;
1105 return 1;
1106}
1107
1108/*
1109 * Keep the first page of the replay. Copy the NFSv4.1 data from the first
1110 * cached page. Replace any further replay pages from the cache.
1111 */
1112__be32
1113nfsd4_replay_cache_entry(struct nfsd4_compoundres *resp,
1114 struct nfsd4_sequence *seq)
1115{
1116 struct nfsd4_cache_entry *entry = &resp->cstate.slot->sl_cache_entry;
1117 __be32 status;
1118
1119 dprintk("--> %s entry %p\n", __func__, entry);
1120
1121 /*
1122 * If this is just the sequence operation, we did not keep
1123 * a page in the cache entry because we can just use the
1124 * slot info stored in struct nfsd4_sequence that was checked
1125 * against the slot in nfsd4_sequence().
1126 *
1127 * This occurs when seq->cachethis is FALSE, or when the client
1128 * session inactivity timer fires and a solo sequence operation
1129 * is sent (lease renewal).
1130 */
1131 if (seq && nfsd4_not_cached(resp)) {
1132 seq->maxslots = resp->cstate.session->se_fnumslots;
1133 return nfs_ok;
1134 }
1135
1136 if (!nfsd41_copy_replay_data(resp, entry)) {
1137 /*
1138 * Not enough room to use the replay rpc header, send the
1139 * cached header. Release all the allocated result pages.
1140 */
1141 svc_free_res_pages(resp->rqstp);
1142 nfsd4_copy_pages(resp->rqstp->rq_respages, entry->ce_respages,
1143 entry->ce_resused);
1144 } else {
1145 /* Release all but the first allocated result page */
1146
1147 resp->rqstp->rq_resused--;
1148 svc_free_res_pages(resp->rqstp);
1149
1150 nfsd4_copy_pages(&resp->rqstp->rq_respages[1],
1151 &entry->ce_respages[1],
1152 entry->ce_resused - 1);
1153 }
1154
1155 resp->rqstp->rq_resused = entry->ce_resused;
1156 resp->opcnt = entry->ce_opcnt;
1157 resp->cstate.iovlen = entry->ce_datav.iov_len + entry->ce_rpchdrlen;
1158 status = entry->ce_status;
1159
1160 return status;
1161}
1162
1163/*
1164 * Set the exchange_id flags returned by the server.
1165 */
1166static void
1167nfsd4_set_ex_flags(struct nfs4_client *new, struct nfsd4_exchange_id *clid)
1168{
1169 /* pNFS is not supported */
1170 new->cl_exchange_flags |= EXCHGID4_FLAG_USE_NON_PNFS;
1171
1172 /* Referrals are supported, Migration is not. */
1173 new->cl_exchange_flags |= EXCHGID4_FLAG_SUPP_MOVED_REFER;
1174
1175 /* set the wire flags to return to client. */
1176 clid->flags = new->cl_exchange_flags;
1177}
1178
1179__be32
1180nfsd4_exchange_id(struct svc_rqst *rqstp,
1181 struct nfsd4_compound_state *cstate,
1182 struct nfsd4_exchange_id *exid)
1183{
1184 struct nfs4_client *unconf, *conf, *new;
1185 int status;
1186 unsigned int strhashval;
1187 char dname[HEXDIR_LEN];
1188 nfs4_verifier verf = exid->verifier;
1189 u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
1190
1191 dprintk("%s rqstp=%p exid=%p clname.len=%u clname.data=%p "
1192 " ip_addr=%u flags %x, spa_how %d\n",
1193 __func__, rqstp, exid, exid->clname.len, exid->clname.data,
1194 ip_addr, exid->flags, exid->spa_how);
1195
1196 if (!check_name(exid->clname) || (exid->flags & ~EXCHGID4_FLAG_MASK_A))
1197 return nfserr_inval;
1198
1199 /* Currently only support SP4_NONE */
1200 switch (exid->spa_how) {
1201 case SP4_NONE:
1202 break;
1203 case SP4_SSV:
1204 return nfserr_encr_alg_unsupp;
1205 default:
1206 BUG(); /* checked by xdr code */
1207 case SP4_MACH_CRED:
1208 return nfserr_serverfault; /* no excuse :-/ */
1209 }
1210
1211 status = nfs4_make_rec_clidname(dname, &exid->clname);
1212
1213 if (status)
1214 goto error;
1215
1216 strhashval = clientstr_hashval(dname);
1217
1218 nfs4_lock_state();
1219 status = nfs_ok;
1220
1221 conf = find_confirmed_client_by_str(dname, strhashval, true);
1222 if (conf) {
1223 if (!same_verf(&verf, &conf->cl_verifier)) {
1224 /* 18.35.4 case 8 */
1225 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
1226 status = nfserr_not_same;
1227 goto out;
1228 }
1229 /* Client reboot: destroy old state */
1230 expire_client(conf);
1231 goto out_new;
1232 }
1233 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
1234 /* 18.35.4 case 9 */
1235 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
1236 status = nfserr_perm;
1237 goto out;
1238 }
1239 expire_client(conf);
1240 goto out_new;
1241 }
1242 if (ip_addr != conf->cl_addr &&
1243 !(exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A)) {
1244 /* Client collision. 18.35.4 case 3 */
1245 status = nfserr_clid_inuse;
1246 goto out;
1247 }
1248 /*
1249 * Set bit when the owner id and verifier map to an already
1250 * confirmed client id (18.35.3).
1251 */
1252 exid->flags |= EXCHGID4_FLAG_CONFIRMED_R;
1253
1254 /*
1255 * Falling into 18.35.4 case 2, possible router replay.
1256 * Leave confirmed record intact and return same result.
1257 */
1258 copy_verf(conf, &verf);
1259 new = conf;
1260 goto out_copy;
1261 } else {
1262 /* 18.35.4 case 7 */
1263 if (exid->flags & EXCHGID4_FLAG_UPD_CONFIRMED_REC_A) {
1264 status = nfserr_noent;
1265 goto out;
1266 }
1267 }
1268
1269 unconf = find_unconfirmed_client_by_str(dname, strhashval, true);
1270 if (unconf) {
1271 /*
1272 * Possible retry or client restart. Per 18.35.4 case 4,
1273 * a new unconfirmed record should be generated regardless
1274 * of whether any properties have changed.
1275 */
1276 expire_client(unconf);
1277 }
1278
1279out_new:
1280 /* Normal case */
1281 new = create_client(exid->clname, dname);
1282 if (new == NULL) {
1283 status = nfserr_resource;
1284 goto out;
1285 }
1286
1287 copy_verf(new, &verf);
1288 copy_cred(&new->cl_cred, &rqstp->rq_cred);
1289 new->cl_addr = ip_addr;
1290 gen_clid(new);
1291 gen_confirm(new);
1292 add_to_unconfirmed(new, strhashval);
1293out_copy:
1294 exid->clientid.cl_boot = new->cl_clientid.cl_boot;
1295 exid->clientid.cl_id = new->cl_clientid.cl_id;
1296
1297 new->cl_slot.sl_seqid = 0;
1298 exid->seqid = 1;
1299 nfsd4_set_ex_flags(new, exid);
1300
1301 dprintk("nfsd4_exchange_id seqid %d flags %x\n",
1302 new->cl_slot.sl_seqid, new->cl_exchange_flags);
1303 status = nfs_ok;
1304
1305out:
1306 nfs4_unlock_state();
1307error:
1308 dprintk("nfsd4_exchange_id returns %d\n", ntohl(status));
1309 return status;
1310}
1311
1312static int
1313check_slot_seqid(u32 seqid, struct nfsd4_slot *slot)
1314{
1315 dprintk("%s enter. seqid %d slot->sl_seqid %d\n", __func__, seqid,
1316 slot->sl_seqid);
1317
1318 /* The slot is in use, and no response has been sent. */
1319 if (slot->sl_inuse) {
1320 if (seqid == slot->sl_seqid)
1321 return nfserr_jukebox;
1322 else
1323 return nfserr_seq_misordered;
1324 }
1325 /* Normal */
1326 if (likely(seqid == slot->sl_seqid + 1))
1327 return nfs_ok;
1328 /* Replay */
1329 if (seqid == slot->sl_seqid)
1330 return nfserr_replay_cache;
1331 /* Wraparound */
1332 if (seqid == 1 && (slot->sl_seqid + 1) == 0)
1333 return nfs_ok;
1334 /* Misordered replay or misordered new request */
1335 return nfserr_seq_misordered;
1336}
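check_slot_seqid() is the per-slot sequencing state machine: a busy slot turns a duplicate into a retry (nfserr_jukebox) and anything else into nfserr_seq_misordered; on an idle slot, seqid+1 is normal progression, an equal seqid is a replay answered from the cache, and 1 following a wrapped 32-bit counter is accepted as normal. A compilable sketch of the same decision table, with local result codes standing in for the nfserr_* values:

#include <stdint.h>
#include <stdio.h>

enum { S_OK, S_RETRY /* jukebox */, S_MISORDERED, S_REPLAY };

static int check_slot(uint32_t seqid, uint32_t sl_seqid, int sl_inuse)
{
	if (sl_inuse)				/* request still executing */
		return seqid == sl_seqid ? S_RETRY : S_MISORDERED;
	if (seqid == sl_seqid + 1)		/* normal progression */
		return S_OK;
	if (seqid == sl_seqid)			/* duplicate: replay cache */
		return S_REPLAY;
	if (seqid == 1 && sl_seqid + 1 == 0)	/* 32-bit wraparound */
		return S_OK;
	return S_MISORDERED;
}

int main(void)
{
	printf("%d %d %d %d\n",
	       check_slot(5, 4, 0),		/* S_OK */
	       check_slot(4, 4, 0),		/* S_REPLAY */
	       check_slot(4, 4, 1),		/* S_RETRY */
	       check_slot(1, UINT32_MAX, 0));	/* S_OK via wraparound */
	return 0;
}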
1337
1338__be32
1339nfsd4_create_session(struct svc_rqst *rqstp,
1340 struct nfsd4_compound_state *cstate,
1341 struct nfsd4_create_session *cr_ses)
1342{
1343 u32 ip_addr = svc_addr_in(rqstp)->sin_addr.s_addr;
1344 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1345 struct nfs4_client *conf, *unconf;
1346 struct nfsd4_slot *slot = NULL;
1347 int status = 0;
1348
1349 nfs4_lock_state();
1350 unconf = find_unconfirmed_client(&cr_ses->clientid);
1351 conf = find_confirmed_client(&cr_ses->clientid);
1352
1353 if (conf) {
1354 slot = &conf->cl_slot;
1355 status = check_slot_seqid(cr_ses->seqid, slot);
1356 if (status == nfserr_replay_cache) {
1357 dprintk("Got a create_session replay! seqid= %d\n",
1358 slot->sl_seqid);
1359 cstate->slot = slot;
1360 cstate->status = status;
1361 /* Return the cached reply status */
1362 status = nfsd4_replay_cache_entry(resp, NULL);
1363 goto out;
1364 } else if (cr_ses->seqid != conf->cl_slot.sl_seqid + 1) {
1365 status = nfserr_seq_misordered;
1366 dprintk("Sequence misordered!\n");
1367 dprintk("Expected seqid= %d but got seqid= %d\n",
1368 slot->sl_seqid, cr_ses->seqid);
1369 goto out;
1370 }
1371 conf->cl_slot.sl_seqid++;
1372 } else if (unconf) {
1373 if (!same_creds(&unconf->cl_cred, &rqstp->rq_cred) ||
1374 (ip_addr != unconf->cl_addr)) {
1375 status = nfserr_clid_inuse;
1376 goto out;
1377 }
1378
1379 slot = &unconf->cl_slot;
1380 status = check_slot_seqid(cr_ses->seqid, slot);
1381 if (status) {
1382 /* an unconfirmed replay returns misordered */
1383 status = nfserr_seq_misordered;
1384 goto out;
1385 }
1386
1387 slot->sl_seqid++; /* from 0 to 1 */
1388 move_to_confirmed(unconf);
1389
1390 /*
1391 * We do not support RDMA or persistent sessions
1392 */
1393 cr_ses->flags &= ~SESSION4_PERSIST;
1394 cr_ses->flags &= ~SESSION4_RDMA;
1395
1396 conf = unconf;
1397 } else {
1398 status = nfserr_stale_clientid;
1399 goto out;
1400 }
1401
1402 status = alloc_init_session(rqstp, conf, cr_ses);
1403 if (status)
1404 goto out;
1405
1406 memcpy(cr_ses->sessionid.data, conf->cl_sessionid.data,
1407 NFS4_MAX_SESSIONID_LEN);
1408 cr_ses->seqid = slot->sl_seqid;
1409
1410 slot->sl_inuse = true;
1411 cstate->slot = slot;
1412 /* Ensure a page is used for the cache */
1413 slot->sl_cache_entry.ce_cachethis = 1;
1414out:
1415 nfs4_unlock_state();
1416 dprintk("%s returns %d\n", __func__, ntohl(status));
1417 return status;
1418}
1419
1420__be32
1421nfsd4_destroy_session(struct svc_rqst *r,
1422 struct nfsd4_compound_state *cstate,
1423 struct nfsd4_destroy_session *sessionid)
1424{
1425 struct nfsd4_session *ses;
1426 u32 status = nfserr_badsession;
1427
1428 /* Notes:
1429 * - The confirmed nfs4_client->cl_sessionid holds the destroyed sessionid
1430 * - Should we return nfserr_back_chan_busy if waiting for
1431 * callbacks on to-be-destroyed session?
1432 * - Do we need to clear any callback info from previous session?
1433 */
1434
1435 dump_sessionid(__func__, &sessionid->sessionid);
1436 spin_lock(&sessionid_lock);
1437 ses = find_in_sessionid_hashtbl(&sessionid->sessionid);
1438 if (!ses) {
1439 spin_unlock(&sessionid_lock);
1440 goto out;
1441 }
1442
1443 unhash_session(ses);
1444 spin_unlock(&sessionid_lock);
1445
1446 /* wait for callbacks */
1447 shutdown_callback_client(ses->se_client);
1448 nfsd4_put_session(ses);
1449 status = nfs_ok;
1450out:
1451 dprintk("%s returns %d\n", __func__, ntohl(status));
1452 return status;
1453}
1454
1455__be32
1456nfsd4_sequence(struct svc_rqst *rqstp,
1457 struct nfsd4_compound_state *cstate,
1458 struct nfsd4_sequence *seq)
1459{
1460 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1461 struct nfsd4_session *session;
1462 struct nfsd4_slot *slot;
1463 int status;
1464
1465 if (resp->opcnt != 1)
1466 return nfserr_sequence_pos;
1467
1468 spin_lock(&sessionid_lock);
1469 status = nfserr_badsession;
1470 session = find_in_sessionid_hashtbl(&seq->sessionid);
1471 if (!session)
1472 goto out;
1473
1474 status = nfserr_badslot;
1475 if (seq->slotid >= session->se_fnumslots)
1476 goto out;
1477
1478 slot = &session->se_slots[seq->slotid];
1479 dprintk("%s: slotid %d\n", __func__, seq->slotid);
1480
1481 status = check_slot_seqid(seq->seqid, slot);
1482 if (status == nfserr_replay_cache) {
1483 cstate->slot = slot;
1484 cstate->session = session;
1485 /* Return the cached reply status and set cstate->status
1486 * for nfsd4_svc_encode_compoundres processing */
1487 status = nfsd4_replay_cache_entry(resp, seq);
1488 cstate->status = nfserr_replay_cache;
1489 goto replay_cache;
1490 }
1491 if (status)
1492 goto out;
1493
1494 /* Success! bump slot seqid */
1495 slot->sl_inuse = true;
1496 slot->sl_seqid = seq->seqid;
1497 slot->sl_cache_entry.ce_cachethis = seq->cachethis;
1498 /* Always set the cache entry cachethis for solo sequence */
1499 if (nfsd4_is_solo_sequence(resp))
1500 slot->sl_cache_entry.ce_cachethis = 1;
1501
1502 cstate->slot = slot;
1503 cstate->session = session;
1504
1505replay_cache:
1506 /* Renew the clientid on success and on replay.
1507 * Hold a session reference until done processing the compound:
1508 * nfsd4_put_session called only if the cstate slot is set.
1509 */
1510 renew_client(session->se_client);
1511 nfsd4_get_session(session);
1512out:
1513 spin_unlock(&sessionid_lock);
1514 dprintk("%s: return %d\n", __func__, ntohl(status));
1515 return status;
1516}
1517
688__be32 1518__be32
689nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 1519nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
690 struct nfsd4_setclientid *setclid) 1520 struct nfsd4_setclientid *setclid)
@@ -716,14 +1546,13 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
716 strhashval = clientstr_hashval(dname); 1546 strhashval = clientstr_hashval(dname);
717 1547
718 nfs4_lock_state(); 1548 nfs4_lock_state();
719 conf = find_confirmed_client_by_str(dname, strhashval); 1549 conf = find_confirmed_client_by_str(dname, strhashval, false);
720 if (conf) { 1550 if (conf) {
721 /* RFC 3530 14.2.33 CASE 0: */ 1551 /* RFC 3530 14.2.33 CASE 0: */
722 status = nfserr_clid_inuse; 1552 status = nfserr_clid_inuse;
723 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred) 1553 if (!same_creds(&conf->cl_cred, &rqstp->rq_cred)) {
724 || conf->cl_addr != sin->sin_addr.s_addr) { 1554 dprintk("NFSD: setclientid: string in use by client"
725 dprintk("NFSD: setclientid: string in use by clientat %pI4\n", 1555 " at %pI4\n", &conf->cl_addr);
726 &conf->cl_addr);
727 goto out; 1556 goto out;
728 } 1557 }
729 } 1558 }
@@ -732,7 +1561,7 @@ nfsd4_setclientid(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
732 * has a description of SETCLIENTID request processing consisting 1561 * has a description of SETCLIENTID request processing consisting
733 * of 5 bullet points, labeled as CASE0 - CASE4 below. 1562 * of 5 bullet points, labeled as CASE0 - CASE4 below.
734 */ 1563 */
735 unconf = find_unconfirmed_client_by_str(dname, strhashval); 1564 unconf = find_unconfirmed_client_by_str(dname, strhashval, false);
736 status = nfserr_resource; 1565 status = nfserr_resource;
737 if (!conf) { 1566 if (!conf) {
738 /* 1567 /*
@@ -887,7 +1716,7 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
887 unsigned int hash = 1716 unsigned int hash =
888 clientstr_hashval(unconf->cl_recdir); 1717 clientstr_hashval(unconf->cl_recdir);
889 conf = find_confirmed_client_by_str(unconf->cl_recdir, 1718 conf = find_confirmed_client_by_str(unconf->cl_recdir,
890 hash); 1719 hash, false);
891 if (conf) { 1720 if (conf) {
892 nfsd4_remove_clid_dir(conf); 1721 nfsd4_remove_clid_dir(conf);
893 expire_client(conf); 1722 expire_client(conf);
@@ -923,11 +1752,13 @@ alloc_init_file(struct inode *ino)
923 1752
924 fp = kmem_cache_alloc(file_slab, GFP_KERNEL); 1753 fp = kmem_cache_alloc(file_slab, GFP_KERNEL);
925 if (fp) { 1754 if (fp) {
926 kref_init(&fp->fi_ref); 1755 atomic_set(&fp->fi_ref, 1);
927 INIT_LIST_HEAD(&fp->fi_hash); 1756 INIT_LIST_HEAD(&fp->fi_hash);
928 INIT_LIST_HEAD(&fp->fi_stateids); 1757 INIT_LIST_HEAD(&fp->fi_stateids);
929 INIT_LIST_HEAD(&fp->fi_delegations); 1758 INIT_LIST_HEAD(&fp->fi_delegations);
1759 spin_lock(&recall_lock);
930 list_add(&fp->fi_hash, &file_hashtbl[hashval]); 1760 list_add(&fp->fi_hash, &file_hashtbl[hashval]);
1761 spin_unlock(&recall_lock);
931 fp->fi_inode = igrab(ino); 1762 fp->fi_inode = igrab(ino);
932 fp->fi_id = current_fileid++; 1763 fp->fi_id = current_fileid++;
933 fp->fi_had_conflict = false; 1764 fp->fi_had_conflict = false;
@@ -1037,48 +1868,6 @@ alloc_init_open_stateowner(unsigned int strhashval, struct nfs4_client *clp, str
1037 return sop; 1868 return sop;
1038} 1869}
1039 1870
1040static void
1041release_stateid_lockowners(struct nfs4_stateid *open_stp)
1042{
1043 struct nfs4_stateowner *lock_sop;
1044
1045 while (!list_empty(&open_stp->st_lockowners)) {
1046 lock_sop = list_entry(open_stp->st_lockowners.next,
1047 struct nfs4_stateowner, so_perstateid);
1048 /* list_del(&open_stp->st_lockowners); */
1049 BUG_ON(lock_sop->so_is_open_owner);
1050 release_stateowner(lock_sop);
1051 }
1052}
1053
1054static void
1055unhash_stateowner(struct nfs4_stateowner *sop)
1056{
1057 struct nfs4_stateid *stp;
1058
1059 list_del(&sop->so_idhash);
1060 list_del(&sop->so_strhash);
1061 if (sop->so_is_open_owner)
1062 list_del(&sop->so_perclient);
1063 list_del(&sop->so_perstateid);
1064 while (!list_empty(&sop->so_stateids)) {
1065 stp = list_entry(sop->so_stateids.next,
1066 struct nfs4_stateid, st_perstateowner);
1067 if (sop->so_is_open_owner)
1068 release_stateid(stp, OPEN_STATE);
1069 else
1070 release_stateid(stp, LOCK_STATE);
1071 }
1072}
1073
1074static void
1075release_stateowner(struct nfs4_stateowner *sop)
1076{
1077 unhash_stateowner(sop);
1078 list_del(&sop->so_close_lru);
1079 nfs4_put_stateowner(sop);
1080}
1081
1082static inline void 1871static inline void
1083init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) { 1872init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *open) {
1084 struct nfs4_stateowner *sop = open->op_stateowner; 1873 struct nfs4_stateowner *sop = open->op_stateowner;
@@ -1100,30 +1889,13 @@ init_stateid(struct nfs4_stateid *stp, struct nfs4_file *fp, struct nfsd4_open *
1100 stp->st_stateid.si_generation = 0; 1889 stp->st_stateid.si_generation = 0;
1101 stp->st_access_bmap = 0; 1890 stp->st_access_bmap = 0;
1102 stp->st_deny_bmap = 0; 1891 stp->st_deny_bmap = 0;
1103 __set_bit(open->op_share_access, &stp->st_access_bmap); 1892 __set_bit(open->op_share_access & ~NFS4_SHARE_WANT_MASK,
1893 &stp->st_access_bmap);
1104 __set_bit(open->op_share_deny, &stp->st_deny_bmap); 1894 __set_bit(open->op_share_deny, &stp->st_deny_bmap);
1105 stp->st_openstp = NULL; 1895 stp->st_openstp = NULL;
1106} 1896}
1107 1897
1108static void 1898static void
1109release_stateid(struct nfs4_stateid *stp, int flags)
1110{
1111 struct file *filp = stp->st_vfs_file;
1112
1113 list_del(&stp->st_hash);
1114 list_del(&stp->st_perfile);
1115 list_del(&stp->st_perstateowner);
1116 if (flags & OPEN_STATE) {
1117 release_stateid_lockowners(stp);
1118 stp->st_vfs_file = NULL;
1119 nfsd_close(filp);
1120 } else if (flags & LOCK_STATE)
1121 locks_remove_posix(filp, (fl_owner_t) stp->st_stateowner);
1122 put_nfs4_file(stp->st_file);
1123 kmem_cache_free(stateid_slab, stp);
1124}
1125
1126static void
1127move_to_close_lru(struct nfs4_stateowner *sop) 1899move_to_close_lru(struct nfs4_stateowner *sop)
1128{ 1900{
1129 dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop); 1901 dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop);
@@ -1160,20 +1932,33 @@ find_file(struct inode *ino)
1160 unsigned int hashval = file_hashval(ino); 1932 unsigned int hashval = file_hashval(ino);
1161 struct nfs4_file *fp; 1933 struct nfs4_file *fp;
1162 1934
1935 spin_lock(&recall_lock);
1163 list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) { 1936 list_for_each_entry(fp, &file_hashtbl[hashval], fi_hash) {
1164 if (fp->fi_inode == ino) { 1937 if (fp->fi_inode == ino) {
1165 get_nfs4_file(fp); 1938 get_nfs4_file(fp);
1939 spin_unlock(&recall_lock);
1166 return fp; 1940 return fp;
1167 } 1941 }
1168 } 1942 }
1943 spin_unlock(&recall_lock);
1169 return NULL; 1944 return NULL;
1170} 1945}
1171 1946
1172static inline int access_valid(u32 x) 1947static inline int access_valid(u32 x, u32 minorversion)
1173{ 1948{
1174 if (x < NFS4_SHARE_ACCESS_READ) 1949 if ((x & NFS4_SHARE_ACCESS_MASK) < NFS4_SHARE_ACCESS_READ)
1175 return 0; 1950 return 0;
1176 if (x > NFS4_SHARE_ACCESS_BOTH) 1951 if ((x & NFS4_SHARE_ACCESS_MASK) > NFS4_SHARE_ACCESS_BOTH)
1952 return 0;
1953 x &= ~NFS4_SHARE_ACCESS_MASK;
1954 if (minorversion && x) {
1955 if ((x & NFS4_SHARE_WANT_MASK) > NFS4_SHARE_WANT_CANCEL)
1956 return 0;
1957 if ((x & NFS4_SHARE_WHEN_MASK) > NFS4_SHARE_PUSH_DELEG_WHEN_UNCONTENDED)
1958 return 0;
1959 x &= ~(NFS4_SHARE_WANT_MASK | NFS4_SHARE_WHEN_MASK);
1960 }
1961 if (x)
1177 return 0; 1962 return 0;
1178 return 1; 1963 return 1;
1179} 1964}
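For minorversion 1 the OPEN share_access word carries more than the access mode: the WANT bits request delegation behavior and the WHEN bits say when a deferred delegation should be signalled, so access_valid() peels the fields off mask by mask and rejects any leftover bits. A sketch of that decomposition, using the mask values I believe the NFSv4.1 headers define (0x000f/0xff00/0xf0000; treat them as assumptions):

#include <stdio.h>

/* Mask values as I believe nfs4.h defines them; treat as assumptions. */
#define SHARE_ACCESS_MASK 0x000f
#define SHARE_WANT_MASK   0xff00
#define SHARE_WHEN_MASK   0xf0000

int main(void)
{
	unsigned x = 0x0101;	/* ACCESS_READ | WANT_READ_DELEG (assumed) */
	unsigned leftover = x & ~(SHARE_ACCESS_MASK | SHARE_WANT_MASK |
				  SHARE_WHEN_MASK);

	printf("access=%#x want=%#x when=%#x leftover=%#x\n",
	       x & SHARE_ACCESS_MASK, x & SHARE_WANT_MASK,
	       x & SHARE_WHEN_MASK, leftover);
	/* any nonzero leftover would make access_valid() return 0 */
	return 0;
}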
@@ -1409,7 +2194,8 @@ static struct lock_manager_operations nfsd_lease_mng_ops = {
1409 2194
1410 2195
1411__be32 2196__be32
1412nfsd4_process_open1(struct nfsd4_open *open) 2197nfsd4_process_open1(struct nfsd4_compound_state *cstate,
2198 struct nfsd4_open *open)
1413{ 2199{
1414 clientid_t *clientid = &open->op_clientid; 2200 clientid_t *clientid = &open->op_clientid;
1415 struct nfs4_client *clp = NULL; 2201 struct nfs4_client *clp = NULL;
@@ -1432,10 +2218,13 @@ nfsd4_process_open1(struct nfsd4_open *open)
1432 return nfserr_expired; 2218 return nfserr_expired;
1433 goto renew; 2219 goto renew;
1434 } 2220 }
2221 /* When sessions are used, skip open sequenceid processing */
2222 if (nfsd4_has_session(cstate))
2223 goto renew;
1435 if (!sop->so_confirmed) { 2224 if (!sop->so_confirmed) {
1436 /* Replace unconfirmed owners without checking for replay. */ 2225 /* Replace unconfirmed owners without checking for replay. */
1437 clp = sop->so_client; 2226 clp = sop->so_client;
1438 release_stateowner(sop); 2227 release_openowner(sop);
1439 open->op_stateowner = NULL; 2228 open->op_stateowner = NULL;
1440 goto renew; 2229 goto renew;
1441 } 2230 }
@@ -1709,6 +2498,7 @@ out:
1709__be32 2498__be32
1710nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open) 2499nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_open *open)
1711{ 2500{
2501 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1712 struct nfs4_file *fp = NULL; 2502 struct nfs4_file *fp = NULL;
1713 struct inode *ino = current_fh->fh_dentry->d_inode; 2503 struct inode *ino = current_fh->fh_dentry->d_inode;
1714 struct nfs4_stateid *stp = NULL; 2504 struct nfs4_stateid *stp = NULL;
@@ -1716,7 +2506,7 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
1716 __be32 status; 2506 __be32 status;
1717 2507
1718 status = nfserr_inval; 2508 status = nfserr_inval;
1719 if (!access_valid(open->op_share_access) 2509 if (!access_valid(open->op_share_access, resp->cstate.minorversion)
1720 || !deny_valid(open->op_share_deny)) 2510 || !deny_valid(open->op_share_deny))
1721 goto out; 2511 goto out;
1722 /* 2512 /*
@@ -1764,12 +2554,17 @@ nfsd4_process_open2(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nf
1764 init_stateid(stp, fp, open); 2554 init_stateid(stp, fp, open);
1765 status = nfsd4_truncate(rqstp, current_fh, open); 2555 status = nfsd4_truncate(rqstp, current_fh, open);
1766 if (status) { 2556 if (status) {
1767 release_stateid(stp, OPEN_STATE); 2557 release_open_stateid(stp);
1768 goto out; 2558 goto out;
1769 } 2559 }
2560 if (nfsd4_has_session(&resp->cstate))
2561 update_stateid(&stp->st_stateid);
1770 } 2562 }
1771 memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t)); 2563 memcpy(&open->op_stateid, &stp->st_stateid, sizeof(stateid_t));
1772 2564
2565 if (nfsd4_has_session(&resp->cstate))
2566 open->op_stateowner->so_confirmed = 1;
2567
1773 /* 2568 /*
1774 * Attempt to hand out a delegation. No error return, because the 2569 * Attempt to hand out a delegation. No error return, because the
1775 * OPEN succeeds even if we fail. 2570 * OPEN succeeds even if we fail.
@@ -1790,7 +2585,8 @@ out:
1790 * To finish the open response, we just need to set the rflags. 2585 * To finish the open response, we just need to set the rflags.
1791 */ 2586 */
1792 open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX; 2587 open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX;
1793 if (!open->op_stateowner->so_confirmed) 2588 if (!open->op_stateowner->so_confirmed &&
2589 !nfsd4_has_session(&resp->cstate))
1794 open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM; 2590 open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM;
1795 2591
1796 return status; 2592 return status;
@@ -1898,7 +2694,7 @@ nfs4_laundromat(void)
1898 } 2694 }
1899 dprintk("NFSD: purging unused open stateowner (so_id %d)\n", 2695 dprintk("NFSD: purging unused open stateowner (so_id %d)\n",
1900 sop->so_id); 2696 sop->so_id);
1901 release_stateowner(sop); 2697 release_openowner(sop);
1902 } 2698 }
1903 if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) 2699 if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT)
1904 clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT; 2700 clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT;
@@ -1983,10 +2779,7 @@ out:
1983static inline __be32 2779static inline __be32
1984check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags) 2780check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
1985{ 2781{
1986 /* Trying to call delegreturn with a special stateid? Yuch: */ 2782 if (ONE_STATEID(stateid) && (flags & RD_STATE))
1987 if (!(flags & (RD_STATE | WR_STATE)))
1988 return nfserr_bad_stateid;
1989 else if (ONE_STATEID(stateid) && (flags & RD_STATE))
1990 return nfs_ok; 2783 return nfs_ok;
1991 else if (locks_in_grace()) { 2784 else if (locks_in_grace()) {
1992 /* Answer in remaining cases depends on existence of 2785 /* Answer in remaining cases depends on existence of
@@ -2005,14 +2798,20 @@ check_special_stateids(svc_fh *current_fh, stateid_t *stateid, int flags)
2005 * that are not able to provide mandatory locking. 2798 * that are not able to provide mandatory locking.
2006 */ 2799 */
2007static inline int 2800static inline int
2008io_during_grace_disallowed(struct inode *inode, int flags) 2801grace_disallows_io(struct inode *inode)
2009{ 2802{
2010 return locks_in_grace() && (flags & (RD_STATE | WR_STATE)) 2803 return locks_in_grace() && mandatory_lock(inode);
2011 && mandatory_lock(inode);
2012} 2804}
2013 2805
2014static int check_stateid_generation(stateid_t *in, stateid_t *ref) 2806static int check_stateid_generation(stateid_t *in, stateid_t *ref, int flags)
2015{ 2807{
2808 /*
2809 * When sessions are used the stateid generation number is ignored
2810 * when it is zero.
2811 */
2812 if ((flags & HAS_SESSION) && in->si_generation == 0)
2813 goto out;
2814
2016 /* If the client sends us a stateid from the future, it's buggy: */ 2815 /* If the client sends us a stateid from the future, it's buggy: */
2017 if (in->si_generation > ref->si_generation) 2816 if (in->si_generation > ref->si_generation)
2018 return nfserr_bad_stateid; 2817 return nfserr_bad_stateid;
@@ -2028,74 +2827,77 @@ static int check_stateid_generation(stateid_t *in, stateid_t *ref)
2028 */ 2827 */
2029 if (in->si_generation < ref->si_generation) 2828 if (in->si_generation < ref->si_generation)
2030 return nfserr_old_stateid; 2829 return nfserr_old_stateid;
2830out:
2031 return nfs_ok; 2831 return nfs_ok;
2032} 2832}
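check_stateid_generation() now takes flags so that, under a session, a zero generation means "current generation, whatever it is"; otherwise a generation from the future is a client bug (nfserr_bad_stateid) and one from the past is nfserr_old_stateid. A standalone sketch of the comparison, with local result codes in place of the nfserr_* values:

#include <stdio.h>

enum { GEN_OK, GEN_BAD, GEN_OLD };

static int check_generation(unsigned in, unsigned ref, int has_session)
{
	if (has_session && in == 0)
		return GEN_OK;	/* v4.1: zero means "current generation" */
	if (in > ref)
		return GEN_BAD;	/* stateid from the future: client bug */
	if (in < ref)
		return GEN_OLD;	/* stale copy of a since-updated stateid */
	return GEN_OK;
}

int main(void)
{
	printf("%d %d %d\n",
	       check_generation(0, 7, 1),	/* GEN_OK under a session */
	       check_generation(9, 7, 0),	/* GEN_BAD */
	       check_generation(5, 7, 0));	/* GEN_OLD */
	return 0;
}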
2033 2833
2834static int is_delegation_stateid(stateid_t *stateid)
2835{
2836 return stateid->si_fileid == 0;
2837}
2838
2034/* 2839/*
2035* Checks for stateid operations 2840* Checks for stateid operations
2036*/ 2841*/
2037__be32 2842__be32
2038nfs4_preprocess_stateid_op(struct svc_fh *current_fh, stateid_t *stateid, int flags, struct file **filpp) 2843nfs4_preprocess_stateid_op(struct nfsd4_compound_state *cstate,
2844 stateid_t *stateid, int flags, struct file **filpp)
2039{ 2845{
2040 struct nfs4_stateid *stp = NULL; 2846 struct nfs4_stateid *stp = NULL;
2041 struct nfs4_delegation *dp = NULL; 2847 struct nfs4_delegation *dp = NULL;
2042 stateid_t *stidp; 2848 struct svc_fh *current_fh = &cstate->current_fh;
2043 struct inode *ino = current_fh->fh_dentry->d_inode; 2849 struct inode *ino = current_fh->fh_dentry->d_inode;
2044 __be32 status; 2850 __be32 status;
2045 2851
2046 dprintk("NFSD: preprocess_stateid_op: stateid = (%08x/%08x/%08x/%08x)\n",
2047 stateid->si_boot, stateid->si_stateownerid,
2048 stateid->si_fileid, stateid->si_generation);
2049 if (filpp) 2852 if (filpp)
2050 *filpp = NULL; 2853 *filpp = NULL;
2051 2854
2052 if (io_during_grace_disallowed(ino, flags)) 2855 if (grace_disallows_io(ino))
2053 return nfserr_grace; 2856 return nfserr_grace;
2054 2857
2858 if (nfsd4_has_session(cstate))
2859 flags |= HAS_SESSION;
2860
2055 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid)) 2861 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
2056 return check_special_stateids(current_fh, stateid, flags); 2862 return check_special_stateids(current_fh, stateid, flags);
2057 2863
2058 /* STALE STATEID */
2059 status = nfserr_stale_stateid; 2864 status = nfserr_stale_stateid;
2060 if (STALE_STATEID(stateid)) 2865 if (STALE_STATEID(stateid))
2061 goto out; 2866 goto out;
2062 2867
2063 /* BAD STATEID */
2064 status = nfserr_bad_stateid; 2868 status = nfserr_bad_stateid;
2065 if (!stateid->si_fileid) { /* delegation stateid */ 2869 if (is_delegation_stateid(stateid)) {
2066 if(!(dp = find_delegation_stateid(ino, stateid))) { 2870 dp = find_delegation_stateid(ino, stateid);
2067 dprintk("NFSD: delegation stateid not found\n"); 2871 if (!dp)
2068 goto out; 2872 goto out;
2069 } 2873 status = check_stateid_generation(stateid, &dp->dl_stateid,
2070 stidp = &dp->dl_stateid; 2874 flags);
2875 if (status)
2876 goto out;
2877 status = nfs4_check_delegmode(dp, flags);
2878 if (status)
2879 goto out;
2880 renew_client(dp->dl_client);
2881 if (filpp)
2882 *filpp = dp->dl_vfs_file;
2071 } else { /* open or lock stateid */ 2883 } else { /* open or lock stateid */
2072 if (!(stp = find_stateid(stateid, flags))) { 2884 stp = find_stateid(stateid, flags);
2073 dprintk("NFSD: open or lock stateid not found\n"); 2885 if (!stp)
2074 goto out; 2886 goto out;
2075 } 2887 if (nfs4_check_fh(current_fh, stp))
2076 if ((flags & CHECK_FH) && nfs4_check_fh(current_fh, stp))
2077 goto out; 2888 goto out;
2078 if (!stp->st_stateowner->so_confirmed) 2889 if (!stp->st_stateowner->so_confirmed)
2079 goto out; 2890 goto out;
2080 stidp = &stp->st_stateid; 2891 status = check_stateid_generation(stateid, &stp->st_stateid,
2081 } 2892 flags);
2082 status = check_stateid_generation(stateid, stidp); 2893 if (status)
2083 if (status) 2894 goto out;
2084 goto out; 2895 status = nfs4_check_openmode(stp, flags);
2085 if (stp) { 2896 if (status)
2086 if ((status = nfs4_check_openmode(stp,flags)))
2087 goto out; 2897 goto out;
2088 renew_client(stp->st_stateowner->so_client); 2898 renew_client(stp->st_stateowner->so_client);
2089 if (filpp) 2899 if (filpp)
2090 *filpp = stp->st_vfs_file; 2900 *filpp = stp->st_vfs_file;
2091 } else {
2092 if ((status = nfs4_check_delegmode(dp, flags)))
2093 goto out;
2094 renew_client(dp->dl_client);
2095 if (flags & DELEG_RET)
2096 unhash_delegation(dp);
2097 if (filpp)
2098 *filpp = dp->dl_vfs_file;
2099 } 2901 }
2100 status = nfs_ok; 2902 status = nfs_ok;
2101out: 2903out:
@@ -2113,10 +2915,14 @@ setlkflg (int type)
2113 * Checks for sequence id mutating operations. 2915 * Checks for sequence id mutating operations.
2114 */ 2916 */
2115static __be32 2917static __be32
2116nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *stateid, int flags, struct nfs4_stateowner **sopp, struct nfs4_stateid **stpp, struct nfsd4_lock *lock) 2918nfs4_preprocess_seqid_op(struct nfsd4_compound_state *cstate, u32 seqid,
2919 stateid_t *stateid, int flags,
2920 struct nfs4_stateowner **sopp,
2921 struct nfs4_stateid **stpp, struct nfsd4_lock *lock)
2117{ 2922{
2118 struct nfs4_stateid *stp; 2923 struct nfs4_stateid *stp;
2119 struct nfs4_stateowner *sop; 2924 struct nfs4_stateowner *sop;
2925 struct svc_fh *current_fh = &cstate->current_fh;
2120 __be32 status; 2926 __be32 status;
2121 2927
2122 dprintk("NFSD: preprocess_seqid_op: seqid=%d " 2928 dprintk("NFSD: preprocess_seqid_op: seqid=%d "
@@ -2134,6 +2940,10 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
2134 2940
2135 if (STALE_STATEID(stateid)) 2941 if (STALE_STATEID(stateid))
2136 return nfserr_stale_stateid; 2942 return nfserr_stale_stateid;
2943
2944 if (nfsd4_has_session(cstate))
2945 flags |= HAS_SESSION;
2946
2137 /* 2947 /*
2138 * We return BAD_STATEID if filehandle doesn't match stateid, 2948 * We return BAD_STATEID if filehandle doesn't match stateid,
 2139 * the confirmed flag is incorrectly set, or the generation 2949
@@ -2166,8 +2976,9 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
2166 if (lock->lk_is_new) { 2976 if (lock->lk_is_new) {
2167 if (!sop->so_is_open_owner) 2977 if (!sop->so_is_open_owner)
2168 return nfserr_bad_stateid; 2978 return nfserr_bad_stateid;
2169 if (!same_clid(&clp->cl_clientid, lockclid)) 2979 if (!(flags & HAS_SESSION) &&
2170 return nfserr_bad_stateid; 2980 !same_clid(&clp->cl_clientid, lockclid))
2981 return nfserr_bad_stateid;
2171 /* stp is the open stateid */ 2982 /* stp is the open stateid */
2172 status = nfs4_check_openmode(stp, lkflg); 2983 status = nfs4_check_openmode(stp, lkflg);
2173 if (status) 2984 if (status)
@@ -2190,7 +3001,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
2190 * For the moment, we ignore the possibility of 3001 * For the moment, we ignore the possibility of
2191 * generation number wraparound. 3002 * generation number wraparound.
2192 */ 3003 */
2193 if (seqid != sop->so_seqid) 3004 if (!(flags & HAS_SESSION) && seqid != sop->so_seqid)
2194 goto check_replay; 3005 goto check_replay;
2195 3006
2196 if (sop->so_confirmed && flags & CONFIRM) { 3007 if (sop->so_confirmed && flags & CONFIRM) {
@@ -2203,7 +3014,7 @@ nfs4_preprocess_seqid_op(struct svc_fh *current_fh, u32 seqid, stateid_t *statei
2203 " confirmed yet!\n"); 3014 " confirmed yet!\n");
2204 return nfserr_bad_stateid; 3015 return nfserr_bad_stateid;
2205 } 3016 }
2206 status = check_stateid_generation(stateid, &stp->st_stateid); 3017 status = check_stateid_generation(stateid, &stp->st_stateid, flags);
2207 if (status) 3018 if (status)
2208 return status; 3019 return status;
2209 renew_client(sop->so_client); 3020 renew_client(sop->so_client);
@@ -2239,7 +3050,7 @@ nfsd4_open_confirm(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2239 3050
2240 nfs4_lock_state(); 3051 nfs4_lock_state();
2241 3052
2242 if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3053 if ((status = nfs4_preprocess_seqid_op(cstate,
2243 oc->oc_seqid, &oc->oc_req_stateid, 3054 oc->oc_seqid, &oc->oc_req_stateid,
2244 CONFIRM | OPEN_STATE, 3055 CONFIRM | OPEN_STATE,
2245 &oc->oc_stateowner, &stp, NULL))) 3056 &oc->oc_stateowner, &stp, NULL)))
@@ -2304,12 +3115,12 @@ nfsd4_open_downgrade(struct svc_rqst *rqstp,
2304 (int)cstate->current_fh.fh_dentry->d_name.len, 3115 (int)cstate->current_fh.fh_dentry->d_name.len,
2305 cstate->current_fh.fh_dentry->d_name.name); 3116 cstate->current_fh.fh_dentry->d_name.name);
2306 3117
2307 if (!access_valid(od->od_share_access) 3118 if (!access_valid(od->od_share_access, cstate->minorversion)
2308 || !deny_valid(od->od_share_deny)) 3119 || !deny_valid(od->od_share_deny))
2309 return nfserr_inval; 3120 return nfserr_inval;
2310 3121
2311 nfs4_lock_state(); 3122 nfs4_lock_state();
2312 if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3123 if ((status = nfs4_preprocess_seqid_op(cstate,
2313 od->od_seqid, 3124 od->od_seqid,
2314 &od->od_stateid, 3125 &od->od_stateid,
2315 OPEN_STATE, 3126 OPEN_STATE,
@@ -2362,7 +3173,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2362 3173
2363 nfs4_lock_state(); 3174 nfs4_lock_state();
2364 /* check close_lru for replay */ 3175 /* check close_lru for replay */
2365 if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3176 if ((status = nfs4_preprocess_seqid_op(cstate,
2366 close->cl_seqid, 3177 close->cl_seqid,
2367 &close->cl_stateid, 3178 &close->cl_stateid,
2368 OPEN_STATE | CLOSE_STATE, 3179 OPEN_STATE | CLOSE_STATE,
@@ -2373,7 +3184,7 @@ nfsd4_close(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2373 memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t)); 3184 memcpy(&close->cl_stateid, &stp->st_stateid, sizeof(stateid_t));
2374 3185
2375 /* release_stateid() calls nfsd_close() if needed */ 3186 /* release_stateid() calls nfsd_close() if needed */
2376 release_stateid(stp, OPEN_STATE); 3187 release_open_stateid(stp);
2377 3188
2378 /* place unused nfs4_stateowners on so_close_lru list to be 3189 /* place unused nfs4_stateowners on so_close_lru list to be
2379 * released by the laundromat service after the lease period 3190 * released by the laundromat service after the lease period
@@ -2394,16 +3205,40 @@ __be32
2394nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 3205nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2395 struct nfsd4_delegreturn *dr) 3206 struct nfsd4_delegreturn *dr)
2396{ 3207{
3208 struct nfs4_delegation *dp;
3209 stateid_t *stateid = &dr->dr_stateid;
3210 struct inode *inode;
2397 __be32 status; 3211 __be32 status;
3212 int flags = 0;
2398 3213
2399 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) 3214 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0)))
2400 goto out; 3215 return status;
3216 inode = cstate->current_fh.fh_dentry->d_inode;
2401 3217
3218 if (nfsd4_has_session(cstate))
3219 flags |= HAS_SESSION;
2402 nfs4_lock_state(); 3220 nfs4_lock_state();
2403 status = nfs4_preprocess_stateid_op(&cstate->current_fh, 3221 status = nfserr_bad_stateid;
2404 &dr->dr_stateid, DELEG_RET, NULL); 3222 if (ZERO_STATEID(stateid) || ONE_STATEID(stateid))
2405 nfs4_unlock_state(); 3223 goto out;
3224 status = nfserr_stale_stateid;
3225 if (STALE_STATEID(stateid))
3226 goto out;
3227 status = nfserr_bad_stateid;
3228 if (!is_delegation_stateid(stateid))
3229 goto out;
3230 dp = find_delegation_stateid(inode, stateid);
3231 if (!dp)
3232 goto out;
3233 status = check_stateid_generation(stateid, &dp->dl_stateid, flags);
3234 if (status)
3235 goto out;
3236 renew_client(dp->dl_client);
3237
3238 unhash_delegation(dp);
2406out: 3239out:
3240 nfs4_unlock_state();
3241
2407 return status; 3242 return status;
2408} 3243}
2409 3244
@@ -2684,11 +3519,12 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2684 struct nfs4_file *fp; 3519 struct nfs4_file *fp;
2685 3520
2686 status = nfserr_stale_clientid; 3521 status = nfserr_stale_clientid;
2687 if (STALE_CLIENTID(&lock->lk_new_clientid)) 3522 if (!nfsd4_has_session(cstate) &&
3523 STALE_CLIENTID(&lock->lk_new_clientid))
2688 goto out; 3524 goto out;
2689 3525
2690 /* validate and update open stateid and open seqid */ 3526 /* validate and update open stateid and open seqid */
2691 status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3527 status = nfs4_preprocess_seqid_op(cstate,
2692 lock->lk_new_open_seqid, 3528 lock->lk_new_open_seqid,
2693 &lock->lk_new_open_stateid, 3529 &lock->lk_new_open_stateid,
2694 OPEN_STATE, 3530 OPEN_STATE,
@@ -2715,7 +3551,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2715 goto out; 3551 goto out;
2716 } else { 3552 } else {
2717 /* lock (lock owner + lock stateid) already exists */ 3553 /* lock (lock owner + lock stateid) already exists */
2718 status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3554 status = nfs4_preprocess_seqid_op(cstate,
2719 lock->lk_old_lock_seqid, 3555 lock->lk_old_lock_seqid,
2720 &lock->lk_old_lock_stateid, 3556 &lock->lk_old_lock_stateid,
2721 LOCK_STATE, 3557 LOCK_STATE,
@@ -2788,7 +3624,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2788 } 3624 }
2789out: 3625out:
2790 if (status && lock->lk_is_new && lock_sop) 3626 if (status && lock->lk_is_new && lock_sop)
2791 release_stateowner(lock_sop); 3627 release_lockowner(lock_sop);
2792 if (lock->lk_replay_owner) { 3628 if (lock->lk_replay_owner) {
2793 nfs4_get_stateowner(lock->lk_replay_owner); 3629 nfs4_get_stateowner(lock->lk_replay_owner);
2794 cstate->replay_owner = lock->lk_replay_owner; 3630 cstate->replay_owner = lock->lk_replay_owner;
@@ -2838,7 +3674,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2838 nfs4_lock_state(); 3674 nfs4_lock_state();
2839 3675
2840 status = nfserr_stale_clientid; 3676 status = nfserr_stale_clientid;
2841 if (STALE_CLIENTID(&lockt->lt_clientid)) 3677 if (!nfsd4_has_session(cstate) && STALE_CLIENTID(&lockt->lt_clientid))
2842 goto out; 3678 goto out;
2843 3679
2844 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) { 3680 if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) {
@@ -2911,7 +3747,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
2911 3747
2912 nfs4_lock_state(); 3748 nfs4_lock_state();
2913 3749
2914 if ((status = nfs4_preprocess_seqid_op(&cstate->current_fh, 3750 if ((status = nfs4_preprocess_seqid_op(cstate,
2915 locku->lu_seqid, 3751 locku->lu_seqid,
2916 &locku->lu_stateid, 3752 &locku->lu_stateid,
2917 LOCK_STATE, 3753 LOCK_STATE,
@@ -3037,7 +3873,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp,
3037 /* unhash_stateowner deletes so_perclient only 3873 /* unhash_stateowner deletes so_perclient only
3038 * for openowners. */ 3874 * for openowners. */
3039 list_del(&sop->so_perclient); 3875 list_del(&sop->so_perclient);
3040 release_stateowner(sop); 3876 release_lockowner(sop);
3041 } 3877 }
3042out: 3878out:
3043 nfs4_unlock_state(); 3879 nfs4_unlock_state();
@@ -3051,12 +3887,12 @@ alloc_reclaim(void)
3051} 3887}
3052 3888
3053int 3889int
3054nfs4_has_reclaimed_state(const char *name) 3890nfs4_has_reclaimed_state(const char *name, bool use_exchange_id)
3055{ 3891{
3056 unsigned int strhashval = clientstr_hashval(name); 3892 unsigned int strhashval = clientstr_hashval(name);
3057 struct nfs4_client *clp; 3893 struct nfs4_client *clp;
3058 3894
3059 clp = find_confirmed_client_by_str(name, strhashval); 3895 clp = find_confirmed_client_by_str(name, strhashval, use_exchange_id);
3060 return clp ? 1 : 0; 3896 return clp ? 1 : 0;
3061} 3897}
3062 3898
@@ -3153,6 +3989,8 @@ nfs4_state_init(void)
3153 INIT_LIST_HEAD(&unconf_str_hashtbl[i]); 3989 INIT_LIST_HEAD(&unconf_str_hashtbl[i]);
3154 INIT_LIST_HEAD(&unconf_id_hashtbl[i]); 3990 INIT_LIST_HEAD(&unconf_id_hashtbl[i]);
3155 } 3991 }
3992 for (i = 0; i < SESSION_HASH_SIZE; i++)
3993 INIT_LIST_HEAD(&sessionid_hashtbl[i]);
3156 for (i = 0; i < FILE_HASH_SIZE; i++) { 3994 for (i = 0; i < FILE_HASH_SIZE; i++) {
3157 INIT_LIST_HEAD(&file_hashtbl[i]); 3995 INIT_LIST_HEAD(&file_hashtbl[i]);
3158 } 3996 }
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 9250067943d8..b820c311931c 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -45,6 +45,7 @@
45#include <linux/fs.h> 45#include <linux/fs.h>
46#include <linux/namei.h> 46#include <linux/namei.h>
47#include <linux/vfs.h> 47#include <linux/vfs.h>
48#include <linux/utsname.h>
48#include <linux/sunrpc/xdr.h> 49#include <linux/sunrpc/xdr.h>
49#include <linux/sunrpc/svc.h> 50#include <linux/sunrpc/svc.h>
50#include <linux/sunrpc/clnt.h> 51#include <linux/sunrpc/clnt.h>
@@ -188,6 +189,11 @@ static __be32 *read_buf(struct nfsd4_compoundargs *argp, u32 nbytes)
188 return p; 189 return p;
189} 190}
190 191
192static int zero_clientid(clientid_t *clid)
193{
194 return (clid->cl_boot == 0) && (clid->cl_id == 0);
195}
 195}
 196
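This helper backs a v4.1 rule enforced in the decoders below: once a session identifies the client, operations that still carry a clientid on the wire (LOCKT, RELEASE_LOCKOWNER) must send it as all zeroes, and a v4.1 request with a nonzero clientid is rejected with nfserr_inval.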
191static int 197static int
192defer_free(struct nfsd4_compoundargs *argp, 198defer_free(struct nfsd4_compoundargs *argp,
193 void (*release)(const void *), void *p) 199 void (*release)(const void *), void *p)
@@ -230,6 +236,7 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
230 236
231 bmval[0] = 0; 237 bmval[0] = 0;
232 bmval[1] = 0; 238 bmval[1] = 0;
239 bmval[2] = 0;
233 240
234 READ_BUF(4); 241 READ_BUF(4);
235 READ32(bmlen); 242 READ32(bmlen);
@@ -241,13 +248,27 @@ nfsd4_decode_bitmap(struct nfsd4_compoundargs *argp, u32 *bmval)
241 READ32(bmval[0]); 248 READ32(bmval[0]);
242 if (bmlen > 1) 249 if (bmlen > 1)
243 READ32(bmval[1]); 250 READ32(bmval[1]);
251 if (bmlen > 2)
252 READ32(bmval[2]);
244 253
245 DECODE_TAIL; 254 DECODE_TAIL;
246} 255}
247 256
257static u32 nfsd_attrmask[] = {
258 NFSD_WRITEABLE_ATTRS_WORD0,
259 NFSD_WRITEABLE_ATTRS_WORD1,
260 NFSD_WRITEABLE_ATTRS_WORD2
261};
262
263static u32 nfsd41_ex_attrmask[] = {
264 NFSD_SUPPATTR_EXCLCREAT_WORD0,
265 NFSD_SUPPATTR_EXCLCREAT_WORD1,
266 NFSD_SUPPATTR_EXCLCREAT_WORD2
267};
268
248static __be32 269static __be32
249nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *iattr, 270nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, u32 *writable,
250 struct nfs4_acl **acl) 271 struct iattr *iattr, struct nfs4_acl **acl)
251{ 272{
252 int expected_len, len = 0; 273 int expected_len, len = 0;
253 u32 dummy32; 274 u32 dummy32;
@@ -263,9 +284,12 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
263 * According to spec, unsupported attributes return ERR_ATTRNOTSUPP; 284 * According to spec, unsupported attributes return ERR_ATTRNOTSUPP;
264 * read-only attributes return ERR_INVAL. 285 * read-only attributes return ERR_INVAL.
265 */ 286 */
266 if ((bmval[0] & ~NFSD_SUPPORTED_ATTRS_WORD0) || (bmval[1] & ~NFSD_SUPPORTED_ATTRS_WORD1)) 287 if ((bmval[0] & ~nfsd_suppattrs0(argp->minorversion)) ||
288 (bmval[1] & ~nfsd_suppattrs1(argp->minorversion)) ||
289 (bmval[2] & ~nfsd_suppattrs2(argp->minorversion)))
267 return nfserr_attrnotsupp; 290 return nfserr_attrnotsupp;
268 if ((bmval[0] & ~NFSD_WRITEABLE_ATTRS_WORD0) || (bmval[1] & ~NFSD_WRITEABLE_ATTRS_WORD1)) 291 if ((bmval[0] & ~writable[0]) || (bmval[1] & ~writable[1]) ||
292 (bmval[2] & ~writable[2]))
269 return nfserr_inval; 293 return nfserr_inval;
270 294
271 READ_BUF(4); 295 READ_BUF(4);
@@ -400,6 +424,7 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia
400 goto xdr_error; 424 goto xdr_error;
401 } 425 }
402 } 426 }
427 BUG_ON(bmval[2]); /* no such writeable attr supported yet */
403 if (len != expected_len) 428 if (len != expected_len)
404 goto xdr_error; 429 goto xdr_error;
405 430
@@ -493,7 +518,9 @@ nfsd4_decode_create(struct nfsd4_compoundargs *argp, struct nfsd4_create *create
493 if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval))) 518 if ((status = check_filename(create->cr_name, create->cr_namelen, nfserr_inval)))
494 return status; 519 return status;
495 520
496 if ((status = nfsd4_decode_fattr(argp, create->cr_bmval, &create->cr_iattr, &create->cr_acl))) 521 status = nfsd4_decode_fattr(argp, create->cr_bmval, nfsd_attrmask,
522 &create->cr_iattr, &create->cr_acl);
523 if (status)
497 goto out; 524 goto out;
498 525
499 DECODE_TAIL; 526 DECODE_TAIL;
@@ -583,6 +610,8 @@ nfsd4_decode_lockt(struct nfsd4_compoundargs *argp, struct nfsd4_lockt *lockt)
583 READ_BUF(lockt->lt_owner.len); 610 READ_BUF(lockt->lt_owner.len);
584 READMEM(lockt->lt_owner.data, lockt->lt_owner.len); 611 READMEM(lockt->lt_owner.data, lockt->lt_owner.len);
585 612
613 if (argp->minorversion && !zero_clientid(&lockt->lt_clientid))
614 return nfserr_inval;
586 DECODE_TAIL; 615 DECODE_TAIL;
587} 616}
588 617
@@ -652,13 +681,26 @@ nfsd4_decode_open(struct nfsd4_compoundargs *argp, struct nfsd4_open *open)
652 switch (open->op_createmode) { 681 switch (open->op_createmode) {
653 case NFS4_CREATE_UNCHECKED: 682 case NFS4_CREATE_UNCHECKED:
654 case NFS4_CREATE_GUARDED: 683 case NFS4_CREATE_GUARDED:
655 if ((status = nfsd4_decode_fattr(argp, open->op_bmval, &open->op_iattr, &open->op_acl))) 684 status = nfsd4_decode_fattr(argp, open->op_bmval,
685 nfsd_attrmask, &open->op_iattr, &open->op_acl);
686 if (status)
656 goto out; 687 goto out;
657 break; 688 break;
658 case NFS4_CREATE_EXCLUSIVE: 689 case NFS4_CREATE_EXCLUSIVE:
659 READ_BUF(8); 690 READ_BUF(8);
660 COPYMEM(open->op_verf.data, 8); 691 COPYMEM(open->op_verf.data, 8);
661 break; 692 break;
693 case NFS4_CREATE_EXCLUSIVE4_1:
694 if (argp->minorversion < 1)
695 goto xdr_error;
696 READ_BUF(8);
697 COPYMEM(open->op_verf.data, 8);
698 status = nfsd4_decode_fattr(argp, open->op_bmval,
699 nfsd41_ex_attrmask, &open->op_iattr,
700 &open->op_acl);
701 if (status)
702 goto out;
703 break;
662 default: 704 default:
663 goto xdr_error; 705 goto xdr_error;
664 } 706 }
@@ -851,7 +893,7 @@ nfsd4_decode_setattr(struct nfsd4_compoundargs *argp, struct nfsd4_setattr *seta
851 status = nfsd4_decode_stateid(argp, &setattr->sa_stateid); 893 status = nfsd4_decode_stateid(argp, &setattr->sa_stateid);
852 if (status) 894 if (status)
853 return status; 895 return status;
854 return nfsd4_decode_fattr(argp, setattr->sa_bmval, 896 return nfsd4_decode_fattr(argp, setattr->sa_bmval, nfsd_attrmask,
855 &setattr->sa_iattr, &setattr->sa_acl); 897 &setattr->sa_iattr, &setattr->sa_acl);
856} 898}
857 899
@@ -993,6 +1035,241 @@ nfsd4_decode_release_lockowner(struct nfsd4_compoundargs *argp, struct nfsd4_rel
993 READ_BUF(rlockowner->rl_owner.len); 1035 READ_BUF(rlockowner->rl_owner.len);
994 READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len); 1036 READMEM(rlockowner->rl_owner.data, rlockowner->rl_owner.len);
995 1037
1038 if (argp->minorversion && !zero_clientid(&rlockowner->rl_clientid))
1039 return nfserr_inval;
1040 DECODE_TAIL;
1041}
1042
1043static __be32
1044nfsd4_decode_exchange_id(struct nfsd4_compoundargs *argp,
1045 struct nfsd4_exchange_id *exid)
1046{
1047 int dummy;
1048 DECODE_HEAD;
1049
1050 READ_BUF(NFS4_VERIFIER_SIZE);
1051 COPYMEM(exid->verifier.data, NFS4_VERIFIER_SIZE);
1052
1053 READ_BUF(4);
1054 READ32(exid->clname.len);
1055
1056 READ_BUF(exid->clname.len);
1057 SAVEMEM(exid->clname.data, exid->clname.len);
1058
1059 READ_BUF(4);
1060 READ32(exid->flags);
1061
1062 /* Ignore state_protect4_a */
1063 READ_BUF(4);
1064 READ32(exid->spa_how);
1065 switch (exid->spa_how) {
1066 case SP4_NONE:
1067 break;
1068 case SP4_MACH_CRED:
1069 /* spo_must_enforce */
1070 READ_BUF(4);
1071 READ32(dummy);
1072 READ_BUF(dummy * 4);
1073 p += dummy;
1074
1075 /* spo_must_allow */
1076 READ_BUF(4);
1077 READ32(dummy);
1078 READ_BUF(dummy * 4);
1079 p += dummy;
1080 break;
1081 case SP4_SSV:
1082 /* ssp_ops */
1083 READ_BUF(4);
1084 READ32(dummy);
1085 READ_BUF(dummy * 4);
1086 p += dummy;
1087
1088 READ_BUF(4);
1089 READ32(dummy);
1090 READ_BUF(dummy * 4);
1091 p += dummy;
1092
1093 /* ssp_hash_algs<> */
1094 READ_BUF(4);
1095 READ32(dummy);
1096 READ_BUF(dummy);
1097 p += XDR_QUADLEN(dummy);
1098
1099 /* ssp_encr_algs<> */
1100 READ_BUF(4);
1101 READ32(dummy);
1102 READ_BUF(dummy);
1103 p += XDR_QUADLEN(dummy);
1104
1105 /* ssp_window and ssp_num_gss_handles */
1106 READ_BUF(8);
1107 READ32(dummy);
1108 READ32(dummy);
1109 break;
1110 default:
1111 goto xdr_error;
1112 }
1113
1114 /* Ignore Implementation ID */
1115 READ_BUF(4); /* nfs_impl_id4 array length */
1116 READ32(dummy);
1117
1118 if (dummy > 1)
1119 goto xdr_error;
1120
1121 if (dummy == 1) {
1122 /* nii_domain */
1123 READ_BUF(4);
1124 READ32(dummy);
1125 READ_BUF(dummy);
1126 p += XDR_QUADLEN(dummy);
1127
1128 /* nii_name */
1129 READ_BUF(4);
1130 READ32(dummy);
1131 READ_BUF(dummy);
1132 p += XDR_QUADLEN(dummy);
1133
1134 /* nii_date */
1135 READ_BUF(12);
1136 p += 3;
1137 }
1138 DECODE_TAIL;
1139}
1140
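The p += XDR_QUADLEN(dummy) idiom above skips a variable-length opaque field: XDR pads opaques to a 4-byte boundary and p walks the stream in 4-byte words. The macro is plain rounding arithmetic (this matches the sunrpc definition of XDR_QUADLEN):

    /* number of 4-byte XDR words needed to hold l bytes, rounded up */
    #define XDR_QUADLEN(l) (((l) + 3) >> 2)

So a 9-byte nii_domain, for instance, occupies XDR_QUADLEN(9) = 3 words, i.e. 12 bytes on the wire.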
1141static __be32
1142nfsd4_decode_create_session(struct nfsd4_compoundargs *argp,
1143 struct nfsd4_create_session *sess)
1144{
1145 DECODE_HEAD;
1146
1147 u32 dummy;
1148 char *machine_name;
1149 int i;
1150 int nr_secflavs;
1151
1152 READ_BUF(16);
1153 COPYMEM(&sess->clientid, 8);
1154 READ32(sess->seqid);
1155 READ32(sess->flags);
1156
1157 /* Fore channel attrs */
1158 READ_BUF(28);
1159 READ32(dummy); /* headerpadsz is always 0 */
1160 READ32(sess->fore_channel.maxreq_sz);
1161 READ32(sess->fore_channel.maxresp_sz);
1162 READ32(sess->fore_channel.maxresp_cached);
1163 READ32(sess->fore_channel.maxops);
1164 READ32(sess->fore_channel.maxreqs);
1165 READ32(sess->fore_channel.nr_rdma_attrs);
1166 if (sess->fore_channel.nr_rdma_attrs == 1) {
1167 READ_BUF(4);
1168 READ32(sess->fore_channel.rdma_attrs);
1169 } else if (sess->fore_channel.nr_rdma_attrs > 1) {
1170 dprintk("Too many fore channel attr bitmaps!\n");
1171 goto xdr_error;
1172 }
1173
1174 /* Back channel attrs */
1175 READ_BUF(28);
1176 READ32(dummy); /* headerpadsz is always 0 */
1177 READ32(sess->back_channel.maxreq_sz);
1178 READ32(sess->back_channel.maxresp_sz);
1179 READ32(sess->back_channel.maxresp_cached);
1180 READ32(sess->back_channel.maxops);
1181 READ32(sess->back_channel.maxreqs);
1182 READ32(sess->back_channel.nr_rdma_attrs);
1183 if (sess->back_channel.nr_rdma_attrs == 1) {
1184 READ_BUF(4);
1185 READ32(sess->back_channel.rdma_attrs);
1186 } else if (sess->back_channel.nr_rdma_attrs > 1) {
1187 dprintk("Too many back channel attr bitmaps!\n");
1188 goto xdr_error;
1189 }
1190
1191 READ_BUF(8);
1192 READ32(sess->callback_prog);
1193
1194 /* callback_sec_params4 */
1195 READ32(nr_secflavs);
1196 for (i = 0; i < nr_secflavs; ++i) {
1197 READ_BUF(4);
1198 READ32(dummy);
1199 switch (dummy) {
1200 case RPC_AUTH_NULL:
1201 /* Nothing to read */
1202 break;
1203 case RPC_AUTH_UNIX:
1204 READ_BUF(8);
1205 /* stamp */
1206 READ32(dummy);
1207
1208 /* machine name */
1209 READ32(dummy);
1210 READ_BUF(dummy);
1211 SAVEMEM(machine_name, dummy);
1212
1213 /* uid, gid */
1214 READ_BUF(8);
1215 READ32(sess->uid);
1216 READ32(sess->gid);
1217
1218 /* more gids */
1219 READ_BUF(4);
1220 READ32(dummy);
1221 READ_BUF(dummy * 4);
 1222 /* skip the supplementary gids */
 1223 p += dummy;
1224 break;
1225 case RPC_AUTH_GSS:
1226 dprintk("RPC_AUTH_GSS callback secflavor "
1227 "not supported!\n");
1228 READ_BUF(8);
1229 /* gcbp_service */
1230 READ32(dummy);
1231 /* gcbp_handle_from_server */
1232 READ32(dummy);
1233 READ_BUF(dummy);
1234 p += XDR_QUADLEN(dummy);
1235 /* gcbp_handle_from_client */
1236 READ_BUF(4);
1237 READ32(dummy);
1238 READ_BUF(dummy);
1239 p += XDR_QUADLEN(dummy);
1240 break;
1241 default:
1242 dprintk("Illegal callback secflavor\n");
1243 return nfserr_inval;
1244 }
1245 }
1246 DECODE_TAIL;
1247}
1248
1249static __be32
1250nfsd4_decode_destroy_session(struct nfsd4_compoundargs *argp,
1251 struct nfsd4_destroy_session *destroy_session)
1252{
1253 DECODE_HEAD;
1254 READ_BUF(NFS4_MAX_SESSIONID_LEN);
1255 COPYMEM(destroy_session->sessionid.data, NFS4_MAX_SESSIONID_LEN);
1256
1257 DECODE_TAIL;
1258}
1259
1260static __be32
1261nfsd4_decode_sequence(struct nfsd4_compoundargs *argp,
1262 struct nfsd4_sequence *seq)
1263{
1264 DECODE_HEAD;
1265
1266 READ_BUF(NFS4_MAX_SESSIONID_LEN + 16);
1267 COPYMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
1268 READ32(seq->seqid);
1269 READ32(seq->slotid);
1270 READ32(seq->maxslots);
1271 READ32(seq->cachethis);
1272
996 DECODE_TAIL; 1273 DECODE_TAIL;
997} 1274}
998 1275
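The fixed-size read above checks out by hand: a sessionid is NFS4_MAX_SESSIONID_LEN (16) bytes, and the four u32 fields (seqid, slotid, maxslots, cachethis) add another 16, so the entire SEQUENCE argument arrives in one 32-byte READ_BUF.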
@@ -1005,7 +1282,7 @@ nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p)
1005static __be32 1282static __be32
1006nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p) 1283nfsd4_decode_notsupp(struct nfsd4_compoundargs *argp, void *p)
1007{ 1284{
1008 return nfserr_opnotsupp; 1285 return nfserr_notsupp;
1009} 1286}
1010 1287
1011typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *); 1288typedef __be32(*nfsd4_dec)(struct nfsd4_compoundargs *argp, void *);
@@ -1031,7 +1308,7 @@ static nfsd4_dec nfsd4_dec_ops[] = {
1031 [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm, 1308 [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_open_confirm,
1032 [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade, 1309 [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade,
1033 [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh, 1310 [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh,
1034 [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_notsupp, 1311 [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_noop,
1035 [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop, 1312 [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop,
1036 [OP_READ] = (nfsd4_dec)nfsd4_decode_read, 1313 [OP_READ] = (nfsd4_dec)nfsd4_decode_read,
1037 [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir, 1314 [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir,
@@ -1050,6 +1327,67 @@ static nfsd4_dec nfsd4_dec_ops[] = {
1050 [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner, 1327 [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_release_lockowner,
1051}; 1328};
1052 1329
1330static nfsd4_dec nfsd41_dec_ops[] = {
 1331 [OP_ACCESS] = (nfsd4_dec)nfsd4_decode_access,
 1332 [OP_CLOSE] = (nfsd4_dec)nfsd4_decode_close,
 1333 [OP_COMMIT] = (nfsd4_dec)nfsd4_decode_commit,
 1334 [OP_CREATE] = (nfsd4_dec)nfsd4_decode_create,
 1335 [OP_DELEGPURGE] = (nfsd4_dec)nfsd4_decode_notsupp,
 1336 [OP_DELEGRETURN] = (nfsd4_dec)nfsd4_decode_delegreturn,
 1337 [OP_GETATTR] = (nfsd4_dec)nfsd4_decode_getattr,
 1338 [OP_GETFH] = (nfsd4_dec)nfsd4_decode_noop,
 1339 [OP_LINK] = (nfsd4_dec)nfsd4_decode_link,
 1340 [OP_LOCK] = (nfsd4_dec)nfsd4_decode_lock,
 1341 [OP_LOCKT] = (nfsd4_dec)nfsd4_decode_lockt,
 1342 [OP_LOCKU] = (nfsd4_dec)nfsd4_decode_locku,
 1343 [OP_LOOKUP] = (nfsd4_dec)nfsd4_decode_lookup,
 1344 [OP_LOOKUPP] = (nfsd4_dec)nfsd4_decode_noop,
 1345 [OP_NVERIFY] = (nfsd4_dec)nfsd4_decode_verify,
 1346 [OP_OPEN] = (nfsd4_dec)nfsd4_decode_open,
 1347 [OP_OPENATTR] = (nfsd4_dec)nfsd4_decode_notsupp,
 1348 [OP_OPEN_CONFIRM] = (nfsd4_dec)nfsd4_decode_notsupp,
 1349 [OP_OPEN_DOWNGRADE] = (nfsd4_dec)nfsd4_decode_open_downgrade,
 1350 [OP_PUTFH] = (nfsd4_dec)nfsd4_decode_putfh,
 1351 [OP_PUTPUBFH] = (nfsd4_dec)nfsd4_decode_notsupp,
 1352 [OP_PUTROOTFH] = (nfsd4_dec)nfsd4_decode_noop,
 1353 [OP_READ] = (nfsd4_dec)nfsd4_decode_read,
 1354 [OP_READDIR] = (nfsd4_dec)nfsd4_decode_readdir,
 1355 [OP_READLINK] = (nfsd4_dec)nfsd4_decode_noop,
 1356 [OP_REMOVE] = (nfsd4_dec)nfsd4_decode_remove,
 1357 [OP_RENAME] = (nfsd4_dec)nfsd4_decode_rename,
 1358 [OP_RENEW] = (nfsd4_dec)nfsd4_decode_notsupp,
 1359 [OP_RESTOREFH] = (nfsd4_dec)nfsd4_decode_noop,
 1360 [OP_SAVEFH] = (nfsd4_dec)nfsd4_decode_noop,
 1361 [OP_SECINFO] = (nfsd4_dec)nfsd4_decode_secinfo,
 1362 [OP_SETATTR] = (nfsd4_dec)nfsd4_decode_setattr,
 1363 [OP_SETCLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp,
 1364 [OP_SETCLIENTID_CONFIRM] = (nfsd4_dec)nfsd4_decode_notsupp,
 1365 [OP_VERIFY] = (nfsd4_dec)nfsd4_decode_verify,
 1366 [OP_WRITE] = (nfsd4_dec)nfsd4_decode_write,
 1367 [OP_RELEASE_LOCKOWNER] = (nfsd4_dec)nfsd4_decode_notsupp,
1368
1369 /* new operations for NFSv4.1 */
 1370 [OP_BACKCHANNEL_CTL] = (nfsd4_dec)nfsd4_decode_notsupp,
 1371 [OP_BIND_CONN_TO_SESSION] = (nfsd4_dec)nfsd4_decode_notsupp,
 1372 [OP_EXCHANGE_ID] = (nfsd4_dec)nfsd4_decode_exchange_id,
 1373 [OP_CREATE_SESSION] = (nfsd4_dec)nfsd4_decode_create_session,
 1374 [OP_DESTROY_SESSION] = (nfsd4_dec)nfsd4_decode_destroy_session,
 1375 [OP_FREE_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp,
 1376 [OP_GET_DIR_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
 1377 [OP_GETDEVICEINFO] = (nfsd4_dec)nfsd4_decode_notsupp,
 1378 [OP_GETDEVICELIST] = (nfsd4_dec)nfsd4_decode_notsupp,
 1379 [OP_LAYOUTCOMMIT] = (nfsd4_dec)nfsd4_decode_notsupp,
 1380 [OP_LAYOUTGET] = (nfsd4_dec)nfsd4_decode_notsupp,
 1381 [OP_LAYOUTRETURN] = (nfsd4_dec)nfsd4_decode_notsupp,
 1382 [OP_SECINFO_NO_NAME] = (nfsd4_dec)nfsd4_decode_notsupp,
 1383 [OP_SEQUENCE] = (nfsd4_dec)nfsd4_decode_sequence,
 1384 [OP_SET_SSV] = (nfsd4_dec)nfsd4_decode_notsupp,
 1385 [OP_TEST_STATEID] = (nfsd4_dec)nfsd4_decode_notsupp,
 1386 [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp,
 1387 [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_notsupp,
 1388 [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_notsupp,
1389};
1390
1053struct nfsd4_minorversion_ops { 1391struct nfsd4_minorversion_ops {
1054 nfsd4_dec *decoders; 1392 nfsd4_dec *decoders;
1055 int nops; 1393 int nops;
@@ -1057,6 +1395,7 @@ struct nfsd4_minorversion_ops {
1057 1395
1058static struct nfsd4_minorversion_ops nfsd4_minorversion[] = { 1396static struct nfsd4_minorversion_ops nfsd4_minorversion[] = {
1059 [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) }, 1397 [0] = { nfsd4_dec_ops, ARRAY_SIZE(nfsd4_dec_ops) },
1398 [1] = { nfsd41_dec_ops, ARRAY_SIZE(nfsd41_dec_ops) },
1060}; 1399};
1061 1400
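A sketch of how a per-minorversion table like the one above is typically consumed; the names here are illustrative rather than the kernel's own (the real dispatch lives in the compound decoder):

    #include <stddef.h>
    #include <stdint.h>

    typedef int (*dec_fn)(void *argp, void *op_args);

    struct minorversion_ops {
        dec_fn *decoders;
        size_t  nops;
    };

    /* pick the decoder for one op, or NULL for an unknown opcode;
     * minorversion is assumed already validated against the table size */
    static dec_fn lookup_decoder(const struct minorversion_ops *table,
                                 uint32_t minorversion, uint32_t opnum)
    {
        const struct minorversion_ops *ops = &table[minorversion];

        if (opnum >= ops->nops || !ops->decoders[opnum])
            return NULL;    /* caller falls back to OP_ILLEGAL handling */
        return ops->decoders[opnum];
    }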
1062static __be32 1401static __be32
@@ -1412,6 +1751,7 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1412{ 1751{
1413 u32 bmval0 = bmval[0]; 1752 u32 bmval0 = bmval[0];
1414 u32 bmval1 = bmval[1]; 1753 u32 bmval1 = bmval[1];
1754 u32 bmval2 = bmval[2];
1415 struct kstat stat; 1755 struct kstat stat;
1416 struct svc_fh tempfh; 1756 struct svc_fh tempfh;
1417 struct kstatfs statfs; 1757 struct kstatfs statfs;
@@ -1425,12 +1765,16 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1425 int err; 1765 int err;
1426 int aclsupport = 0; 1766 int aclsupport = 0;
1427 struct nfs4_acl *acl = NULL; 1767 struct nfs4_acl *acl = NULL;
1768 struct nfsd4_compoundres *resp = rqstp->rq_resp;
1769 u32 minorversion = resp->cstate.minorversion;
1428 1770
1429 BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1); 1771 BUG_ON(bmval1 & NFSD_WRITEONLY_ATTRS_WORD1);
1430 BUG_ON(bmval0 & ~NFSD_SUPPORTED_ATTRS_WORD0); 1772 BUG_ON(bmval0 & ~nfsd_suppattrs0(minorversion));
1431 BUG_ON(bmval1 & ~NFSD_SUPPORTED_ATTRS_WORD1); 1773 BUG_ON(bmval1 & ~nfsd_suppattrs1(minorversion));
1774 BUG_ON(bmval2 & ~nfsd_suppattrs2(minorversion));
1432 1775
1433 if (exp->ex_fslocs.migrated) { 1776 if (exp->ex_fslocs.migrated) {
1777 BUG_ON(bmval[2]);
1434 status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err); 1778 status = fattr_handle_absent_fs(&bmval0, &bmval1, &rdattr_err);
1435 if (status) 1779 if (status)
1436 goto out; 1780 goto out;
@@ -1476,22 +1820,42 @@ nfsd4_encode_fattr(struct svc_fh *fhp, struct svc_export *exp,
1476 if ((buflen -= 16) < 0) 1820 if ((buflen -= 16) < 0)
1477 goto out_resource; 1821 goto out_resource;
1478 1822
1479 WRITE32(2); 1823 if (unlikely(bmval2)) {
1480 WRITE32(bmval0); 1824 WRITE32(3);
1481 WRITE32(bmval1); 1825 WRITE32(bmval0);
1826 WRITE32(bmval1);
1827 WRITE32(bmval2);
1828 } else if (likely(bmval1)) {
1829 WRITE32(2);
1830 WRITE32(bmval0);
1831 WRITE32(bmval1);
1832 } else {
1833 WRITE32(1);
1834 WRITE32(bmval0);
1835 }
1482 attrlenp = p++; /* to be backfilled later */ 1836 attrlenp = p++; /* to be backfilled later */
1483 1837
1484 if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) { 1838 if (bmval0 & FATTR4_WORD0_SUPPORTED_ATTRS) {
1485 u32 word0 = NFSD_SUPPORTED_ATTRS_WORD0; 1839 u32 word0 = nfsd_suppattrs0(minorversion);
1840 u32 word1 = nfsd_suppattrs1(minorversion);
1841 u32 word2 = nfsd_suppattrs2(minorversion);
1842
1486 if ((buflen -= 12) < 0) 1843 if ((buflen -= 12) < 0)
1487 goto out_resource; 1844 goto out_resource;
1488 if (!aclsupport) 1845 if (!aclsupport)
1489 word0 &= ~FATTR4_WORD0_ACL; 1846 word0 &= ~FATTR4_WORD0_ACL;
1490 if (!exp->ex_fslocs.locations) 1847 if (!exp->ex_fslocs.locations)
1491 word0 &= ~FATTR4_WORD0_FS_LOCATIONS; 1848 word0 &= ~FATTR4_WORD0_FS_LOCATIONS;
1492 WRITE32(2); 1849 if (!word2) {
1493 WRITE32(word0); 1850 WRITE32(2);
1494 WRITE32(NFSD_SUPPORTED_ATTRS_WORD1); 1851 WRITE32(word0);
1852 WRITE32(word1);
1853 } else {
1854 WRITE32(3);
1855 WRITE32(word0);
1856 WRITE32(word1);
1857 WRITE32(word2);
1858 }
1495 } 1859 }
1496 if (bmval0 & FATTR4_WORD0_TYPE) { 1860 if (bmval0 & FATTR4_WORD0_TYPE) {
1497 if ((buflen -= 4) < 0) 1861 if ((buflen -= 4) < 0)
@@ -1801,6 +2165,13 @@ out_acl:
1801 } 2165 }
1802 WRITE64(stat.ino); 2166 WRITE64(stat.ino);
1803 } 2167 }
2168 if (bmval2 & FATTR4_WORD2_SUPPATTR_EXCLCREAT) {
2169 WRITE32(3);
2170 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD0);
2171 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD1);
2172 WRITE32(NFSD_SUPPATTR_EXCLCREAT_WORD2);
2173 }
2174
1804 *attrlenp = htonl((char *)p - (char *)attrlenp - 4); 2175 *attrlenp = htonl((char *)p - (char *)attrlenp - 4);
1805 *countp = p - buffer; 2176 *countp = p - buffer;
1806 status = nfs_ok; 2177 status = nfs_ok;
@@ -2572,6 +2943,143 @@ nfsd4_encode_write(struct nfsd4_compoundres *resp, __be32 nfserr, struct nfsd4_w
2572} 2943}
2573 2944
2574static __be32 2945static __be32
2946nfsd4_encode_exchange_id(struct nfsd4_compoundres *resp, int nfserr,
2947 struct nfsd4_exchange_id *exid)
2948{
2949 ENCODE_HEAD;
2950 char *major_id;
2951 char *server_scope;
2952 int major_id_sz;
2953 int server_scope_sz;
2954 uint64_t minor_id = 0;
2955
2956 if (nfserr)
2957 return nfserr;
2958
2959 major_id = utsname()->nodename;
2960 major_id_sz = strlen(major_id);
2961 server_scope = utsname()->nodename;
2962 server_scope_sz = strlen(server_scope);
2963
2964 RESERVE_SPACE(
2965 8 /* eir_clientid */ +
2966 4 /* eir_sequenceid */ +
2967 4 /* eir_flags */ +
2968 4 /* spr_how (SP4_NONE) */ +
2969 8 /* so_minor_id */ +
2970 4 /* so_major_id.len */ +
2971 (XDR_QUADLEN(major_id_sz) * 4) +
2972 4 /* eir_server_scope.len */ +
2973 (XDR_QUADLEN(server_scope_sz) * 4) +
2974 4 /* eir_server_impl_id.count (0) */);
2975
2976 WRITEMEM(&exid->clientid, 8);
2977 WRITE32(exid->seqid);
2978 WRITE32(exid->flags);
2979
2980 /* state_protect4_r. Currently only support SP4_NONE */
2981 BUG_ON(exid->spa_how != SP4_NONE);
2982 WRITE32(exid->spa_how);
2983
2984 /* The server_owner struct */
2985 WRITE64(minor_id); /* Minor id */
2986 /* major id */
2987 WRITE32(major_id_sz);
2988 WRITEMEM(major_id, major_id_sz);
2989
2990 /* Server scope */
2991 WRITE32(server_scope_sz);
2992 WRITEMEM(server_scope, server_scope_sz);
2993
2994 /* Implementation id */
2995 WRITE32(0); /* zero length nfs_impl_id4 array */
2996 ADJUST_ARGS();
2997 return 0;
2998}
2999
3000static __be32
3001nfsd4_encode_create_session(struct nfsd4_compoundres *resp, int nfserr,
3002 struct nfsd4_create_session *sess)
3003{
3004 ENCODE_HEAD;
3005
3006 if (nfserr)
3007 return nfserr;
3008
3009 RESERVE_SPACE(24);
3010 WRITEMEM(sess->sessionid.data, NFS4_MAX_SESSIONID_LEN);
3011 WRITE32(sess->seqid);
3012 WRITE32(sess->flags);
3013 ADJUST_ARGS();
3014
3015 RESERVE_SPACE(28);
3016 WRITE32(0); /* headerpadsz */
3017 WRITE32(sess->fore_channel.maxreq_sz);
3018 WRITE32(sess->fore_channel.maxresp_sz);
3019 WRITE32(sess->fore_channel.maxresp_cached);
3020 WRITE32(sess->fore_channel.maxops);
3021 WRITE32(sess->fore_channel.maxreqs);
3022 WRITE32(sess->fore_channel.nr_rdma_attrs);
3023 ADJUST_ARGS();
3024
3025 if (sess->fore_channel.nr_rdma_attrs) {
3026 RESERVE_SPACE(4);
3027 WRITE32(sess->fore_channel.rdma_attrs);
3028 ADJUST_ARGS();
3029 }
3030
3031 RESERVE_SPACE(28);
3032 WRITE32(0); /* headerpadsz */
3033 WRITE32(sess->back_channel.maxreq_sz);
3034 WRITE32(sess->back_channel.maxresp_sz);
3035 WRITE32(sess->back_channel.maxresp_cached);
3036 WRITE32(sess->back_channel.maxops);
3037 WRITE32(sess->back_channel.maxreqs);
3038 WRITE32(sess->back_channel.nr_rdma_attrs);
3039 ADJUST_ARGS();
3040
3041 if (sess->back_channel.nr_rdma_attrs) {
3042 RESERVE_SPACE(4);
3043 WRITE32(sess->back_channel.rdma_attrs);
3044 ADJUST_ARGS();
3045 }
3046 return 0;
3047}
3048
3049static __be32
3050nfsd4_encode_destroy_session(struct nfsd4_compoundres *resp, int nfserr,
3051 struct nfsd4_destroy_session *destroy_session)
3052{
3053 return nfserr;
3054}
3055
3056__be32
3057nfsd4_encode_sequence(struct nfsd4_compoundres *resp, int nfserr,
3058 struct nfsd4_sequence *seq)
3059{
3060 ENCODE_HEAD;
3061
3062 if (nfserr)
3063 return nfserr;
3064
3065 RESERVE_SPACE(NFS4_MAX_SESSIONID_LEN + 20);
3066 WRITEMEM(seq->sessionid.data, NFS4_MAX_SESSIONID_LEN);
3067 WRITE32(seq->seqid);
3068 WRITE32(seq->slotid);
3069 WRITE32(seq->maxslots);
3070 /*
3071 * FIXME: for now:
3072 * target_maxslots = maxslots
3073 * status_flags = 0
3074 */
3075 WRITE32(seq->maxslots);
3076 WRITE32(0);
3077
3078 ADJUST_ARGS();
3079 return 0;
3080}
3081
3082static __be32
2575nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) 3083nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
2576{ 3084{
2577 return nfserr; 3085 return nfserr;
@@ -2579,6 +3087,11 @@ nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p)
2579 3087
2580typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *); 3088typedef __be32(* nfsd4_enc)(struct nfsd4_compoundres *, __be32, void *);
2581 3089
3090/*
3091 * Note: nfsd4_enc_ops vector is shared for v4.0 and v4.1
3092 * since we don't need to filter out obsolete ops as this is
3093 * done in the decoding phase.
3094 */
2582static nfsd4_enc nfsd4_enc_ops[] = { 3095static nfsd4_enc nfsd4_enc_ops[] = {
2583 [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access, 3096 [OP_ACCESS] = (nfsd4_enc)nfsd4_encode_access,
2584 [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close, 3097 [OP_CLOSE] = (nfsd4_enc)nfsd4_encode_close,
@@ -2617,8 +3130,77 @@ static nfsd4_enc nfsd4_enc_ops[] = {
2617 [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop, 3130 [OP_VERIFY] = (nfsd4_enc)nfsd4_encode_noop,
2618 [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write, 3131 [OP_WRITE] = (nfsd4_enc)nfsd4_encode_write,
2619 [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop, 3132 [OP_RELEASE_LOCKOWNER] = (nfsd4_enc)nfsd4_encode_noop,
3133
3134 /* NFSv4.1 operations */
3135 [OP_BACKCHANNEL_CTL] = (nfsd4_enc)nfsd4_encode_noop,
3136 [OP_BIND_CONN_TO_SESSION] = (nfsd4_enc)nfsd4_encode_noop,
3137 [OP_EXCHANGE_ID] = (nfsd4_enc)nfsd4_encode_exchange_id,
3138 [OP_CREATE_SESSION] = (nfsd4_enc)nfsd4_encode_create_session,
3139 [OP_DESTROY_SESSION] = (nfsd4_enc)nfsd4_encode_destroy_session,
3140 [OP_FREE_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
3141 [OP_GET_DIR_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
3142 [OP_GETDEVICEINFO] = (nfsd4_enc)nfsd4_encode_noop,
3143 [OP_GETDEVICELIST] = (nfsd4_enc)nfsd4_encode_noop,
3144 [OP_LAYOUTCOMMIT] = (nfsd4_enc)nfsd4_encode_noop,
3145 [OP_LAYOUTGET] = (nfsd4_enc)nfsd4_encode_noop,
3146 [OP_LAYOUTRETURN] = (nfsd4_enc)nfsd4_encode_noop,
3147 [OP_SECINFO_NO_NAME] = (nfsd4_enc)nfsd4_encode_noop,
3148 [OP_SEQUENCE] = (nfsd4_enc)nfsd4_encode_sequence,
3149 [OP_SET_SSV] = (nfsd4_enc)nfsd4_encode_noop,
3150 [OP_TEST_STATEID] = (nfsd4_enc)nfsd4_encode_noop,
3151 [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop,
3152 [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop,
3153 [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop,
2620}; 3154};
2621 3155
3156/*
3157 * Calculate the total amount of memory that the compound response has taken
3158 * after encoding the current operation.
3159 *
3160 * pad: add on 8 bytes for the next operation's op_code and status so that
3161 * there is room to cache a failure on the next operation.
3162 *
3163 * Compare this length to the session se_fmaxresp_cached.
3164 *
3165 * Our se_fmaxresp_cached will always be a multiple of PAGE_SIZE, and so
3166 * will be at least a page and will therefore hold the xdr_buf head.
3167 */
3168static int nfsd4_check_drc_limit(struct nfsd4_compoundres *resp)
3169{
3170 int status = 0;
3171 struct xdr_buf *xb = &resp->rqstp->rq_res;
3172 struct nfsd4_compoundargs *args = resp->rqstp->rq_argp;
3173 struct nfsd4_session *session = NULL;
3174 struct nfsd4_slot *slot = resp->cstate.slot;
3175 u32 length, tlen = 0, pad = 8;
3176
3177 if (!nfsd4_has_session(&resp->cstate))
3178 return status;
3179
3180 session = resp->cstate.session;
3181 if (session == NULL || slot->sl_cache_entry.ce_cachethis == 0)
3182 return status;
3183
3184 if (resp->opcnt >= args->opcnt)
3185 pad = 0; /* this is the last operation */
3186
3187 if (xb->page_len == 0) {
3188 length = (char *)resp->p - (char *)xb->head[0].iov_base + pad;
3189 } else {
3190 if (xb->tail[0].iov_base && xb->tail[0].iov_len > 0)
3191 tlen = (char *)resp->p - (char *)xb->tail[0].iov_base;
3192
3193 length = xb->head[0].iov_len + xb->page_len + tlen + pad;
3194 }
3195 dprintk("%s length %u, xb->page_len %u tlen %u pad %u\n", __func__,
3196 length, xb->page_len, tlen, pad);
3197
3198 if (length <= session->se_fmaxresp_cached)
3199 return status;
3200 else
3201 return nfserr_rep_too_big_to_cache;
3202}
3203
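Reading the arithmetic concretely: while the response still fits in the head iovec, length is simply the bytes encoded so far plus the 8-byte pad; once page data is attached it becomes head length + page_len + used tail bytes + pad. With a 1024-byte head, 4096 bytes of pages, a 200-byte tail and the 8-byte pad, length is 5328, which is cacheable only if the session negotiated an se_fmaxresp_cached of at least that.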
2622void 3204void
2623nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op) 3205nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
2624{ 3206{
@@ -2635,6 +3217,9 @@ nfsd4_encode_operation(struct nfsd4_compoundres *resp, struct nfsd4_op *op)
2635 BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) || 3217 BUG_ON(op->opnum < 0 || op->opnum >= ARRAY_SIZE(nfsd4_enc_ops) ||
2636 !nfsd4_enc_ops[op->opnum]); 3218 !nfsd4_enc_ops[op->opnum]);
2637 op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u); 3219 op->status = nfsd4_enc_ops[op->opnum](resp, op->status, &op->u);
3220 /* nfsd4_check_drc_limit guarantees enough room for error status */
3221 if (!op->status && nfsd4_check_drc_limit(resp))
3222 op->status = nfserr_rep_too_big_to_cache;
2638status: 3223status:
2639 /* 3224 /*
2640 * Note: We write the status directly, instead of using WRITE32(), 3225 * Note: We write the status directly, instead of using WRITE32(),
@@ -2735,6 +3320,18 @@ nfs4svc_encode_compoundres(struct svc_rqst *rqstp, __be32 *p, struct nfsd4_compo
2735 iov = &rqstp->rq_res.head[0]; 3320 iov = &rqstp->rq_res.head[0];
2736 iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base; 3321 iov->iov_len = ((char*)resp->p) - (char*)iov->iov_base;
2737 BUG_ON(iov->iov_len > PAGE_SIZE); 3322 BUG_ON(iov->iov_len > PAGE_SIZE);
3323 if (nfsd4_has_session(&resp->cstate)) {
3324 if (resp->cstate.status == nfserr_replay_cache &&
3325 !nfsd4_not_cached(resp)) {
3326 iov->iov_len = resp->cstate.iovlen;
3327 } else {
3328 nfsd4_store_cache_entry(resp);
3329 dprintk("%s: SET SLOT STATE TO AVAILABLE\n", __func__);
3330 resp->cstate.slot->sl_inuse = 0;
3331 }
3332 if (resp->cstate.session)
3333 nfsd4_put_session(resp->cstate.session);
3334 }
2738 return 1; 3335 return 1;
2739} 3336}
2740 3337
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index a4ed8644d69c..af16849d243a 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -60,6 +60,7 @@ enum {
60 NFSD_FO_UnlockFS, 60 NFSD_FO_UnlockFS,
61 NFSD_Threads, 61 NFSD_Threads,
62 NFSD_Pool_Threads, 62 NFSD_Pool_Threads,
63 NFSD_Pool_Stats,
63 NFSD_Versions, 64 NFSD_Versions,
64 NFSD_Ports, 65 NFSD_Ports,
65 NFSD_MaxBlkSize, 66 NFSD_MaxBlkSize,
@@ -172,6 +173,16 @@ static const struct file_operations exports_operations = {
172 .owner = THIS_MODULE, 173 .owner = THIS_MODULE,
173}; 174};
174 175
176extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
177
178static struct file_operations pool_stats_operations = {
179 .open = nfsd_pool_stats_open,
180 .read = seq_read,
181 .llseek = seq_lseek,
182 .release = seq_release,
183 .owner = THIS_MODULE,
184};
185
175/*----------------------------------------------------------------------------*/ 186/*----------------------------------------------------------------------------*/
176/* 187/*
177 * payload - write methods 188 * payload - write methods
@@ -781,8 +792,9 @@ out_free:
781static ssize_t __write_versions(struct file *file, char *buf, size_t size) 792static ssize_t __write_versions(struct file *file, char *buf, size_t size)
782{ 793{
783 char *mesg = buf; 794 char *mesg = buf;
784 char *vers, sign; 795 char *vers, *minorp, sign;
785 int len, num; 796 int len, num;
797 unsigned minor;
786 ssize_t tlen = 0; 798 ssize_t tlen = 0;
787 char *sep; 799 char *sep;
788 800
@@ -803,9 +815,20 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
803 do { 815 do {
804 sign = *vers; 816 sign = *vers;
805 if (sign == '+' || sign == '-') 817 if (sign == '+' || sign == '-')
806 num = simple_strtol((vers+1), NULL, 0); 818 num = simple_strtol((vers+1), &minorp, 0);
807 else 819 else
808 num = simple_strtol(vers, NULL, 0); 820 num = simple_strtol(vers, &minorp, 0);
821 if (*minorp == '.') {
822 if (num < 4)
823 return -EINVAL;
824 minor = simple_strtoul(minorp+1, NULL, 0);
825 if (minor == 0)
826 return -EINVAL;
827 if (nfsd_minorversion(minor, sign == '-' ?
828 NFSD_CLEAR : NFSD_SET) < 0)
829 return -EINVAL;
830 goto next;
831 }
809 switch(num) { 832 switch(num) {
810 case 2: 833 case 2:
811 case 3: 834 case 3:
@@ -815,6 +838,7 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
815 default: 838 default:
816 return -EINVAL; 839 return -EINVAL;
817 } 840 }
841 next:
818 vers += len + 1; 842 vers += len + 1;
819 tlen += len; 843 tlen += len;
820 } while ((len = qword_get(&mesg, vers, size)) > 0); 844 } while ((len = qword_get(&mesg, vers, size)) > 0);
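In practice this lets an administrator toggle minorversions through the same interface as major versions, for example by writing "+4.1" or "-4.1" to the versions file in the nfsd filesystem (conventionally mounted at /proc/fs/nfsd); a plain "+4" or "-4" still takes the numeric switch above.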
@@ -833,6 +857,13 @@ static ssize_t __write_versions(struct file *file, char *buf, size_t size)
833 num); 857 num);
834 sep = " "; 858 sep = " ";
835 } 859 }
860 if (nfsd_vers(4, NFSD_AVAIL))
861 for (minor = 1; minor <= NFSD_SUPPORTED_MINOR_VERSION; minor++)
862 len += sprintf(buf+len, " %c4.%u",
863 (nfsd_vers(4, NFSD_TEST) &&
864 nfsd_minorversion(minor, NFSD_TEST)) ?
865 '+' : '-',
866 minor);
836 len += sprintf(buf+len, "\n"); 867 len += sprintf(buf+len, "\n");
837 return len; 868 return len;
838} 869}
@@ -1248,6 +1279,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent)
1248 [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR}, 1279 [NFSD_Fh] = {"filehandle", &transaction_ops, S_IWUSR|S_IRUSR},
1249 [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR}, 1280 [NFSD_Threads] = {"threads", &transaction_ops, S_IWUSR|S_IRUSR},
1250 [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR}, 1281 [NFSD_Pool_Threads] = {"pool_threads", &transaction_ops, S_IWUSR|S_IRUSR},
1282 [NFSD_Pool_Stats] = {"pool_stats", &pool_stats_operations, S_IRUGO},
1251 [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR}, 1283 [NFSD_Versions] = {"versions", &transaction_ops, S_IWUSR|S_IRUSR},
1252 [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO}, 1284 [NFSD_Ports] = {"portlist", &transaction_ops, S_IWUSR|S_IRUGO},
1253 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO}, 1285 [NFSD_MaxBlkSize] = {"max_block_size", &transaction_ops, S_IWUSR|S_IRUGO},
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 6f7f26351227..e298e260b5f1 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -180,6 +180,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
180{ 180{
181 __be32 nfserr; 181 __be32 nfserr;
182 int stable = 1; 182 int stable = 1;
183 unsigned long cnt = argp->len;
183 184
184 dprintk("nfsd: WRITE %s %d bytes at %d\n", 185 dprintk("nfsd: WRITE %s %d bytes at %d\n",
185 SVCFH_fmt(&argp->fh), 186 SVCFH_fmt(&argp->fh),
@@ -188,7 +189,7 @@ nfsd_proc_write(struct svc_rqst *rqstp, struct nfsd_writeargs *argp,
188 nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL, 189 nfserr = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), NULL,
189 argp->offset, 190 argp->offset,
190 rqstp->rq_vec, argp->vlen, 191 rqstp->rq_vec, argp->vlen,
191 argp->len, 192 &cnt,
192 &stable); 193 &stable);
193 return nfsd_return_attrs(nfserr, resp); 194 return nfsd_return_attrs(nfserr, resp);
194} 195}
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 7c09852be713..cbba4a935786 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -22,6 +22,7 @@
22#include <linux/freezer.h> 22#include <linux/freezer.h>
23#include <linux/fs_struct.h> 23#include <linux/fs_struct.h>
24#include <linux/kthread.h> 24#include <linux/kthread.h>
25#include <linux/swap.h>
25 26
26#include <linux/sunrpc/types.h> 27#include <linux/sunrpc/types.h>
27#include <linux/sunrpc/stats.h> 28#include <linux/sunrpc/stats.h>
@@ -40,9 +41,6 @@
40extern struct svc_program nfsd_program; 41extern struct svc_program nfsd_program;
41static int nfsd(void *vrqstp); 42static int nfsd(void *vrqstp);
42struct timeval nfssvc_boot; 43struct timeval nfssvc_boot;
43static atomic_t nfsd_busy;
44static unsigned long nfsd_last_call;
45static DEFINE_SPINLOCK(nfsd_call_lock);
46 44
47/* 45/*
48 * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members 46 * nfsd_mutex protects nfsd_serv -- both the pointer itself and the members
@@ -123,6 +121,8 @@ struct svc_program nfsd_program = {
123 121
124}; 122};
125 123
124u32 nfsd_supported_minorversion;
125
126int nfsd_vers(int vers, enum vers_op change) 126int nfsd_vers(int vers, enum vers_op change)
127{ 127{
128 if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS) 128 if (vers < NFSD_MINVERS || vers >= NFSD_NRVERS)
@@ -149,6 +149,28 @@ int nfsd_vers(int vers, enum vers_op change)
149 } 149 }
150 return 0; 150 return 0;
151} 151}
152
153int nfsd_minorversion(u32 minorversion, enum vers_op change)
154{
155 if (minorversion > NFSD_SUPPORTED_MINOR_VERSION)
156 return -1;
157 switch(change) {
158 case NFSD_SET:
159 nfsd_supported_minorversion = minorversion;
160 break;
161 case NFSD_CLEAR:
162 if (minorversion == 0)
163 return -1;
164 nfsd_supported_minorversion = minorversion - 1;
165 break;
166 case NFSD_TEST:
167 return minorversion <= nfsd_supported_minorversion;
168 case NFSD_AVAIL:
169 return minorversion <= NFSD_SUPPORTED_MINOR_VERSION;
170 }
171 return 0;
172}
173
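Note the semantics: supported minorversions always form a contiguous range starting at 0. NFSD_SET makes the given minorversion the ceiling, NFSD_CLEAR of minorversion n lowers the ceiling to n - 1, and clearing minorversion 0 itself is rejected, since it is the NFSv4 baseline.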
152/* 174/*
153 * Maximum number of nfsd processes 175 * Maximum number of nfsd processes
154 */ 176 */
@@ -200,6 +222,28 @@ void nfsd_reset_versions(void)
200 } 222 }
201} 223}
202 224
 225/*
 226 * Each session guarantees a negotiated per-slot memory cache for replies,
 227 * which consumes memory beyond what a v2/v3/v4.0 server needs. A dedicated
 228 * NFSv4.1 server might want to devote more memory to a DRC than a machine
 229 * running multiple services.
 230 *
 231 * Impose a hard limit on the number of pages for the DRC, scaled to the
 232 * machine's free pages. This is of course only a default.
 233 *
 234 * For now this is a #defined shift that could be placed under admin
 235 * control in the future.
 236 */
237static void set_max_drc(void)
238{
239 /* The percent of nr_free_buffer_pages used by the V4.1 server DRC */
240 #define NFSD_DRC_SIZE_SHIFT 7
241 nfsd_serv->sv_drc_max_pages = nr_free_buffer_pages()
242 >> NFSD_DRC_SIZE_SHIFT;
243 nfsd_serv->sv_drc_pages_used = 0;
244 dprintk("%s svc_drc_max_pages %u\n", __func__,
245 nfsd_serv->sv_drc_max_pages);
246}
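For scale, NFSD_DRC_SIZE_SHIFT of 7 dedicates 1/128 of the free buffer pages to the DRC. On a machine where nr_free_buffer_pages() reports 524288 pages (2 GiB with 4 KiB pages), the DRC is capped at 4096 pages, i.e. 16 MiB.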
203 247
204int nfsd_create_serv(void) 248int nfsd_create_serv(void)
205{ 249{
@@ -227,11 +271,12 @@ int nfsd_create_serv(void)
227 nfsd_max_blksize /= 2; 271 nfsd_max_blksize /= 2;
228 } 272 }
229 273
230 atomic_set(&nfsd_busy, 0);
231 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize, 274 nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
232 nfsd_last_thread, nfsd, THIS_MODULE); 275 nfsd_last_thread, nfsd, THIS_MODULE);
233 if (nfsd_serv == NULL) 276 if (nfsd_serv == NULL)
234 err = -ENOMEM; 277 err = -ENOMEM;
278 else
279 set_max_drc();
235 280
236 do_gettimeofday(&nfssvc_boot); /* record boot time */ 281 do_gettimeofday(&nfssvc_boot); /* record boot time */
237 return err; 282 return err;
@@ -375,26 +420,6 @@ nfsd_svc(unsigned short port, int nrservs)
375 return error; 420 return error;
376} 421}
377 422
378static inline void
379update_thread_usage(int busy_threads)
380{
381 unsigned long prev_call;
382 unsigned long diff;
383 int decile;
384
385 spin_lock(&nfsd_call_lock);
386 prev_call = nfsd_last_call;
387 nfsd_last_call = jiffies;
388 decile = busy_threads*10/nfsdstats.th_cnt;
389 if (decile>0 && decile <= 10) {
390 diff = nfsd_last_call - prev_call;
391 if ( (nfsdstats.th_usage[decile-1] += diff) >= NFSD_USAGE_WRAP)
392 nfsdstats.th_usage[decile-1] -= NFSD_USAGE_WRAP;
393 if (decile == 10)
394 nfsdstats.th_fullcnt++;
395 }
396 spin_unlock(&nfsd_call_lock);
397}
398 423
399/* 424/*
400 * This is the NFS server kernel thread 425 * This is the NFS server kernel thread
@@ -460,8 +485,6 @@ nfsd(void *vrqstp)
460 continue; 485 continue;
461 } 486 }
462 487
463 update_thread_usage(atomic_read(&nfsd_busy));
464 atomic_inc(&nfsd_busy);
465 488
466 /* Lock the export hash tables for reading. */ 489 /* Lock the export hash tables for reading. */
467 exp_readlock(); 490 exp_readlock();
@@ -470,8 +493,6 @@ nfsd(void *vrqstp)
470 493
471 /* Unlock export hash tables */ 494 /* Unlock export hash tables */
472 exp_readunlock(); 495 exp_readunlock();
473 update_thread_usage(atomic_read(&nfsd_busy));
474 atomic_dec(&nfsd_busy);
475 } 496 }
476 497
477 /* Clear signals before calling svc_exit_thread() */ 498 /* Clear signals before calling svc_exit_thread() */
@@ -539,6 +560,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
539 + rqstp->rq_res.head[0].iov_len; 560 + rqstp->rq_res.head[0].iov_len;
540 rqstp->rq_res.head[0].iov_len += sizeof(__be32); 561 rqstp->rq_res.head[0].iov_len += sizeof(__be32);
541 562
563 /* NFSv4.1 DRC requires statp */
564 if (rqstp->rq_vers == 4)
565 nfsd4_set_statp(rqstp, statp);
566
542 /* Now call the procedure handler, and encode NFS status. */ 567 /* Now call the procedure handler, and encode NFS status. */
543 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp); 568 nfserr = proc->pc_func(rqstp, rqstp->rq_argp, rqstp->rq_resp);
544 nfserr = map_new_errors(rqstp->rq_vers, nfserr); 569 nfserr = map_new_errors(rqstp->rq_vers, nfserr);
@@ -570,3 +595,10 @@ nfsd_dispatch(struct svc_rqst *rqstp, __be32 *statp)
570 nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1); 595 nfsd_cache_update(rqstp, proc->pc_cachetype, statp + 1);
571 return 1; 596 return 1;
572} 597}
598
599int nfsd_pool_stats_open(struct inode *inode, struct file *file)
600{
601 if (nfsd_serv == NULL)
602 return -ENODEV;
603 return svc_pool_stats_open(nfsd_serv, file);
604}
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 78376b6c0236..ab93fcfef254 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -366,8 +366,9 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap,
366 } 366 }
367 367
368 /* Revoke setuid/setgid on chown */ 368 /* Revoke setuid/setgid on chown */
369 if (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) || 369 if (!S_ISDIR(inode->i_mode) &&
370 ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid)) { 370 (((iap->ia_valid & ATTR_UID) && iap->ia_uid != inode->i_uid) ||
371 ((iap->ia_valid & ATTR_GID) && iap->ia_gid != inode->i_gid))) {
371 iap->ia_valid |= ATTR_KILL_PRIV; 372 iap->ia_valid |= ATTR_KILL_PRIV;
372 if (iap->ia_valid & ATTR_MODE) { 373 if (iap->ia_valid & ATTR_MODE) {
373 /* we're setting mode too, just clear the s*id bits */ 374 /* we're setting mode too, just clear the s*id bits */
@@ -960,7 +961,7 @@ static void kill_suid(struct dentry *dentry)
960static __be32 961static __be32
961nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 962nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
962 loff_t offset, struct kvec *vec, int vlen, 963 loff_t offset, struct kvec *vec, int vlen,
963 unsigned long cnt, int *stablep) 964 unsigned long *cnt, int *stablep)
964{ 965{
965 struct svc_export *exp; 966 struct svc_export *exp;
966 struct dentry *dentry; 967 struct dentry *dentry;
@@ -974,7 +975,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
974 err = nfserr_perm; 975 err = nfserr_perm;
975 976
976 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) && 977 if ((fhp->fh_export->ex_flags & NFSEXP_MSNFS) &&
977 (!lock_may_write(file->f_path.dentry->d_inode, offset, cnt))) 978 (!lock_may_write(file->f_path.dentry->d_inode, offset, *cnt)))
978 goto out; 979 goto out;
979#endif 980#endif
980 981
@@ -1009,7 +1010,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1009 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset); 1010 host_err = vfs_writev(file, (struct iovec __user *)vec, vlen, &offset);
1010 set_fs(oldfs); 1011 set_fs(oldfs);
1011 if (host_err >= 0) { 1012 if (host_err >= 0) {
1012 nfsdstats.io_write += cnt; 1013 nfsdstats.io_write += host_err;
1013 fsnotify_modify(file->f_path.dentry); 1014 fsnotify_modify(file->f_path.dentry);
1014 } 1015 }
1015 1016
@@ -1054,9 +1055,10 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1054 } 1055 }
1055 1056
1056 dprintk("nfsd: write complete host_err=%d\n", host_err); 1057 dprintk("nfsd: write complete host_err=%d\n", host_err);
1057 if (host_err >= 0) 1058 if (host_err >= 0) {
1058 err = 0; 1059 err = 0;
1059 else 1060 *cnt = host_err;
1061 } else
1060 err = nfserrno(host_err); 1062 err = nfserrno(host_err);
1061out: 1063out:
1062 return err; 1064 return err;
@@ -1098,7 +1100,7 @@ out:
1098 */ 1100 */
1099__be32 1101__be32
1100nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file, 1102nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct file *file,
1101 loff_t offset, struct kvec *vec, int vlen, unsigned long cnt, 1103 loff_t offset, struct kvec *vec, int vlen, unsigned long *cnt,
1102 int *stablep) 1104 int *stablep)
1103{ 1105{
1104 __be32 err = 0; 1106 __be32 err = 0;
@@ -1179,6 +1181,21 @@ nfsd_create_setattr(struct svc_rqst *rqstp, struct svc_fh *resfhp,
1179 return 0; 1181 return 0;
1180} 1182}
1181 1183
1184/* An HP-UX client sometimes creates a file with mode 000 and sets its size
1185 * to 0. Setting the size to 0 can fail on some file systems because the
1186 * permission check requires WRITE permission, which a mode-000 file lacks.
1187 * We therefore ignore a resize to 0 on a just-created file: its size is
1188 * already 0 right after creation.
1189 *
1190 * Call this only after vfs_create() has been called.
1191 */
1192static void
1193nfsd_check_ignore_resizing(struct iattr *iap)
1194{
1195 if ((iap->ia_valid & ATTR_SIZE) && (iap->ia_size == 0))
1196 iap->ia_valid &= ~ATTR_SIZE;
1197}
1198
1182/* 1199/*
1183 * Create a file (regular, directory, device, fifo); UNIX sockets 1200 * Create a file (regular, directory, device, fifo); UNIX sockets
1184 * not yet implemented. 1201 * not yet implemented.
@@ -1274,6 +1291,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
1274 switch (type) { 1291 switch (type) {
1275 case S_IFREG: 1292 case S_IFREG:
1276 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL); 1293 host_err = vfs_create(dirp, dchild, iap->ia_mode, NULL);
1294 if (!host_err)
1295 nfsd_check_ignore_resizing(iap);
1277 break; 1296 break;
1278 case S_IFDIR: 1297 case S_IFDIR:
1279 host_err = vfs_mkdir(dirp, dchild, iap->ia_mode); 1298 host_err = vfs_mkdir(dirp, dchild, iap->ia_mode);
@@ -1427,6 +1446,8 @@ nfsd_create_v3(struct svc_rqst *rqstp, struct svc_fh *fhp,
1427 /* setattr will sync the child (or not) */ 1446 /* setattr will sync the child (or not) */
1428 } 1447 }
1429 1448
1449 nfsd_check_ignore_resizing(iap);
1450
1430 if (createmode == NFS3_CREATE_EXCLUSIVE) { 1451 if (createmode == NFS3_CREATE_EXCLUSIVE) {
1431 /* Cram the verifier into atime/mtime */ 1452 /* Cram the verifier into atime/mtime */
1432 iap->ia_valid = ATTR_MTIME|ATTR_ATIME 1453 iap->ia_valid = ATTR_MTIME|ATTR_ATIME
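The nfsd_vfs_write()/nfsd_write() hunks above change the byte count from a value
(unsigned long cnt) to a pointer (unsigned long *cnt) so the write path can report
how many bytes vfs_writev() actually wrote instead of assuming the full request
completed. A minimal caller-side sketch of the new contract; the argp/resp names
are hypothetical, only nfsd_write() itself comes from the hunk:

    /* Hypothetical caller showing the by-pointer count: nfsd_write() may
     * shrink *cnt to the number of bytes vfs_writev() really wrote, so
     * the reply must encode the updated value, not the request size. */
    unsigned long cnt = argp->len;          /* bytes the client asked for */
    __be32 nfserr;

    nfserr = nfsd_write(rqstp, fhp, file, offset, vec, vlen, &cnt, &stable);
    if (!nfserr)
        resp->count = cnt;                  /* bytes actually written */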
diff --git a/fs/nilfs2/Makefile b/fs/nilfs2/Makefile
new file mode 100644
index 000000000000..df3e62c1ddc5
--- /dev/null
+++ b/fs/nilfs2/Makefile
@@ -0,0 +1,5 @@
1obj-$(CONFIG_NILFS2_FS) += nilfs2.o
2nilfs2-y := inode.o file.o dir.o super.o namei.o page.o mdt.o \
3 btnode.o bmap.o btree.o direct.o dat.o recovery.o \
4 the_nilfs.o segbuf.o segment.o cpfile.o sufile.o \
5 ifile.o alloc.o gcinode.o ioctl.o gcdat.o
diff --git a/fs/nilfs2/alloc.c b/fs/nilfs2/alloc.c
new file mode 100644
index 000000000000..d69e6ae59251
--- /dev/null
+++ b/fs/nilfs2/alloc.c
@@ -0,0 +1,504 @@
1/*
2 * alloc.c - NILFS dat/inode allocator
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Original code was written by Koji Sato <koji@osrg.net>.
21 * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>,
22 * Amagai Yoshiji <amagai@osrg.net>.
23 */
24
25#include <linux/types.h>
26#include <linux/buffer_head.h>
27#include <linux/fs.h>
28#include <linux/bitops.h>
29#include "mdt.h"
30#include "alloc.h"
31
32
33static inline unsigned long
34nilfs_palloc_groups_per_desc_block(const struct inode *inode)
35{
36 return (1UL << inode->i_blkbits) /
37 sizeof(struct nilfs_palloc_group_desc);
38}
39
40static inline unsigned long
41nilfs_palloc_groups_count(const struct inode *inode)
42{
43 return 1UL << (BITS_PER_LONG - (inode->i_blkbits + 3 /* log2(8) */));
44}
45
46int nilfs_palloc_init_blockgroup(struct inode *inode, unsigned entry_size)
47{
48 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
49
50 mi->mi_bgl = kmalloc(sizeof(*mi->mi_bgl), GFP_NOFS);
51 if (!mi->mi_bgl)
52 return -ENOMEM;
53
54 bgl_lock_init(mi->mi_bgl);
55
56 nilfs_mdt_set_entry_size(inode, entry_size, 0);
57
58 mi->mi_blocks_per_group =
59 DIV_ROUND_UP(nilfs_palloc_entries_per_group(inode),
60 mi->mi_entries_per_block) + 1;
61 /* Number of blocks in a group including entry blocks and
62 a bitmap block */
63 mi->mi_blocks_per_desc_block =
64 nilfs_palloc_groups_per_desc_block(inode) *
65 mi->mi_blocks_per_group + 1;
66 /* Number of blocks per descriptor including the
67 descriptor block */
68 return 0;
69}
70
71static unsigned long nilfs_palloc_group(const struct inode *inode, __u64 nr,
72 unsigned long *offset)
73{
74 __u64 group = nr;
75
76 *offset = do_div(group, nilfs_palloc_entries_per_group(inode));
77 return group;
78}
79
80static unsigned long
81nilfs_palloc_desc_blkoff(const struct inode *inode, unsigned long group)
82{
83 unsigned long desc_block =
84 group / nilfs_palloc_groups_per_desc_block(inode);
85 return desc_block * NILFS_MDT(inode)->mi_blocks_per_desc_block;
86}
87
88static unsigned long
89nilfs_palloc_bitmap_blkoff(const struct inode *inode, unsigned long group)
90{
91 unsigned long desc_offset =
92 group % nilfs_palloc_groups_per_desc_block(inode);
93 return nilfs_palloc_desc_blkoff(inode, group) + 1 +
94 desc_offset * NILFS_MDT(inode)->mi_blocks_per_group;
95}
96
97static unsigned long
98nilfs_palloc_group_desc_nfrees(struct inode *inode, unsigned long group,
99 const struct nilfs_palloc_group_desc *desc)
100{
101 unsigned long nfree;
102
103 spin_lock(nilfs_mdt_bgl_lock(inode, group));
104 nfree = le32_to_cpu(desc->pg_nfrees);
105 spin_unlock(nilfs_mdt_bgl_lock(inode, group));
106 return nfree;
107}
108
109static void
110nilfs_palloc_group_desc_add_entries(struct inode *inode,
111 unsigned long group,
112 struct nilfs_palloc_group_desc *desc,
113 u32 n)
114{
115 spin_lock(nilfs_mdt_bgl_lock(inode, group));
116 le32_add_cpu(&desc->pg_nfrees, n);
117 spin_unlock(nilfs_mdt_bgl_lock(inode, group));
118}
119
120static unsigned long
121nilfs_palloc_entry_blkoff(const struct inode *inode, __u64 nr)
122{
123 unsigned long group, group_offset;
124
125 group = nilfs_palloc_group(inode, nr, &group_offset);
126
127 return nilfs_palloc_bitmap_blkoff(inode, group) + 1 +
128 group_offset / NILFS_MDT(inode)->mi_entries_per_block;
129}
130
131static void nilfs_palloc_desc_block_init(struct inode *inode,
132 struct buffer_head *bh, void *kaddr)
133{
134 struct nilfs_palloc_group_desc *desc = kaddr + bh_offset(bh);
135 unsigned long n = nilfs_palloc_groups_per_desc_block(inode);
136 __le32 nfrees;
137
138 nfrees = cpu_to_le32(nilfs_palloc_entries_per_group(inode));
139 while (n-- > 0) {
140 desc->pg_nfrees = nfrees;
141 desc++;
142 }
143}
144
145static int nilfs_palloc_get_desc_block(struct inode *inode,
146 unsigned long group,
147 int create, struct buffer_head **bhp)
148{
149 return nilfs_mdt_get_block(inode,
150 nilfs_palloc_desc_blkoff(inode, group),
151 create, nilfs_palloc_desc_block_init, bhp);
152}
153
154static int nilfs_palloc_get_bitmap_block(struct inode *inode,
155 unsigned long group,
156 int create, struct buffer_head **bhp)
157{
158 return nilfs_mdt_get_block(inode,
159 nilfs_palloc_bitmap_blkoff(inode, group),
160 create, NULL, bhp);
161}
162
163int nilfs_palloc_get_entry_block(struct inode *inode, __u64 nr,
164 int create, struct buffer_head **bhp)
165{
166 return nilfs_mdt_get_block(inode, nilfs_palloc_entry_blkoff(inode, nr),
167 create, NULL, bhp);
168}
169
170static struct nilfs_palloc_group_desc *
171nilfs_palloc_block_get_group_desc(const struct inode *inode,
172 unsigned long group,
173 const struct buffer_head *bh, void *kaddr)
174{
175 return (struct nilfs_palloc_group_desc *)(kaddr + bh_offset(bh)) +
176 group % nilfs_palloc_groups_per_desc_block(inode);
177}
178
179static unsigned char *
180nilfs_palloc_block_get_bitmap(const struct inode *inode,
181 const struct buffer_head *bh, void *kaddr)
182{
183 return (unsigned char *)(kaddr + bh_offset(bh));
184}
185
186void *nilfs_palloc_block_get_entry(const struct inode *inode, __u64 nr,
187 const struct buffer_head *bh, void *kaddr)
188{
189 unsigned long entry_offset, group_offset;
190
191 nilfs_palloc_group(inode, nr, &group_offset);
192 entry_offset = group_offset % NILFS_MDT(inode)->mi_entries_per_block;
193
194 return kaddr + bh_offset(bh) +
195 entry_offset * NILFS_MDT(inode)->mi_entry_size;
196}
197
198static int nilfs_palloc_find_available_slot(struct inode *inode,
199 unsigned long group,
200 unsigned long target,
201 unsigned char *bitmap,
202 int bsize) /* size in bits */
203{
204 int curr, pos, end, i;
205
206 if (target > 0) {
207 end = (target + BITS_PER_LONG - 1) & ~(BITS_PER_LONG - 1);
208 if (end > bsize)
209 end = bsize;
210 pos = nilfs_find_next_zero_bit(bitmap, end, target);
211 if (pos < end &&
212 !nilfs_set_bit_atomic(
213 nilfs_mdt_bgl_lock(inode, group), pos, bitmap))
214 return pos;
215 } else
216 end = 0;
217
218 for (i = 0, curr = end;
219 i < bsize;
220 i += BITS_PER_LONG, curr += BITS_PER_LONG) {
221 /* wrap around */
222 if (curr >= bsize)
223 curr = 0;
224 while (*((unsigned long *)bitmap + curr / BITS_PER_LONG)
225 != ~0UL) {
226 end = curr + BITS_PER_LONG;
227 if (end > bsize)
228 end = bsize;
229 pos = nilfs_find_next_zero_bit(bitmap, end, curr);
230 if ((pos < end) &&
231 !nilfs_set_bit_atomic(
232 nilfs_mdt_bgl_lock(inode, group), pos,
233 bitmap))
234 return pos;
235 }
236 }
237 return -ENOSPC;
238}
239
240static unsigned long
241nilfs_palloc_rest_groups_in_desc_block(const struct inode *inode,
242 unsigned long curr, unsigned long max)
243{
244 return min_t(unsigned long,
245 nilfs_palloc_groups_per_desc_block(inode) -
246 curr % nilfs_palloc_groups_per_desc_block(inode),
247 max - curr + 1);
248}
249
250int nilfs_palloc_prepare_alloc_entry(struct inode *inode,
251 struct nilfs_palloc_req *req)
252{
253 struct buffer_head *desc_bh, *bitmap_bh;
254 struct nilfs_palloc_group_desc *desc;
255 unsigned char *bitmap;
256 void *desc_kaddr, *bitmap_kaddr;
257 unsigned long group, maxgroup, ngroups;
258 unsigned long group_offset, maxgroup_offset;
259 unsigned long n, entries_per_group, groups_per_desc_block;
260 unsigned long i, j;
261 int pos, ret;
262
263 ngroups = nilfs_palloc_groups_count(inode);
264 maxgroup = ngroups - 1;
265 group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
266 entries_per_group = nilfs_palloc_entries_per_group(inode);
267 groups_per_desc_block = nilfs_palloc_groups_per_desc_block(inode);
268
269 for (i = 0; i < ngroups; i += n) {
270 if (group >= ngroups) {
271 /* wrap around */
272 group = 0;
273 maxgroup = nilfs_palloc_group(inode, req->pr_entry_nr,
274 &maxgroup_offset) - 1;
275 }
276 ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh);
277 if (ret < 0)
278 return ret;
279 desc_kaddr = kmap(desc_bh->b_page);
280 desc = nilfs_palloc_block_get_group_desc(
281 inode, group, desc_bh, desc_kaddr);
282 n = nilfs_palloc_rest_groups_in_desc_block(inode, group,
283 maxgroup);
284 for (j = 0; j < n; j++, desc++, group++) {
285 if (nilfs_palloc_group_desc_nfrees(inode, group, desc)
286 > 0) {
287 ret = nilfs_palloc_get_bitmap_block(
288 inode, group, 1, &bitmap_bh);
289 if (ret < 0)
290 goto out_desc;
291 bitmap_kaddr = kmap(bitmap_bh->b_page);
292 bitmap = nilfs_palloc_block_get_bitmap(
293 inode, bitmap_bh, bitmap_kaddr);
294 pos = nilfs_palloc_find_available_slot(
295 inode, group, group_offset, bitmap,
296 entries_per_group);
297 if (pos >= 0) {
298 /* found a free entry */
299 nilfs_palloc_group_desc_add_entries(
300 inode, group, desc, -1);
301 req->pr_entry_nr =
302 entries_per_group * group + pos;
303 kunmap(desc_bh->b_page);
304 kunmap(bitmap_bh->b_page);
305
306 req->pr_desc_bh = desc_bh;
307 req->pr_bitmap_bh = bitmap_bh;
308 return 0;
309 }
310 kunmap(bitmap_bh->b_page);
311 brelse(bitmap_bh);
312 }
313
314 group_offset = 0;
315 }
316
317 kunmap(desc_bh->b_page);
318 brelse(desc_bh);
319 }
320
321 /* no entries left */
322 return -ENOSPC;
323
324 out_desc:
325 kunmap(desc_bh->b_page);
326 brelse(desc_bh);
327 return ret;
328}
329
330void nilfs_palloc_commit_alloc_entry(struct inode *inode,
331 struct nilfs_palloc_req *req)
332{
333 nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh);
334 nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh);
335 nilfs_mdt_mark_dirty(inode);
336
337 brelse(req->pr_bitmap_bh);
338 brelse(req->pr_desc_bh);
339}
340
341void nilfs_palloc_commit_free_entry(struct inode *inode,
342 struct nilfs_palloc_req *req)
343{
344 struct nilfs_palloc_group_desc *desc;
345 unsigned long group, group_offset;
346 unsigned char *bitmap;
347 void *desc_kaddr, *bitmap_kaddr;
348
349 group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
350 desc_kaddr = kmap(req->pr_desc_bh->b_page);
351 desc = nilfs_palloc_block_get_group_desc(inode, group,
352 req->pr_desc_bh, desc_kaddr);
353 bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
354 bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh,
355 bitmap_kaddr);
356
357 if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
358 group_offset, bitmap))
359 printk(KERN_WARNING "%s: entry number %llu already freed\n",
360 __func__, (unsigned long long)req->pr_entry_nr);
361
362 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
363
364 kunmap(req->pr_bitmap_bh->b_page);
365 kunmap(req->pr_desc_bh->b_page);
366
367 nilfs_mdt_mark_buffer_dirty(req->pr_desc_bh);
368 nilfs_mdt_mark_buffer_dirty(req->pr_bitmap_bh);
369 nilfs_mdt_mark_dirty(inode);
370
371 brelse(req->pr_bitmap_bh);
372 brelse(req->pr_desc_bh);
373}
374
375void nilfs_palloc_abort_alloc_entry(struct inode *inode,
376 struct nilfs_palloc_req *req)
377{
378 struct nilfs_palloc_group_desc *desc;
379 void *desc_kaddr, *bitmap_kaddr;
380 unsigned char *bitmap;
381 unsigned long group, group_offset;
382
383 group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
384 desc_kaddr = kmap(req->pr_desc_bh->b_page);
385 desc = nilfs_palloc_block_get_group_desc(inode, group,
386 req->pr_desc_bh, desc_kaddr);
387 bitmap_kaddr = kmap(req->pr_bitmap_bh->b_page);
388 bitmap = nilfs_palloc_block_get_bitmap(inode, req->pr_bitmap_bh,
389 bitmap_kaddr);
390 if (!nilfs_clear_bit_atomic(nilfs_mdt_bgl_lock(inode, group),
391 group_offset, bitmap))
392  printk(KERN_WARNING "%s: entry number %llu already freed\n",
393 __func__, (unsigned long long)req->pr_entry_nr);
394
395 nilfs_palloc_group_desc_add_entries(inode, group, desc, 1);
396
397 kunmap(req->pr_bitmap_bh->b_page);
398 kunmap(req->pr_desc_bh->b_page);
399
400 brelse(req->pr_bitmap_bh);
401 brelse(req->pr_desc_bh);
402
403 req->pr_entry_nr = 0;
404 req->pr_bitmap_bh = NULL;
405 req->pr_desc_bh = NULL;
406}
407
408int nilfs_palloc_prepare_free_entry(struct inode *inode,
409 struct nilfs_palloc_req *req)
410{
411 struct buffer_head *desc_bh, *bitmap_bh;
412 unsigned long group, group_offset;
413 int ret;
414
415 group = nilfs_palloc_group(inode, req->pr_entry_nr, &group_offset);
416 ret = nilfs_palloc_get_desc_block(inode, group, 1, &desc_bh);
417 if (ret < 0)
418 return ret;
419 ret = nilfs_palloc_get_bitmap_block(inode, group, 1, &bitmap_bh);
420 if (ret < 0) {
421 brelse(desc_bh);
422 return ret;
423 }
424
425 req->pr_desc_bh = desc_bh;
426 req->pr_bitmap_bh = bitmap_bh;
427 return 0;
428}
429
430void nilfs_palloc_abort_free_entry(struct inode *inode,
431 struct nilfs_palloc_req *req)
432{
433 brelse(req->pr_bitmap_bh);
434 brelse(req->pr_desc_bh);
435
436 req->pr_entry_nr = 0;
437 req->pr_bitmap_bh = NULL;
438 req->pr_desc_bh = NULL;
439}
440
441static int
442nilfs_palloc_group_is_in(struct inode *inode, unsigned long group, __u64 nr)
443{
444 __u64 first, last;
445
446 first = group * nilfs_palloc_entries_per_group(inode);
447 last = first + nilfs_palloc_entries_per_group(inode) - 1;
448 return (nr >= first) && (nr <= last);
449}
450
451int nilfs_palloc_freev(struct inode *inode, __u64 *entry_nrs, size_t nitems)
452{
453 struct buffer_head *desc_bh, *bitmap_bh;
454 struct nilfs_palloc_group_desc *desc;
455 unsigned char *bitmap;
456 void *desc_kaddr, *bitmap_kaddr;
457 unsigned long group, group_offset;
458 int i, j, n, ret;
459
460 for (i = 0; i < nitems; i += n) {
461 group = nilfs_palloc_group(inode, entry_nrs[i], &group_offset);
462 ret = nilfs_palloc_get_desc_block(inode, group, 0, &desc_bh);
463 if (ret < 0)
464 return ret;
465 ret = nilfs_palloc_get_bitmap_block(inode, group, 0,
466 &bitmap_bh);
467 if (ret < 0) {
468 brelse(desc_bh);
469 return ret;
470 }
471 desc_kaddr = kmap(desc_bh->b_page);
472 desc = nilfs_palloc_block_get_group_desc(
473 inode, group, desc_bh, desc_kaddr);
474 bitmap_kaddr = kmap(bitmap_bh->b_page);
475 bitmap = nilfs_palloc_block_get_bitmap(
476 inode, bitmap_bh, bitmap_kaddr);
477 for (j = i, n = 0;
478 (j < nitems) && nilfs_palloc_group_is_in(inode, group,
479 entry_nrs[j]);
480 j++, n++) {
481 nilfs_palloc_group(inode, entry_nrs[j], &group_offset);
482 if (!nilfs_clear_bit_atomic(
483 nilfs_mdt_bgl_lock(inode, group),
484 group_offset, bitmap)) {
485 printk(KERN_WARNING
486 "%s: entry number %llu already freed\n",
487 __func__,
488 (unsigned long long)entry_nrs[j]);
489 }
490 }
491 nilfs_palloc_group_desc_add_entries(inode, group, desc, n);
492
493 kunmap(bitmap_bh->b_page);
494 kunmap(desc_bh->b_page);
495
496 nilfs_mdt_mark_buffer_dirty(desc_bh);
497 nilfs_mdt_mark_buffer_dirty(bitmap_bh);
498 nilfs_mdt_mark_dirty(inode);
499
500 brelse(bitmap_bh);
501 brelse(desc_bh);
502 }
503 return 0;
504}
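As a sanity check on the geometry that nilfs_palloc_init_blockgroup() computes, here
is the arithmetic worked through with concrete numbers; the 4 KiB block size, the
128-byte entry size, and the 4-byte group descriptor are illustrative assumptions,
not values fixed by the code above:

    /* Worked example, assuming 4 KiB blocks (i_blkbits = 12), 128-byte
     * entries, and a 4-byte struct nilfs_palloc_group_desc. */
    unsigned long entries_per_group = 1UL << (12 + 3);  /* 32768 bits in a one-block bitmap */
    unsigned long entries_per_block = 4096 / 128;       /* 32 entries per entry block */
    unsigned long blocks_per_group  = 32768 / 32 + 1;   /* 1025: entry blocks + bitmap block */
    unsigned long groups_per_desc   = 4096 / 4;         /* 1024 descriptors per descriptor block */
    unsigned long blocks_per_desc   = 1024 * 1025 + 1;  /* 1049601: + the descriptor block itself */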
diff --git a/fs/nilfs2/alloc.h b/fs/nilfs2/alloc.h
new file mode 100644
index 000000000000..4ace5475c2c7
--- /dev/null
+++ b/fs/nilfs2/alloc.h
@@ -0,0 +1,72 @@
1/*
2 * alloc.h - persistent object (dat entry/disk inode) allocator/deallocator
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Original code was written by Koji Sato <koji@osrg.net>.
21 * Two allocators were unified by Ryusuke Konishi <ryusuke@osrg.net>,
22 * Amagai Yoshiji <amagai@osrg.net>.
23 */
24
25#ifndef _NILFS_ALLOC_H
26#define _NILFS_ALLOC_H
27
28#include <linux/types.h>
29#include <linux/buffer_head.h>
30#include <linux/fs.h>
31
32static inline unsigned long
33nilfs_palloc_entries_per_group(const struct inode *inode)
34{
35 return 1UL << (inode->i_blkbits + 3 /* log2(8 = CHAR_BIT) */);
36}
37
38int nilfs_palloc_init_blockgroup(struct inode *, unsigned);
39int nilfs_palloc_get_entry_block(struct inode *, __u64, int,
40 struct buffer_head **);
41void *nilfs_palloc_block_get_entry(const struct inode *, __u64,
42 const struct buffer_head *, void *);
43
44/**
45 * nilfs_palloc_req - persistent allocator request and reply
46 * @pr_entry_nr: entry number (vblocknr or inode number)
47 * @pr_desc_bh: buffer head of the buffer containing block group descriptors
48 * @pr_bitmap_bh: buffer head of the buffer containing a block group bitmap
49 * @pr_entry_bh: buffer head of the buffer containing translation entries
50 */
51struct nilfs_palloc_req {
52 __u64 pr_entry_nr;
53 struct buffer_head *pr_desc_bh;
54 struct buffer_head *pr_bitmap_bh;
55 struct buffer_head *pr_entry_bh;
56};
57
58int nilfs_palloc_prepare_alloc_entry(struct inode *,
59 struct nilfs_palloc_req *);
60void nilfs_palloc_commit_alloc_entry(struct inode *,
61 struct nilfs_palloc_req *);
62void nilfs_palloc_abort_alloc_entry(struct inode *, struct nilfs_palloc_req *);
63void nilfs_palloc_commit_free_entry(struct inode *, struct nilfs_palloc_req *);
64int nilfs_palloc_prepare_free_entry(struct inode *, struct nilfs_palloc_req *);
65void nilfs_palloc_abort_free_entry(struct inode *, struct nilfs_palloc_req *);
66int nilfs_palloc_freev(struct inode *, __u64 *, size_t);
67
68#define nilfs_set_bit_atomic ext2_set_bit_atomic
69#define nilfs_clear_bit_atomic ext2_clear_bit_atomic
70#define nilfs_find_next_zero_bit ext2_find_next_zero_bit
71
72#endif /* _NILFS_ALLOC_H */
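The declarations above follow a prepare/commit/abort convention:
nilfs_palloc_prepare_alloc_entry() reserves an entry (pr_entry_nr carries the
allocation hint in and the allocated number out) and leaves pr_desc_bh and
pr_bitmap_bh held, so exactly one of commit or abort must follow. A minimal
sketch of that calling sequence, assuming a hypothetical do_init_entry() step:

    struct nilfs_palloc_req req = { .pr_entry_nr = hint };
    int err;

    err = nilfs_palloc_prepare_alloc_entry(inode, &req);
    if (err)
        return err;                     /* -ENOSPC when every group is full */

    err = do_init_entry(inode, &req);   /* hypothetical: fill in the entry */
    if (err) {
        /* clears the bitmap bit again and drops the held buffers */
        nilfs_palloc_abort_alloc_entry(inode, &req);
        return err;
    }
    /* marks the buffers dirty and drops them */
    nilfs_palloc_commit_alloc_entry(inode, &req);
    return 0;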
diff --git a/fs/nilfs2/bmap.c b/fs/nilfs2/bmap.c
new file mode 100644
index 000000000000..24638e059bf3
--- /dev/null
+++ b/fs/nilfs2/bmap.c
@@ -0,0 +1,783 @@
1/*
2 * bmap.c - NILFS block mapping.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/fs.h>
24#include <linux/string.h>
25#include <linux/errno.h>
26#include "nilfs.h"
27#include "bmap.h"
28#include "sb.h"
29#include "btnode.h"
30#include "mdt.h"
31#include "dat.h"
32#include "alloc.h"
33
34int nilfs_bmap_lookup_at_level(struct nilfs_bmap *bmap, __u64 key, int level,
35 __u64 *ptrp)
36{
37 __u64 ptr;
38 int ret;
39
40 down_read(&bmap->b_sem);
41 ret = bmap->b_ops->bop_lookup(bmap, key, level, ptrp);
42 if (ret < 0)
43 goto out;
44 if (bmap->b_pops->bpop_translate != NULL) {
45 ret = bmap->b_pops->bpop_translate(bmap, *ptrp, &ptr);
46 if (ret < 0)
47 goto out;
48 *ptrp = ptr;
49 }
50
51 out:
52 up_read(&bmap->b_sem);
53 return ret;
54}
55
56
57/**
58 * nilfs_bmap_lookup - find a record
59 * @bmap: bmap
60 * @key: key
61 * @recp: pointer to record
62 *
63 * Description: nilfs_bmap_lookup() finds a record whose key matches @key in
64 * @bmap.
65 *
66 * Return Value: On success, 0 is returned and the record associated with @key
67 * is stored in the place pointed by @recp. On error, one of the following
68 * negative error codes is returned.
69 *
70 * %-EIO - I/O error.
71 *
72 * %-ENOMEM - Insufficient amount of memory available.
73 *
74 * %-ENOENT - A record associated with @key does not exist.
75 */
76int nilfs_bmap_lookup(struct nilfs_bmap *bmap,
77 unsigned long key,
78 unsigned long *recp)
79{
80 __u64 ptr;
81 int ret;
82
83 /* XXX: use macro for level 1 */
84 ret = nilfs_bmap_lookup_at_level(bmap, key, 1, &ptr);
85 if (recp != NULL)
86 *recp = ptr;
87 return ret;
88}
89
90static int nilfs_bmap_do_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
91{
92 __u64 keys[NILFS_BMAP_SMALL_HIGH + 1];
93 __u64 ptrs[NILFS_BMAP_SMALL_HIGH + 1];
94 int ret, n;
95
96 if (bmap->b_ops->bop_check_insert != NULL) {
97 ret = bmap->b_ops->bop_check_insert(bmap, key);
98 if (ret > 0) {
99 n = bmap->b_ops->bop_gather_data(
100 bmap, keys, ptrs, NILFS_BMAP_SMALL_HIGH + 1);
101 if (n < 0)
102 return n;
103 ret = nilfs_btree_convert_and_insert(
104 bmap, key, ptr, keys, ptrs, n,
105 NILFS_BMAP_LARGE_LOW, NILFS_BMAP_LARGE_HIGH);
106 if (ret == 0)
107 bmap->b_u.u_flags |= NILFS_BMAP_LARGE;
108
109 return ret;
110 } else if (ret < 0)
111 return ret;
112 }
113
114 return bmap->b_ops->bop_insert(bmap, key, ptr);
115}
116
117/**
118 * nilfs_bmap_insert - insert a new key-record pair into a bmap
119 * @bmap: bmap
120 * @key: key
121 * @rec: record
122 *
123 * Description: nilfs_bmap_insert() inserts the new key-record pair specified
124 * by @key and @rec into @bmap.
125 *
126 * Return Value: On success, 0 is returned. On error, one of the following
127 * negative error codes is returned.
128 *
129 * %-EIO - I/O error.
130 *
131 * %-ENOMEM - Insufficient amount of memory available.
132 *
133 * %-EEXIST - A record associated with @key already exists.
134 */
135int nilfs_bmap_insert(struct nilfs_bmap *bmap,
136 unsigned long key,
137 unsigned long rec)
138{
139 int ret;
140
141 down_write(&bmap->b_sem);
142 ret = nilfs_bmap_do_insert(bmap, key, rec);
143 up_write(&bmap->b_sem);
144 return ret;
145}
146
147static int nilfs_bmap_do_delete(struct nilfs_bmap *bmap, __u64 key)
148{
149 __u64 keys[NILFS_BMAP_LARGE_LOW + 1];
150 __u64 ptrs[NILFS_BMAP_LARGE_LOW + 1];
151 int ret, n;
152
153 if (bmap->b_ops->bop_check_delete != NULL) {
154 ret = bmap->b_ops->bop_check_delete(bmap, key);
155 if (ret > 0) {
156 n = bmap->b_ops->bop_gather_data(
157 bmap, keys, ptrs, NILFS_BMAP_LARGE_LOW + 1);
158 if (n < 0)
159 return n;
160 ret = nilfs_direct_delete_and_convert(
161 bmap, key, keys, ptrs, n,
162 NILFS_BMAP_SMALL_LOW, NILFS_BMAP_SMALL_HIGH);
163 if (ret == 0)
164 bmap->b_u.u_flags &= ~NILFS_BMAP_LARGE;
165
166 return ret;
167 } else if (ret < 0)
168 return ret;
169 }
170
171 return bmap->b_ops->bop_delete(bmap, key);
172}
173
174int nilfs_bmap_last_key(struct nilfs_bmap *bmap, unsigned long *key)
175{
176 __u64 lastkey;
177 int ret;
178
179 down_read(&bmap->b_sem);
180 ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
181 if (!ret)
182 *key = lastkey;
183 up_read(&bmap->b_sem);
184 return ret;
185}
186
187/**
188 * nilfs_bmap_delete - delete a key-record pair from a bmap
189 * @bmap: bmap
190 * @key: key
191 *
192 * Description: nilfs_bmap_delete() deletes the key-record pair specified by
193 * @key from @bmap.
194 *
195 * Return Value: On success, 0 is returned. On error, one of the following
196 * negative error codes is returned.
197 *
198 * %-EIO - I/O error.
199 *
200 * %-ENOMEM - Insufficient amount of memory available.
201 *
202 * %-ENOENT - A record associated with @key does not exist.
203 */
204int nilfs_bmap_delete(struct nilfs_bmap *bmap, unsigned long key)
205{
206 int ret;
207
208 down_write(&bmap->b_sem);
209 ret = nilfs_bmap_do_delete(bmap, key);
210 up_write(&bmap->b_sem);
211 return ret;
212}
213
214static int nilfs_bmap_do_truncate(struct nilfs_bmap *bmap, unsigned long key)
215{
216 __u64 lastkey;
217 int ret;
218
219 ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
220 if (ret < 0) {
221 if (ret == -ENOENT)
222 ret = 0;
223 return ret;
224 }
225
226 while (key <= lastkey) {
227 ret = nilfs_bmap_do_delete(bmap, lastkey);
228 if (ret < 0)
229 return ret;
230 ret = bmap->b_ops->bop_last_key(bmap, &lastkey);
231 if (ret < 0) {
232 if (ret == -ENOENT)
233 ret = 0;
234 return ret;
235 }
236 }
237 return 0;
238}
239
240/**
241 * nilfs_bmap_truncate - truncate a bmap to a specified key
242 * @bmap: bmap
243 * @key: key
244 *
245 * Description: nilfs_bmap_truncate() removes key-record pairs whose keys are
246 * greater than or equal to @key from @bmap.
247 *
248 * Return Value: On success, 0 is returned. On error, one of the following
249 * negative error codes is returned.
250 *
251 * %-EIO - I/O error.
252 *
253 * %-ENOMEM - Insufficient amount of memory available.
254 */
255int nilfs_bmap_truncate(struct nilfs_bmap *bmap, unsigned long key)
256{
257 int ret;
258
259 down_write(&bmap->b_sem);
260 ret = nilfs_bmap_do_truncate(bmap, key);
261 up_write(&bmap->b_sem);
262 return ret;
263}
264
265/**
266 * nilfs_bmap_clear - free resources a bmap holds
267 * @bmap: bmap
268 *
269 * Description: nilfs_bmap_clear() frees resources associated with @bmap.
270 */
271void nilfs_bmap_clear(struct nilfs_bmap *bmap)
272{
273 down_write(&bmap->b_sem);
274 if (bmap->b_ops->bop_clear != NULL)
275 bmap->b_ops->bop_clear(bmap);
276 up_write(&bmap->b_sem);
277}
278
279/**
280 * nilfs_bmap_propagate - propagate dirty state
281 * @bmap: bmap
282 * @bh: buffer head
283 *
284 * Description: nilfs_bmap_propagate() marks the buffers that directly or
285 * indirectly refer to the block specified by @bh dirty.
286 *
287 * Return Value: On success, 0 is returned. On error, one of the following
288 * negative error codes is returned.
289 *
290 * %-EIO - I/O error.
291 *
292 * %-ENOMEM - Insufficient amount of memory available.
293 */
294int nilfs_bmap_propagate(struct nilfs_bmap *bmap, struct buffer_head *bh)
295{
296 int ret;
297
298 down_write(&bmap->b_sem);
299 ret = bmap->b_ops->bop_propagate(bmap, bh);
300 up_write(&bmap->b_sem);
301 return ret;
302}
303
304/**
305 * nilfs_bmap_lookup_dirty_buffers - collect dirty block buffers of a bmap
306 * @bmap: bmap
307 * @listp: pointer to buffer head list
308 */
309void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *bmap,
310 struct list_head *listp)
311{
312 if (bmap->b_ops->bop_lookup_dirty_buffers != NULL)
313 bmap->b_ops->bop_lookup_dirty_buffers(bmap, listp);
314}
315
316/**
317 * nilfs_bmap_assign - assign a new block number to a block
318 * @bmap: bmap
319 * @bhp: pointer to buffer head
320 * @blocknr: block number
321 * @binfo: block information
322 *
323 * Description: nilfs_bmap_assign() assigns the block number @blocknr to the
324 * buffer specified by @bhp.
325 *
326 * Return Value: On success, 0 is returned, and the buffer head of the newly
327 * created buffer and the block information associated with the buffer are
328 * stored in the places pointed to by @bhp and @binfo, respectively. On error, one
329 * of the following negative error codes is returned.
330 *
331 * %-EIO - I/O error.
332 *
333 * %-ENOMEM - Insufficient amount of memory available.
334 */
335int nilfs_bmap_assign(struct nilfs_bmap *bmap,
336 struct buffer_head **bh,
337 unsigned long blocknr,
338 union nilfs_binfo *binfo)
339{
340 int ret;
341
342 down_write(&bmap->b_sem);
343 ret = bmap->b_ops->bop_assign(bmap, bh, blocknr, binfo);
344 up_write(&bmap->b_sem);
345 return ret;
346}
347
348/**
349 * nilfs_bmap_mark - mark block dirty
350 * @bmap: bmap
351 * @key: key
352 * @level: level
353 *
354 * Description: nilfs_bmap_mark() marks the block specified by @key and @level
355 * as dirty.
356 *
357 * Return Value: On success, 0 is returned. On error, one of the following
358 * negative error codes is returned.
359 *
360 * %-EIO - I/O error.
361 *
362 * %-ENOMEM - Insufficient amount of memory available.
363 */
364int nilfs_bmap_mark(struct nilfs_bmap *bmap, __u64 key, int level)
365{
366 int ret;
367
368 if (bmap->b_ops->bop_mark == NULL)
369 return 0;
370
371 down_write(&bmap->b_sem);
372 ret = bmap->b_ops->bop_mark(bmap, key, level);
373 up_write(&bmap->b_sem);
374 return ret;
375}
376
377/**
378 * nilfs_bmap_test_and_clear_dirty - test and clear a bmap dirty state
379 * @bmap: bmap
380 *
381 * Description: nilfs_bmap_test_and_clear_dirty() atomically tests and
382 * clears the dirty state of @bmap.
383 *
384 * Return Value: 1 is returned if @bmap is dirty, or 0 if clear.
385 */
386int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *bmap)
387{
388 int ret;
389
390 down_write(&bmap->b_sem);
391 ret = nilfs_bmap_dirty(bmap);
392 nilfs_bmap_clear_dirty(bmap);
393 up_write(&bmap->b_sem);
394 return ret;
395}
396
397
398/*
399 * Internal use only
400 */
401
402void nilfs_bmap_add_blocks(const struct nilfs_bmap *bmap, int n)
403{
404 inode_add_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
405 if (NILFS_MDT(bmap->b_inode))
406 nilfs_mdt_mark_dirty(bmap->b_inode);
407 else
408 mark_inode_dirty(bmap->b_inode);
409}
410
411void nilfs_bmap_sub_blocks(const struct nilfs_bmap *bmap, int n)
412{
413 inode_sub_bytes(bmap->b_inode, (1 << bmap->b_inode->i_blkbits) * n);
414 if (NILFS_MDT(bmap->b_inode))
415 nilfs_mdt_mark_dirty(bmap->b_inode);
416 else
417 mark_inode_dirty(bmap->b_inode);
418}
419
420int nilfs_bmap_get_block(const struct nilfs_bmap *bmap, __u64 ptr,
421 struct buffer_head **bhp)
422{
423 return nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache,
424 ptr, 0, bhp, 0);
425}
426
427void nilfs_bmap_put_block(const struct nilfs_bmap *bmap,
428 struct buffer_head *bh)
429{
430 brelse(bh);
431}
432
433int nilfs_bmap_get_new_block(const struct nilfs_bmap *bmap, __u64 ptr,
434 struct buffer_head **bhp)
435{
436 int ret;
437
438 ret = nilfs_btnode_get(&NILFS_BMAP_I(bmap)->i_btnode_cache,
439 ptr, 0, bhp, 1);
440 if (ret < 0)
441 return ret;
442 set_buffer_nilfs_volatile(*bhp);
443 return 0;
444}
445
446void nilfs_bmap_delete_block(const struct nilfs_bmap *bmap,
447 struct buffer_head *bh)
448{
449 nilfs_btnode_delete(bh);
450}
451
452__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *bmap,
453 const struct buffer_head *bh)
454{
455 struct buffer_head *pbh;
456 __u64 key;
457
458 key = page_index(bh->b_page) << (PAGE_CACHE_SHIFT -
459 bmap->b_inode->i_blkbits);
460 for (pbh = page_buffers(bh->b_page); pbh != bh;
461 pbh = pbh->b_this_page, key++);
462
463 return key;
464}
465
466__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *bmap, __u64 key)
467{
468 __s64 diff;
469
470 diff = key - bmap->b_last_allocated_key;
471 if ((nilfs_bmap_keydiff_abs(diff) < NILFS_INODE_BMAP_SIZE) &&
472 (bmap->b_last_allocated_ptr != NILFS_BMAP_INVALID_PTR) &&
473 (bmap->b_last_allocated_ptr + diff > 0))
474 return bmap->b_last_allocated_ptr + diff;
475 else
476 return NILFS_BMAP_INVALID_PTR;
477}
478
479static struct inode *nilfs_bmap_get_dat(const struct nilfs_bmap *bmap)
480{
481 return nilfs_dat_inode(NILFS_I_NILFS(bmap->b_inode));
482}
483
484#define NILFS_BMAP_GROUP_DIV 8
485__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *bmap)
486{
487 struct inode *dat = nilfs_bmap_get_dat(bmap);
488 unsigned long entries_per_group = nilfs_palloc_entries_per_group(dat);
489 unsigned long group = bmap->b_inode->i_ino / entries_per_group;
490
491 return group * entries_per_group +
492 (bmap->b_inode->i_ino % NILFS_BMAP_GROUP_DIV) *
493 (entries_per_group / NILFS_BMAP_GROUP_DIV);
494}
495
496static int nilfs_bmap_prepare_alloc_v(struct nilfs_bmap *bmap,
497 union nilfs_bmap_ptr_req *req)
498{
499 return nilfs_dat_prepare_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
500}
501
502static void nilfs_bmap_commit_alloc_v(struct nilfs_bmap *bmap,
503 union nilfs_bmap_ptr_req *req)
504{
505 nilfs_dat_commit_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
506}
507
508static void nilfs_bmap_abort_alloc_v(struct nilfs_bmap *bmap,
509 union nilfs_bmap_ptr_req *req)
510{
511 nilfs_dat_abort_alloc(nilfs_bmap_get_dat(bmap), &req->bpr_req);
512}
513
514static int nilfs_bmap_prepare_start_v(struct nilfs_bmap *bmap,
515 union nilfs_bmap_ptr_req *req)
516{
517 return nilfs_dat_prepare_start(nilfs_bmap_get_dat(bmap), &req->bpr_req);
518}
519
520static void nilfs_bmap_commit_start_v(struct nilfs_bmap *bmap,
521 union nilfs_bmap_ptr_req *req,
522 sector_t blocknr)
523{
524 nilfs_dat_commit_start(nilfs_bmap_get_dat(bmap), &req->bpr_req,
525 blocknr);
526}
527
528static void nilfs_bmap_abort_start_v(struct nilfs_bmap *bmap,
529 union nilfs_bmap_ptr_req *req)
530{
531 nilfs_dat_abort_start(nilfs_bmap_get_dat(bmap), &req->bpr_req);
532}
533
534static int nilfs_bmap_prepare_end_v(struct nilfs_bmap *bmap,
535 union nilfs_bmap_ptr_req *req)
536{
537 return nilfs_dat_prepare_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
538}
539
540static void nilfs_bmap_commit_end_v(struct nilfs_bmap *bmap,
541 union nilfs_bmap_ptr_req *req)
542{
543 nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 0);
544}
545
546static void nilfs_bmap_commit_end_vmdt(struct nilfs_bmap *bmap,
547 union nilfs_bmap_ptr_req *req)
548{
549 nilfs_dat_commit_end(nilfs_bmap_get_dat(bmap), &req->bpr_req, 1);
550}
551
552static void nilfs_bmap_abort_end_v(struct nilfs_bmap *bmap,
553 union nilfs_bmap_ptr_req *req)
554{
555 nilfs_dat_abort_end(nilfs_bmap_get_dat(bmap), &req->bpr_req);
556}
557
558int nilfs_bmap_move_v(const struct nilfs_bmap *bmap, __u64 vblocknr,
559 sector_t blocknr)
560{
561 return nilfs_dat_move(nilfs_bmap_get_dat(bmap), vblocknr, blocknr);
562}
563
564int nilfs_bmap_mark_dirty(const struct nilfs_bmap *bmap, __u64 vblocknr)
565{
566 return nilfs_dat_mark_dirty(nilfs_bmap_get_dat(bmap), vblocknr);
567}
568
569int nilfs_bmap_prepare_update(struct nilfs_bmap *bmap,
570 union nilfs_bmap_ptr_req *oldreq,
571 union nilfs_bmap_ptr_req *newreq)
572{
573 int ret;
574
575 ret = bmap->b_pops->bpop_prepare_end_ptr(bmap, oldreq);
576 if (ret < 0)
577 return ret;
578 ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, newreq);
579 if (ret < 0)
580 bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq);
581
582 return ret;
583}
584
585void nilfs_bmap_commit_update(struct nilfs_bmap *bmap,
586 union nilfs_bmap_ptr_req *oldreq,
587 union nilfs_bmap_ptr_req *newreq)
588{
589 bmap->b_pops->bpop_commit_end_ptr(bmap, oldreq);
590 bmap->b_pops->bpop_commit_alloc_ptr(bmap, newreq);
591}
592
593void nilfs_bmap_abort_update(struct nilfs_bmap *bmap,
594 union nilfs_bmap_ptr_req *oldreq,
595 union nilfs_bmap_ptr_req *newreq)
596{
597 bmap->b_pops->bpop_abort_end_ptr(bmap, oldreq);
598 bmap->b_pops->bpop_abort_alloc_ptr(bmap, newreq);
599}
600
601static int nilfs_bmap_translate_v(const struct nilfs_bmap *bmap, __u64 ptr,
602 __u64 *ptrp)
603{
604 sector_t blocknr;
605 int ret;
606
607 ret = nilfs_dat_translate(nilfs_bmap_get_dat(bmap), ptr, &blocknr);
608 if (ret < 0)
609 return ret;
610 if (ptrp != NULL)
611 *ptrp = blocknr;
612 return 0;
613}
614
615static int nilfs_bmap_prepare_alloc_p(struct nilfs_bmap *bmap,
616 union nilfs_bmap_ptr_req *req)
617{
618 /* ignore target ptr */
619 req->bpr_ptr = bmap->b_last_allocated_ptr++;
620 return 0;
621}
622
623static void nilfs_bmap_commit_alloc_p(struct nilfs_bmap *bmap,
624 union nilfs_bmap_ptr_req *req)
625{
626 /* do nothing */
627}
628
629static void nilfs_bmap_abort_alloc_p(struct nilfs_bmap *bmap,
630 union nilfs_bmap_ptr_req *req)
631{
632 bmap->b_last_allocated_ptr--;
633}
634
635static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_v = {
636 .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_v,
637 .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_v,
638 .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_v,
639 .bpop_prepare_start_ptr = nilfs_bmap_prepare_start_v,
640 .bpop_commit_start_ptr = nilfs_bmap_commit_start_v,
641 .bpop_abort_start_ptr = nilfs_bmap_abort_start_v,
642 .bpop_prepare_end_ptr = nilfs_bmap_prepare_end_v,
643 .bpop_commit_end_ptr = nilfs_bmap_commit_end_v,
644 .bpop_abort_end_ptr = nilfs_bmap_abort_end_v,
645
646 .bpop_translate = nilfs_bmap_translate_v,
647};
648
649static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_vmdt = {
650 .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_v,
651 .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_v,
652 .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_v,
653 .bpop_prepare_start_ptr = nilfs_bmap_prepare_start_v,
654 .bpop_commit_start_ptr = nilfs_bmap_commit_start_v,
655 .bpop_abort_start_ptr = nilfs_bmap_abort_start_v,
656 .bpop_prepare_end_ptr = nilfs_bmap_prepare_end_v,
657 .bpop_commit_end_ptr = nilfs_bmap_commit_end_vmdt,
658 .bpop_abort_end_ptr = nilfs_bmap_abort_end_v,
659
660 .bpop_translate = nilfs_bmap_translate_v,
661};
662
663static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_p = {
664 .bpop_prepare_alloc_ptr = nilfs_bmap_prepare_alloc_p,
665 .bpop_commit_alloc_ptr = nilfs_bmap_commit_alloc_p,
666 .bpop_abort_alloc_ptr = nilfs_bmap_abort_alloc_p,
667 .bpop_prepare_start_ptr = NULL,
668 .bpop_commit_start_ptr = NULL,
669 .bpop_abort_start_ptr = NULL,
670 .bpop_prepare_end_ptr = NULL,
671 .bpop_commit_end_ptr = NULL,
672 .bpop_abort_end_ptr = NULL,
673
674 .bpop_translate = NULL,
675};
676
677static const struct nilfs_bmap_ptr_operations nilfs_bmap_ptr_ops_gc = {
678 .bpop_prepare_alloc_ptr = NULL,
679 .bpop_commit_alloc_ptr = NULL,
680 .bpop_abort_alloc_ptr = NULL,
681 .bpop_prepare_start_ptr = NULL,
682 .bpop_commit_start_ptr = NULL,
683 .bpop_abort_start_ptr = NULL,
684 .bpop_prepare_end_ptr = NULL,
685 .bpop_commit_end_ptr = NULL,
686 .bpop_abort_end_ptr = NULL,
687
688 .bpop_translate = NULL,
689};
690
691/**
692 * nilfs_bmap_read - read a bmap from an inode
693 * @bmap: bmap
694 * @raw_inode: on-disk inode
695 *
696 * Description: nilfs_bmap_read() initializes the bmap @bmap.
697 *
698 * Return Value: On success, 0 is returned. On error, the following negative
699 * error code is returned.
700 *
701 * %-ENOMEM - Insufficient amount of memory available.
702 */
703int nilfs_bmap_read(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
704{
705 if (raw_inode == NULL)
706 memset(bmap->b_u.u_data, 0, NILFS_BMAP_SIZE);
707 else
708 memcpy(bmap->b_u.u_data, raw_inode->i_bmap, NILFS_BMAP_SIZE);
709
710 init_rwsem(&bmap->b_sem);
711 bmap->b_state = 0;
712 bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
713 switch (bmap->b_inode->i_ino) {
714 case NILFS_DAT_INO:
715 bmap->b_pops = &nilfs_bmap_ptr_ops_p;
716 bmap->b_last_allocated_key = 0; /* XXX: use macro */
717 bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT;
718 break;
719 case NILFS_CPFILE_INO:
720 case NILFS_SUFILE_INO:
721 bmap->b_pops = &nilfs_bmap_ptr_ops_vmdt;
722 bmap->b_last_allocated_key = 0; /* XXX: use macro */
723 bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
724 break;
725 default:
726 bmap->b_pops = &nilfs_bmap_ptr_ops_v;
727 bmap->b_last_allocated_key = 0; /* XXX: use macro */
728 bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
729 break;
730 }
731
732 return (bmap->b_u.u_flags & NILFS_BMAP_LARGE) ?
733 nilfs_btree_init(bmap,
734 NILFS_BMAP_LARGE_LOW,
735 NILFS_BMAP_LARGE_HIGH) :
736 nilfs_direct_init(bmap,
737 NILFS_BMAP_SMALL_LOW,
738 NILFS_BMAP_SMALL_HIGH);
739}
740
741/**
742 * nilfs_bmap_write - write back a bmap to an inode
743 * @bmap: bmap
744 * @raw_inode: on-disk inode
745 *
746 * Description: nilfs_bmap_write() stores @bmap in @raw_inode.
747 */
748void nilfs_bmap_write(struct nilfs_bmap *bmap, struct nilfs_inode *raw_inode)
749{
750 down_write(&bmap->b_sem);
751 memcpy(raw_inode->i_bmap, bmap->b_u.u_data,
752 NILFS_INODE_BMAP_SIZE * sizeof(__le64));
753 if (bmap->b_inode->i_ino == NILFS_DAT_INO)
754 bmap->b_last_allocated_ptr = NILFS_BMAP_NEW_PTR_INIT;
755
756 up_write(&bmap->b_sem);
757}
758
759void nilfs_bmap_init_gc(struct nilfs_bmap *bmap)
760{
761 memset(&bmap->b_u, 0, NILFS_BMAP_SIZE);
762 init_rwsem(&bmap->b_sem);
763 bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
764 bmap->b_pops = &nilfs_bmap_ptr_ops_gc;
765 bmap->b_last_allocated_key = 0;
766 bmap->b_last_allocated_ptr = NILFS_BMAP_INVALID_PTR;
767 bmap->b_state = 0;
768 nilfs_btree_init_gc(bmap);
769}
770
771void nilfs_bmap_init_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
772{
773 memcpy(gcbmap, bmap, sizeof(union nilfs_bmap_union));
774 init_rwsem(&gcbmap->b_sem);
775 gcbmap->b_inode = &NILFS_BMAP_I(gcbmap)->vfs_inode;
776}
777
778void nilfs_bmap_commit_gcdat(struct nilfs_bmap *gcbmap, struct nilfs_bmap *bmap)
779{
780 memcpy(bmap, gcbmap, sizeof(union nilfs_bmap_union));
781 init_rwsem(&bmap->b_sem);
782 bmap->b_inode = &NILFS_BMAP_I(bmap)->vfs_inode;
783}
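Since nilfs_bmap_lookup() and nilfs_bmap_insert() take the bmap semaphore
themselves, a caller only strings them together. A small sketch of the
lookup-then-insert pattern; key 10 and new_ptr are illustrative values:

    unsigned long blocknr;
    int err;

    err = nilfs_bmap_lookup(bmap, 10, &blocknr);    /* -ENOENT if unmapped */
    if (err == -ENOENT)
        /* On insert the bmap converts itself from a direct map to a
         * B-tree once it outgrows NILFS_BMAP_SMALL_HIGH. */
        err = nilfs_bmap_insert(bmap, 10, new_ptr);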
diff --git a/fs/nilfs2/bmap.h b/fs/nilfs2/bmap.h
new file mode 100644
index 000000000000..4f2708abb1ba
--- /dev/null
+++ b/fs/nilfs2/bmap.h
@@ -0,0 +1,244 @@
1/*
2 * bmap.h - NILFS block mapping.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_BMAP_H
24#define _NILFS_BMAP_H
25
26#include <linux/types.h>
27#include <linux/fs.h>
28#include <linux/buffer_head.h>
29#include <linux/nilfs2_fs.h>
30#include "alloc.h"
31
32#define NILFS_BMAP_INVALID_PTR 0
33
34#define nilfs_bmap_dkey_to_key(dkey) le64_to_cpu(dkey)
35#define nilfs_bmap_key_to_dkey(key) cpu_to_le64(key)
36#define nilfs_bmap_dptr_to_ptr(dptr) le64_to_cpu(dptr)
37#define nilfs_bmap_ptr_to_dptr(ptr) cpu_to_le64(ptr)
38
39#define nilfs_bmap_keydiff_abs(diff) ((diff) < 0 ? -(diff) : (diff))
40
41
42struct nilfs_bmap;
43
44/**
45 * union nilfs_bmap_ptr_req - request for bmap ptr
46 * @bpr_ptr: bmap pointer
47 * @bpr_req: request for persistent allocator
48 */
49union nilfs_bmap_ptr_req {
50 __u64 bpr_ptr;
51 struct nilfs_palloc_req bpr_req;
52};
53
54/**
55 * struct nilfs_bmap_stats - bmap statistics
56 * @bs_nblocks: number of blocks created or deleted
57 */
58struct nilfs_bmap_stats {
59 unsigned int bs_nblocks;
60};
61
62/**
63 * struct nilfs_bmap_operations - bmap operation table
64 */
65struct nilfs_bmap_operations {
66 int (*bop_lookup)(const struct nilfs_bmap *, __u64, int, __u64 *);
67 int (*bop_insert)(struct nilfs_bmap *, __u64, __u64);
68 int (*bop_delete)(struct nilfs_bmap *, __u64);
69 void (*bop_clear)(struct nilfs_bmap *);
70
71 int (*bop_propagate)(const struct nilfs_bmap *, struct buffer_head *);
72 void (*bop_lookup_dirty_buffers)(struct nilfs_bmap *,
73 struct list_head *);
74
75 int (*bop_assign)(struct nilfs_bmap *,
76 struct buffer_head **,
77 sector_t,
78 union nilfs_binfo *);
79 int (*bop_mark)(struct nilfs_bmap *, __u64, int);
80
81 /* The following functions are internal use only. */
82 int (*bop_last_key)(const struct nilfs_bmap *, __u64 *);
83 int (*bop_check_insert)(const struct nilfs_bmap *, __u64);
84 int (*bop_check_delete)(struct nilfs_bmap *, __u64);
85 int (*bop_gather_data)(struct nilfs_bmap *, __u64 *, __u64 *, int);
86};
87
88
89/**
90 * struct nilfs_bmap_ptr_operations - bmap ptr operation table
91 */
92struct nilfs_bmap_ptr_operations {
93 int (*bpop_prepare_alloc_ptr)(struct nilfs_bmap *,
94 union nilfs_bmap_ptr_req *);
95 void (*bpop_commit_alloc_ptr)(struct nilfs_bmap *,
96 union nilfs_bmap_ptr_req *);
97 void (*bpop_abort_alloc_ptr)(struct nilfs_bmap *,
98 union nilfs_bmap_ptr_req *);
99 int (*bpop_prepare_start_ptr)(struct nilfs_bmap *,
100 union nilfs_bmap_ptr_req *);
101 void (*bpop_commit_start_ptr)(struct nilfs_bmap *,
102 union nilfs_bmap_ptr_req *,
103 sector_t);
104 void (*bpop_abort_start_ptr)(struct nilfs_bmap *,
105 union nilfs_bmap_ptr_req *);
106 int (*bpop_prepare_end_ptr)(struct nilfs_bmap *,
107 union nilfs_bmap_ptr_req *);
108 void (*bpop_commit_end_ptr)(struct nilfs_bmap *,
109 union nilfs_bmap_ptr_req *);
110 void (*bpop_abort_end_ptr)(struct nilfs_bmap *,
111 union nilfs_bmap_ptr_req *);
112
113 int (*bpop_translate)(const struct nilfs_bmap *, __u64, __u64 *);
114};
115
116
117#define NILFS_BMAP_SIZE (NILFS_INODE_BMAP_SIZE * sizeof(__le64))
118#define NILFS_BMAP_KEY_BIT (sizeof(unsigned long) * 8 /* CHAR_BIT */)
119#define NILFS_BMAP_NEW_PTR_INIT \
120 (1UL << (sizeof(unsigned long) * 8 /* CHAR_BIT */ - 1))
121
122static inline int nilfs_bmap_is_new_ptr(unsigned long ptr)
123{
124 return !!(ptr & NILFS_BMAP_NEW_PTR_INIT);
125}
126
127
128/**
129 * struct nilfs_bmap - bmap structure
130 * @b_u: raw data
131 * @b_sem: semaphore
132 * @b_inode: owner of bmap
133 * @b_ops: bmap operation table
134 * @b_pops: bmap ptr operation table
135 * @b_low: low watermark of conversion
136 * @b_high: high watermark of conversion
137 * @b_last_allocated_key: last allocated key for data block
138 * @b_last_allocated_ptr: last allocated ptr for data block
139 * @b_state: state
140 */
141struct nilfs_bmap {
142 union {
143 __u8 u_flags;
144 __le64 u_data[NILFS_BMAP_SIZE / sizeof(__le64)];
145 } b_u;
146 struct rw_semaphore b_sem;
147 struct inode *b_inode;
148 const struct nilfs_bmap_operations *b_ops;
149 const struct nilfs_bmap_ptr_operations *b_pops;
150 __u64 b_low;
151 __u64 b_high;
152 __u64 b_last_allocated_key;
153 __u64 b_last_allocated_ptr;
154 int b_state;
155};
156
157/* state */
158#define NILFS_BMAP_DIRTY 0x00000001
159
160
161int nilfs_bmap_test_and_clear_dirty(struct nilfs_bmap *);
162int nilfs_bmap_read(struct nilfs_bmap *, struct nilfs_inode *);
163void nilfs_bmap_write(struct nilfs_bmap *, struct nilfs_inode *);
164int nilfs_bmap_lookup(struct nilfs_bmap *, unsigned long, unsigned long *);
165int nilfs_bmap_insert(struct nilfs_bmap *, unsigned long, unsigned long);
166int nilfs_bmap_delete(struct nilfs_bmap *, unsigned long);
167int nilfs_bmap_last_key(struct nilfs_bmap *, unsigned long *);
168int nilfs_bmap_truncate(struct nilfs_bmap *, unsigned long);
169void nilfs_bmap_clear(struct nilfs_bmap *);
170int nilfs_bmap_propagate(struct nilfs_bmap *, struct buffer_head *);
171void nilfs_bmap_lookup_dirty_buffers(struct nilfs_bmap *, struct list_head *);
172int nilfs_bmap_assign(struct nilfs_bmap *, struct buffer_head **,
173 unsigned long, union nilfs_binfo *);
174int nilfs_bmap_lookup_at_level(struct nilfs_bmap *, __u64, int, __u64 *);
175int nilfs_bmap_mark(struct nilfs_bmap *, __u64, int);
176
177void nilfs_bmap_init_gc(struct nilfs_bmap *);
178void nilfs_bmap_init_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
179void nilfs_bmap_commit_gcdat(struct nilfs_bmap *, struct nilfs_bmap *);
180
181
182/*
183 * Internal use only
184 */
185
186int nilfs_bmap_move_v(const struct nilfs_bmap *, __u64, sector_t);
187int nilfs_bmap_mark_dirty(const struct nilfs_bmap *, __u64);
188
189
190__u64 nilfs_bmap_data_get_key(const struct nilfs_bmap *,
191 const struct buffer_head *);
192
193__u64 nilfs_bmap_find_target_seq(const struct nilfs_bmap *, __u64);
194__u64 nilfs_bmap_find_target_in_group(const struct nilfs_bmap *);
195
196int nilfs_bmap_prepare_update(struct nilfs_bmap *,
197 union nilfs_bmap_ptr_req *,
198 union nilfs_bmap_ptr_req *);
199void nilfs_bmap_commit_update(struct nilfs_bmap *,
200 union nilfs_bmap_ptr_req *,
201 union nilfs_bmap_ptr_req *);
202void nilfs_bmap_abort_update(struct nilfs_bmap *,
203 union nilfs_bmap_ptr_req *,
204 union nilfs_bmap_ptr_req *);
205
206void nilfs_bmap_add_blocks(const struct nilfs_bmap *, int);
207void nilfs_bmap_sub_blocks(const struct nilfs_bmap *, int);
208
209
210int nilfs_bmap_get_block(const struct nilfs_bmap *, __u64,
211 struct buffer_head **);
212void nilfs_bmap_put_block(const struct nilfs_bmap *, struct buffer_head *);
213int nilfs_bmap_get_new_block(const struct nilfs_bmap *, __u64,
214 struct buffer_head **);
215void nilfs_bmap_delete_block(const struct nilfs_bmap *, struct buffer_head *);
216
217
218/* Assume that bmap semaphore is locked. */
219static inline int nilfs_bmap_dirty(const struct nilfs_bmap *bmap)
220{
221 return !!(bmap->b_state & NILFS_BMAP_DIRTY);
222}
223
224/* Assume that bmap semaphore is locked. */
225static inline void nilfs_bmap_set_dirty(struct nilfs_bmap *bmap)
226{
227 bmap->b_state |= NILFS_BMAP_DIRTY;
228}
229
230/* Assume that bmap semaphore is locked. */
231static inline void nilfs_bmap_clear_dirty(struct nilfs_bmap *bmap)
232{
233 bmap->b_state &= ~NILFS_BMAP_DIRTY;
234}
235
236
237#define NILFS_BMAP_LARGE 0x1
238
239#define NILFS_BMAP_SMALL_LOW NILFS_DIRECT_KEY_MIN
240#define NILFS_BMAP_SMALL_HIGH NILFS_DIRECT_KEY_MAX
241#define NILFS_BMAP_LARGE_LOW NILFS_BTREE_ROOT_NCHILDREN_MAX
242#define NILFS_BMAP_LARGE_HIGH NILFS_BTREE_KEY_MAX
243
244#endif /* _NILFS_BMAP_H */
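nilfs_bmap_prepare_update(), nilfs_bmap_commit_update(), and
nilfs_bmap_abort_update() declared above pair the release of an old pointer with
the allocation of its replacement in one two-phase step. A sketch of the intended
sequence; old_ptr and the update_succeeded flag are illustrative:

    union nilfs_bmap_ptr_req oldreq, newreq;
    int err;

    oldreq.bpr_ptr = old_ptr;           /* pointer being replaced */
    err = nilfs_bmap_prepare_update(bmap, &oldreq, &newreq);
    if (err)
        return err;                     /* prepare already undid itself */

    if (update_succeeded)               /* hypothetical outcome flag */
        nilfs_bmap_commit_update(bmap, &oldreq, &newreq);
    else
        nilfs_bmap_abort_update(bmap, &oldreq, &newreq);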
diff --git a/fs/nilfs2/bmap_union.h b/fs/nilfs2/bmap_union.h
new file mode 100644
index 000000000000..d41509bff47b
--- /dev/null
+++ b/fs/nilfs2/bmap_union.h
@@ -0,0 +1,42 @@
1/*
2 * bmap_union.h - NILFS block mapping.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_BMAP_UNION_H
24#define _NILFS_BMAP_UNION_H
25
26#include "bmap.h"
27#include "direct.h"
28#include "btree.h"
29
30/**
31 * nilfs_bmap_union - union of the bmap, direct-map, and B-tree structures
32 * @bi_bmap: bmap structure
33 * @bi_direct: direct map structure
34 * @bi_btree: B-tree structure
35 */
36union nilfs_bmap_union {
37 struct nilfs_bmap bi_bmap;
38 struct nilfs_direct bi_direct;
39 struct nilfs_btree bi_btree;
40};
41
42#endif /* _NILFS_BMAP_UNION_H */
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
new file mode 100644
index 000000000000..4cc07b2c30e0
--- /dev/null
+++ b/fs/nilfs2/btnode.c
@@ -0,0 +1,316 @@
1/*
2 * btnode.c - NILFS B-tree node cache
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * This file was originally written by Seiji Kihara <kihara@osrg.net>
21 * and fully revised by Ryusuke Konishi <ryusuke@osrg.net> for
22 * stabilization and simplification.
23 *
24 */
25
26#include <linux/types.h>
27#include <linux/buffer_head.h>
28#include <linux/mm.h>
29#include <linux/backing-dev.h>
30#include "nilfs.h"
31#include "mdt.h"
32#include "dat.h"
33#include "page.h"
34#include "btnode.h"
35
36
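/*
 * The B-tree node cache is an address_space with no backing inode of
 * its own, so the mapping fields that inode_init_once() would normally
 * set up (radix tree, locks, i_mmap trees) are initialized by hand
 * below.
 */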
37void nilfs_btnode_cache_init_once(struct address_space *btnc)
38{
39 INIT_RADIX_TREE(&btnc->page_tree, GFP_ATOMIC);
40 spin_lock_init(&btnc->tree_lock);
41 INIT_LIST_HEAD(&btnc->private_list);
42 spin_lock_init(&btnc->private_lock);
43
44 spin_lock_init(&btnc->i_mmap_lock);
45 INIT_RAW_PRIO_TREE_ROOT(&btnc->i_mmap);
46 INIT_LIST_HEAD(&btnc->i_mmap_nonlinear);
47}
48
49static struct address_space_operations def_btnode_aops;
50
51void nilfs_btnode_cache_init(struct address_space *btnc)
52{
53	btnc->host = NULL; /* can this safely be set to the host inode? */
54 btnc->flags = 0;
55 mapping_set_gfp_mask(btnc, GFP_NOFS);
56 btnc->assoc_mapping = NULL;
57 btnc->backing_dev_info = &default_backing_dev_info;
58 btnc->a_ops = &def_btnode_aops;
59}
60
61void nilfs_btnode_cache_clear(struct address_space *btnc)
62{
63 invalidate_mapping_pages(btnc, 0, -1);
64 truncate_inode_pages(btnc, 0);
65}
66
67int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr,
68 sector_t pblocknr, struct buffer_head **pbh,
69 int newblk)
70{
71 struct buffer_head *bh;
72 struct inode *inode = NILFS_BTNC_I(btnc);
73 int err;
74
75 bh = nilfs_grab_buffer(inode, btnc, blocknr, 1 << BH_NILFS_Node);
76 if (unlikely(!bh))
77 return -ENOMEM;
78
79 err = -EEXIST; /* internal code */
80 if (newblk) {
81 if (unlikely(buffer_mapped(bh) || buffer_uptodate(bh) ||
82 buffer_dirty(bh))) {
83 brelse(bh);
84 BUG();
85 }
86 bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
87 bh->b_blocknr = blocknr;
88 set_buffer_mapped(bh);
89 set_buffer_uptodate(bh);
90 goto found;
91 }
92
93 if (buffer_uptodate(bh) || buffer_dirty(bh))
94 goto found;
95
96 if (pblocknr == 0) {
97 pblocknr = blocknr;
98 if (inode->i_ino != NILFS_DAT_INO) {
99 struct inode *dat =
100 nilfs_dat_inode(NILFS_I_NILFS(inode));
101
102 /* blocknr is a virtual block number */
103 err = nilfs_dat_translate(dat, blocknr, &pblocknr);
104 if (unlikely(err)) {
105 brelse(bh);
106 goto out_locked;
107 }
108 }
109 }
110 lock_buffer(bh);
111 if (buffer_uptodate(bh)) {
112 unlock_buffer(bh);
113 err = -EEXIST; /* internal code */
114 goto found;
115 }
116 set_buffer_mapped(bh);
117 bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
118 bh->b_blocknr = pblocknr; /* set block address for read */
119 bh->b_end_io = end_buffer_read_sync;
120 get_bh(bh);
121 submit_bh(READ, bh);
122 bh->b_blocknr = blocknr; /* set back to the given block address */
123 err = 0;
124found:
125 *pbh = bh;
126
127out_locked:
128 unlock_page(bh->b_page);
129 page_cache_release(bh->b_page);
130 return err;
131}
132
133int nilfs_btnode_get(struct address_space *btnc, __u64 blocknr,
134 sector_t pblocknr, struct buffer_head **pbh, int newblk)
135{
136 struct buffer_head *bh;
137 int err;
138
139 err = nilfs_btnode_submit_block(btnc, blocknr, pblocknr, pbh, newblk);
140 if (err == -EEXIST) /* internal code (cache hit) */
141 return 0;
142 if (unlikely(err))
143 return err;
144
145 bh = *pbh;
146 wait_on_buffer(bh);
147 if (!buffer_uptodate(bh)) {
148 brelse(bh);
149 return -EIO;
150 }
151 return 0;
152}
153
154/**
155 * nilfs_btnode_delete - delete B-tree node buffer
156 * @bh: buffer to be deleted
157 *
158 * nilfs_btnode_delete() invalidates the specified buffer and deletes the
159 * page containing the buffer once the page is no longer busy.
160 */
161void nilfs_btnode_delete(struct buffer_head *bh)
162{
163 struct address_space *mapping;
164 struct page *page = bh->b_page;
165 pgoff_t index = page_index(page);
166 int still_dirty;
167
168 page_cache_get(page);
169 lock_page(page);
170 wait_on_page_writeback(page);
171
172 nilfs_forget_buffer(bh);
173 still_dirty = PageDirty(page);
174 mapping = page->mapping;
175 unlock_page(page);
176 page_cache_release(page);
177
178 if (!still_dirty && mapping)
179 invalidate_inode_pages2_range(mapping, index, index);
180}
181
182/**
183 * nilfs_btnode_prepare_change_key - prepare to move a block to a new key
184 * Prepares to move the contents of the block at the old key to the new
185 * key.  The old buffer is not removed and may be reused as the new buffer.
186 * Returns -ENOMEM on memory allocation failure and -EIO on disk read
187 * failure.
188 */
189int nilfs_btnode_prepare_change_key(struct address_space *btnc,
190 struct nilfs_btnode_chkey_ctxt *ctxt)
191{
192 struct buffer_head *obh, *nbh;
193 struct inode *inode = NILFS_BTNC_I(btnc);
194 __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
195 int err;
196
197 if (oldkey == newkey)
198 return 0;
199
200 obh = ctxt->bh;
201 ctxt->newbh = NULL;
202
203 if (inode->i_blkbits == PAGE_CACHE_SHIFT) {
204 lock_page(obh->b_page);
205 /*
206 * We cannot call radix_tree_preload for the kernels older
207 * than 2.6.23, because it is not exported for modules.
208 */
209 err = radix_tree_preload(GFP_NOFS & ~__GFP_HIGHMEM);
210 if (err)
211 goto failed_unlock;
212 /* BUG_ON(oldkey != obh->b_page->index); */
213 if (unlikely(oldkey != obh->b_page->index))
214 NILFS_PAGE_BUG(obh->b_page,
215 "invalid oldkey %lld (newkey=%lld)",
216 (unsigned long long)oldkey,
217 (unsigned long long)newkey);
218
219retry:
220 spin_lock_irq(&btnc->tree_lock);
221 err = radix_tree_insert(&btnc->page_tree, newkey, obh->b_page);
222 spin_unlock_irq(&btnc->tree_lock);
223		/*
224		 * Note: page->index does not change to newkey until
225		 * nilfs_btnode_commit_change_key() is called.  The page
226		 * lock is held to protect the page in this intermediate
227		 * state.
228		 */
229 radix_tree_preload_end();
230 if (!err)
231 return 0;
232 else if (err != -EEXIST)
233 goto failed_unlock;
234
235 err = invalidate_inode_pages2_range(btnc, newkey, newkey);
236 if (!err)
237 goto retry;
238 /* fallback to copy mode */
239 unlock_page(obh->b_page);
240 }
241
242 err = nilfs_btnode_get(btnc, newkey, 0, &nbh, 1);
243 if (likely(!err)) {
244 BUG_ON(nbh == obh);
245 ctxt->newbh = nbh;
246 }
247 return err;
248
249 failed_unlock:
250 unlock_page(obh->b_page);
251 return err;
252}
253
254/**
255 * nilfs_btnode_commit_change_key - commit the change-key operation
256 * Commits the operation prepared by nilfs_btnode_prepare_change_key().
257 */
258void nilfs_btnode_commit_change_key(struct address_space *btnc,
259 struct nilfs_btnode_chkey_ctxt *ctxt)
260{
261 struct buffer_head *obh = ctxt->bh, *nbh = ctxt->newbh;
262 __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
263 struct page *opage;
264
265 if (oldkey == newkey)
266 return;
267
268 if (nbh == NULL) { /* blocksize == pagesize */
269 opage = obh->b_page;
270 if (unlikely(oldkey != opage->index))
271 NILFS_PAGE_BUG(opage,
272 "invalid oldkey %lld (newkey=%lld)",
273 (unsigned long long)oldkey,
274 (unsigned long long)newkey);
275 if (!test_set_buffer_dirty(obh) && TestSetPageDirty(opage))
276 BUG();
277
278 spin_lock_irq(&btnc->tree_lock);
279 radix_tree_delete(&btnc->page_tree, oldkey);
280 radix_tree_tag_set(&btnc->page_tree, newkey,
281 PAGECACHE_TAG_DIRTY);
282 spin_unlock_irq(&btnc->tree_lock);
283
284 opage->index = obh->b_blocknr = newkey;
285 unlock_page(opage);
286 } else {
287 nilfs_copy_buffer(nbh, obh);
288 nilfs_btnode_mark_dirty(nbh);
289
290 nbh->b_blocknr = newkey;
291 ctxt->bh = nbh;
292 nilfs_btnode_delete(obh); /* will decrement bh->b_count */
293 }
294}
295
296/**
297 * nilfs_btnode_abort_change_key - abort the change-key operation
298 * Aborts the operation prepared by nilfs_btnode_prepare_change_key().
299 */
300void nilfs_btnode_abort_change_key(struct address_space *btnc,
301 struct nilfs_btnode_chkey_ctxt *ctxt)
302{
303 struct buffer_head *nbh = ctxt->newbh;
304 __u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
305
306 if (oldkey == newkey)
307 return;
308
309 if (nbh == NULL) { /* blocksize == pagesize */
310 spin_lock_irq(&btnc->tree_lock);
311 radix_tree_delete(&btnc->page_tree, newkey);
312 spin_unlock_irq(&btnc->tree_lock);
313 unlock_page(ctxt->bh->b_page);
314 } else
315 brelse(nbh);
316}
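The last three functions form a prepare/commit/abort protocol around struct nilfs_btnode_chkey_ctxt. A sketch of the expected calling sequence (hypothetical caller, error handling abbreviated):

	struct nilfs_btnode_chkey_ctxt ctxt = {
		.oldkey = oldkey,
		.newkey = newkey,
		.bh = bh,
	};
	int err;

	err = nilfs_btnode_prepare_change_key(btnc, &ctxt);
	if (err)
		return err;	/* nothing to undo yet */
	/* ... if a later step fails, call
	 * nilfs_btnode_abort_change_key(btnc, &ctxt) instead ... */
	nilfs_btnode_commit_change_key(btnc, &ctxt);
	bh = ctxt.bh;	/* commit may substitute a new buffer head (copy mode) */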
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
new file mode 100644
index 000000000000..35faa86444a7
--- /dev/null
+++ b/fs/nilfs2/btnode.h
@@ -0,0 +1,58 @@
1/*
2 * btnode.h - NILFS B-tree node cache
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Seiji Kihara <kihara@osrg.net>
21 * Revised by Ryusuke Konishi <ryusuke@osrg.net>
22 */
23
24#ifndef _NILFS_BTNODE_H
25#define _NILFS_BTNODE_H
26
27#include <linux/types.h>
28#include <linux/buffer_head.h>
29#include <linux/fs.h>
30#include <linux/backing-dev.h>
31
32
33struct nilfs_btnode_chkey_ctxt {
34 __u64 oldkey;
35 __u64 newkey;
36 struct buffer_head *bh;
37 struct buffer_head *newbh;
38};
39
40void nilfs_btnode_cache_init_once(struct address_space *);
41void nilfs_btnode_cache_init(struct address_space *);
42void nilfs_btnode_cache_clear(struct address_space *);
43int nilfs_btnode_submit_block(struct address_space *, __u64, sector_t,
44 struct buffer_head **, int);
45int nilfs_btnode_get(struct address_space *, __u64, sector_t,
46 struct buffer_head **, int);
47void nilfs_btnode_delete(struct buffer_head *);
48int nilfs_btnode_prepare_change_key(struct address_space *,
49 struct nilfs_btnode_chkey_ctxt *);
50void nilfs_btnode_commit_change_key(struct address_space *,
51 struct nilfs_btnode_chkey_ctxt *);
52void nilfs_btnode_abort_change_key(struct address_space *,
53 struct nilfs_btnode_chkey_ctxt *);
54
55#define nilfs_btnode_mark_dirty(bh) nilfs_mark_buffer_dirty(bh)
56
57
58#endif /* _NILFS_BTNODE_H */
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
new file mode 100644
index 000000000000..6b37a2767293
--- /dev/null
+++ b/fs/nilfs2/btree.c
@@ -0,0 +1,2269 @@
1/*
2 * btree.c - NILFS B-tree.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/slab.h>
24#include <linux/string.h>
25#include <linux/errno.h>
26#include <linux/pagevec.h>
27#include "nilfs.h"
28#include "page.h"
29#include "btnode.h"
30#include "btree.h"
31#include "alloc.h"
32
33/**
34 * struct nilfs_btree_path - A path on which B-tree operations are executed
35 * @bp_bh: buffer head of node block
36 * @bp_sib_bh: buffer head of sibling node block
37 * @bp_index: index of child node
38 * @bp_oldreq: ptr end request for old ptr
39 * @bp_newreq: ptr alloc request for new ptr
 * @bp_ctxt: change-key context for the B-tree node cache
40 * @bp_op: rebalance operation
41 */
42struct nilfs_btree_path {
43 struct buffer_head *bp_bh;
44 struct buffer_head *bp_sib_bh;
45 int bp_index;
46 union nilfs_bmap_ptr_req bp_oldreq;
47 union nilfs_bmap_ptr_req bp_newreq;
48 struct nilfs_btnode_chkey_ctxt bp_ctxt;
49 void (*bp_op)(struct nilfs_btree *, struct nilfs_btree_path *,
50 int, __u64 *, __u64 *);
51};
52
53/*
54 * B-tree path operations
55 */
56
57static struct kmem_cache *nilfs_btree_path_cache;
58
59int __init nilfs_btree_path_cache_init(void)
60{
61 nilfs_btree_path_cache =
62 kmem_cache_create("nilfs2_btree_path_cache",
63 sizeof(struct nilfs_btree_path) *
64 NILFS_BTREE_LEVEL_MAX, 0, 0, NULL);
65 return (nilfs_btree_path_cache != NULL) ? 0 : -ENOMEM;
66}
67
68void nilfs_btree_path_cache_destroy(void)
69{
70 kmem_cache_destroy(nilfs_btree_path_cache);
71}
72
73static inline struct nilfs_btree_path *
74nilfs_btree_alloc_path(const struct nilfs_btree *btree)
75{
76 return (struct nilfs_btree_path *)
77 kmem_cache_alloc(nilfs_btree_path_cache, GFP_NOFS);
78}
79
80static inline void nilfs_btree_free_path(const struct nilfs_btree *btree,
81 struct nilfs_btree_path *path)
82{
83 kmem_cache_free(nilfs_btree_path_cache, path);
84}
85
86static void nilfs_btree_init_path(const struct nilfs_btree *btree,
87 struct nilfs_btree_path *path)
88{
89 int level;
90
91 for (level = NILFS_BTREE_LEVEL_DATA;
92 level < NILFS_BTREE_LEVEL_MAX;
93 level++) {
94 path[level].bp_bh = NULL;
95 path[level].bp_sib_bh = NULL;
96 path[level].bp_index = 0;
97 path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
98 path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
99 path[level].bp_op = NULL;
100 }
101}
102
103static void nilfs_btree_clear_path(const struct nilfs_btree *btree,
104 struct nilfs_btree_path *path)
105{
106 int level;
107
108 for (level = NILFS_BTREE_LEVEL_DATA;
109 level < NILFS_BTREE_LEVEL_MAX;
110 level++) {
111 if (path[level].bp_bh != NULL) {
112 nilfs_bmap_put_block(&btree->bt_bmap,
113 path[level].bp_bh);
114 path[level].bp_bh = NULL;
115 }
116 /* sib_bh is released or deleted by prepare or commit
117 * operations. */
118 path[level].bp_sib_bh = NULL;
119 path[level].bp_index = 0;
120 path[level].bp_oldreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
121 path[level].bp_newreq.bpr_ptr = NILFS_BMAP_INVALID_PTR;
122 path[level].bp_op = NULL;
123 }
124}
125
126
127/*
128 * B-tree node operations
129 */
130
131static inline int
132nilfs_btree_node_get_flags(const struct nilfs_btree *btree,
133 const struct nilfs_btree_node *node)
134{
135 return node->bn_flags;
136}
137
138static inline void
139nilfs_btree_node_set_flags(struct nilfs_btree *btree,
140 struct nilfs_btree_node *node,
141 int flags)
142{
143 node->bn_flags = flags;
144}
145
146static inline int nilfs_btree_node_root(const struct nilfs_btree *btree,
147 const struct nilfs_btree_node *node)
148{
149 return nilfs_btree_node_get_flags(btree, node) & NILFS_BTREE_NODE_ROOT;
150}
151
152static inline int
153nilfs_btree_node_get_level(const struct nilfs_btree *btree,
154 const struct nilfs_btree_node *node)
155{
156 return node->bn_level;
157}
158
159static inline void
160nilfs_btree_node_set_level(struct nilfs_btree *btree,
161 struct nilfs_btree_node *node,
162 int level)
163{
164 node->bn_level = level;
165}
166
167static inline int
168nilfs_btree_node_get_nchildren(const struct nilfs_btree *btree,
169 const struct nilfs_btree_node *node)
170{
171 return le16_to_cpu(node->bn_nchildren);
172}
173
174static inline void
175nilfs_btree_node_set_nchildren(struct nilfs_btree *btree,
176 struct nilfs_btree_node *node,
177 int nchildren)
178{
179 node->bn_nchildren = cpu_to_le16(nchildren);
180}
181
182static inline int
183nilfs_btree_node_size(const struct nilfs_btree *btree)
184{
185 return 1 << btree->bt_bmap.b_inode->i_blkbits;
186}
187
188static inline int
189nilfs_btree_node_nchildren_min(const struct nilfs_btree *btree,
190 const struct nilfs_btree_node *node)
191{
192 return nilfs_btree_node_root(btree, node) ?
193 NILFS_BTREE_ROOT_NCHILDREN_MIN :
194 NILFS_BTREE_NODE_NCHILDREN_MIN(nilfs_btree_node_size(btree));
195}
196
197static inline int
198nilfs_btree_node_nchildren_max(const struct nilfs_btree *btree,
199 const struct nilfs_btree_node *node)
200{
201 return nilfs_btree_node_root(btree, node) ?
202 NILFS_BTREE_ROOT_NCHILDREN_MAX :
203 NILFS_BTREE_NODE_NCHILDREN_MAX(nilfs_btree_node_size(btree));
204}
205
206static inline __le64 *
207nilfs_btree_node_dkeys(const struct nilfs_btree *btree,
208 const struct nilfs_btree_node *node)
209{
210 return (__le64 *)((char *)(node + 1) +
211 (nilfs_btree_node_root(btree, node) ?
212 0 : NILFS_BTREE_NODE_EXTRA_PAD_SIZE));
213}
214
215static inline __le64 *
216nilfs_btree_node_dptrs(const struct nilfs_btree *btree,
217 const struct nilfs_btree_node *node)
218{
219 return (__le64 *)(nilfs_btree_node_dkeys(btree, node) +
220 nilfs_btree_node_nchildren_max(btree, node));
221}
222
223static inline __u64
224nilfs_btree_node_get_key(const struct nilfs_btree *btree,
225 const struct nilfs_btree_node *node, int index)
226{
227 return nilfs_bmap_dkey_to_key(*(nilfs_btree_node_dkeys(btree, node) +
228 index));
229}
230
231static inline void
232nilfs_btree_node_set_key(struct nilfs_btree *btree,
233 struct nilfs_btree_node *node, int index, __u64 key)
234{
235 *(nilfs_btree_node_dkeys(btree, node) + index) =
236 nilfs_bmap_key_to_dkey(key);
237}
238
239static inline __u64
240nilfs_btree_node_get_ptr(const struct nilfs_btree *btree,
241 const struct nilfs_btree_node *node,
242 int index)
243{
244 return nilfs_bmap_dptr_to_ptr(*(nilfs_btree_node_dptrs(btree, node) +
245 index));
246}
247
248static inline void
249nilfs_btree_node_set_ptr(struct nilfs_btree *btree,
250 struct nilfs_btree_node *node,
251 int index,
252 __u64 ptr)
253{
254 *(nilfs_btree_node_dptrs(btree, node) + index) =
255 nilfs_bmap_ptr_to_dptr(ptr);
256}
257
258static void nilfs_btree_node_init(struct nilfs_btree *btree,
259 struct nilfs_btree_node *node,
260 int flags, int level, int nchildren,
261 const __u64 *keys, const __u64 *ptrs)
262{
263 __le64 *dkeys;
264 __le64 *dptrs;
265 int i;
266
267 nilfs_btree_node_set_flags(btree, node, flags);
268 nilfs_btree_node_set_level(btree, node, level);
269 nilfs_btree_node_set_nchildren(btree, node, nchildren);
270
271 dkeys = nilfs_btree_node_dkeys(btree, node);
272 dptrs = nilfs_btree_node_dptrs(btree, node);
273 for (i = 0; i < nchildren; i++) {
274 dkeys[i] = nilfs_bmap_key_to_dkey(keys[i]);
275 dptrs[i] = nilfs_bmap_ptr_to_dptr(ptrs[i]);
276 }
277}
278
279/* Assume the buffer heads corresponding to left and right are locked. */
280static void nilfs_btree_node_move_left(struct nilfs_btree *btree,
281 struct nilfs_btree_node *left,
282 struct nilfs_btree_node *right,
283 int n)
284{
285 __le64 *ldkeys, *rdkeys;
286 __le64 *ldptrs, *rdptrs;
287 int lnchildren, rnchildren;
288
289 ldkeys = nilfs_btree_node_dkeys(btree, left);
290 ldptrs = nilfs_btree_node_dptrs(btree, left);
291 lnchildren = nilfs_btree_node_get_nchildren(btree, left);
292
293 rdkeys = nilfs_btree_node_dkeys(btree, right);
294 rdptrs = nilfs_btree_node_dptrs(btree, right);
295 rnchildren = nilfs_btree_node_get_nchildren(btree, right);
296
297 memcpy(ldkeys + lnchildren, rdkeys, n * sizeof(*rdkeys));
298 memcpy(ldptrs + lnchildren, rdptrs, n * sizeof(*rdptrs));
299 memmove(rdkeys, rdkeys + n, (rnchildren - n) * sizeof(*rdkeys));
300 memmove(rdptrs, rdptrs + n, (rnchildren - n) * sizeof(*rdptrs));
301
302 lnchildren += n;
303 rnchildren -= n;
304 nilfs_btree_node_set_nchildren(btree, left, lnchildren);
305 nilfs_btree_node_set_nchildren(btree, right, rnchildren);
306}
307
308/* Assume that the buffer heads corresponding to left and right are locked. */
309static void nilfs_btree_node_move_right(struct nilfs_btree *btree,
310 struct nilfs_btree_node *left,
311 struct nilfs_btree_node *right,
312 int n)
313{
314 __le64 *ldkeys, *rdkeys;
315 __le64 *ldptrs, *rdptrs;
316 int lnchildren, rnchildren;
317
318 ldkeys = nilfs_btree_node_dkeys(btree, left);
319 ldptrs = nilfs_btree_node_dptrs(btree, left);
320 lnchildren = nilfs_btree_node_get_nchildren(btree, left);
321
322 rdkeys = nilfs_btree_node_dkeys(btree, right);
323 rdptrs = nilfs_btree_node_dptrs(btree, right);
324 rnchildren = nilfs_btree_node_get_nchildren(btree, right);
325
326 memmove(rdkeys + n, rdkeys, rnchildren * sizeof(*rdkeys));
327 memmove(rdptrs + n, rdptrs, rnchildren * sizeof(*rdptrs));
328 memcpy(rdkeys, ldkeys + lnchildren - n, n * sizeof(*rdkeys));
329 memcpy(rdptrs, ldptrs + lnchildren - n, n * sizeof(*rdptrs));
330
331 lnchildren -= n;
332 rnchildren += n;
333 nilfs_btree_node_set_nchildren(btree, left, lnchildren);
334 nilfs_btree_node_set_nchildren(btree, right, rnchildren);
335}
336
337/* Assume that the buffer head corresponding to node is locked. */
338static void nilfs_btree_node_insert(struct nilfs_btree *btree,
339 struct nilfs_btree_node *node,
340 __u64 key, __u64 ptr, int index)
341{
342 __le64 *dkeys;
343 __le64 *dptrs;
344 int nchildren;
345
346 dkeys = nilfs_btree_node_dkeys(btree, node);
347 dptrs = nilfs_btree_node_dptrs(btree, node);
348 nchildren = nilfs_btree_node_get_nchildren(btree, node);
349 if (index < nchildren) {
350 memmove(dkeys + index + 1, dkeys + index,
351 (nchildren - index) * sizeof(*dkeys));
352 memmove(dptrs + index + 1, dptrs + index,
353 (nchildren - index) * sizeof(*dptrs));
354 }
355 dkeys[index] = nilfs_bmap_key_to_dkey(key);
356 dptrs[index] = nilfs_bmap_ptr_to_dptr(ptr);
357 nchildren++;
358 nilfs_btree_node_set_nchildren(btree, node, nchildren);
359}
360
361/* Assume that the buffer head corresponding to node is locked. */
362static void nilfs_btree_node_delete(struct nilfs_btree *btree,
363 struct nilfs_btree_node *node,
364 __u64 *keyp, __u64 *ptrp, int index)
365{
366 __u64 key;
367 __u64 ptr;
368 __le64 *dkeys;
369 __le64 *dptrs;
370 int nchildren;
371
372 dkeys = nilfs_btree_node_dkeys(btree, node);
373 dptrs = nilfs_btree_node_dptrs(btree, node);
374 key = nilfs_bmap_dkey_to_key(dkeys[index]);
375 ptr = nilfs_bmap_dptr_to_ptr(dptrs[index]);
376 nchildren = nilfs_btree_node_get_nchildren(btree, node);
377 if (keyp != NULL)
378 *keyp = key;
379 if (ptrp != NULL)
380 *ptrp = ptr;
381
382 if (index < nchildren - 1) {
383 memmove(dkeys + index, dkeys + index + 1,
384 (nchildren - index - 1) * sizeof(*dkeys));
385 memmove(dptrs + index, dptrs + index + 1,
386 (nchildren - index - 1) * sizeof(*dptrs));
387 }
388 nchildren--;
389 nilfs_btree_node_set_nchildren(btree, node, nchildren);
390}
391
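/*
 * nilfs_btnode_node_lookup below does a binary search for @key within
 * a node and returns nonzero iff the key was found, storing the slot
 * in *indexp.  On a miss, the index is adjusted by level: internal
 * nodes round down to the child whose range covers the key, while
 * leaf-level nodes round up to the insertion point.  For example, with
 * keys {2, 5, 9}, looking up 6 yields index 1 on an internal node but
 * index 2 on a leaf.
 */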
392static int nilfs_btree_node_lookup(const struct nilfs_btree *btree,
393 const struct nilfs_btree_node *node,
394 __u64 key, int *indexp)
395{
396 __u64 nkey;
397 int index, low, high, s;
398
399 /* binary search */
400 low = 0;
401 high = nilfs_btree_node_get_nchildren(btree, node) - 1;
402 index = 0;
403 s = 0;
404 while (low <= high) {
405 index = (low + high) / 2;
406 nkey = nilfs_btree_node_get_key(btree, node, index);
407 if (nkey == key) {
408 s = 0;
409 goto out;
410 } else if (nkey < key) {
411 low = index + 1;
412 s = -1;
413 } else {
414 high = index - 1;
415 s = 1;
416 }
417 }
418
419 /* adjust index */
420 if (nilfs_btree_node_get_level(btree, node) >
421 NILFS_BTREE_LEVEL_NODE_MIN) {
422 if ((s > 0) && (index > 0))
423 index--;
424 } else if (s < 0)
425 index++;
426
427 out:
428 *indexp = index;
429
430 return s == 0;
431}
432
433static inline struct nilfs_btree_node *
434nilfs_btree_get_root(const struct nilfs_btree *btree)
435{
436 return (struct nilfs_btree_node *)btree->bt_bmap.b_u.u_data;
437}
438
439static inline struct nilfs_btree_node *
440nilfs_btree_get_nonroot_node(const struct nilfs_btree *btree,
441 const struct nilfs_btree_path *path,
442 int level)
443{
444 return (struct nilfs_btree_node *)path[level].bp_bh->b_data;
445}
446
447static inline struct nilfs_btree_node *
448nilfs_btree_get_sib_node(const struct nilfs_btree *btree,
449 const struct nilfs_btree_path *path,
450 int level)
451{
452 return (struct nilfs_btree_node *)path[level].bp_sib_bh->b_data;
453}
454
455static inline int nilfs_btree_height(const struct nilfs_btree *btree)
456{
457 return nilfs_btree_node_get_level(btree, nilfs_btree_get_root(btree))
458 + 1;
459}
460
461static inline struct nilfs_btree_node *
462nilfs_btree_get_node(const struct nilfs_btree *btree,
463 const struct nilfs_btree_path *path,
464 int level)
465{
466 return (level == nilfs_btree_height(btree) - 1) ?
467 nilfs_btree_get_root(btree) :
468 nilfs_btree_get_nonroot_node(btree, path, level);
469}
470
471static int nilfs_btree_do_lookup(const struct nilfs_btree *btree,
472 struct nilfs_btree_path *path,
473 __u64 key, __u64 *ptrp, int minlevel)
474{
475 struct nilfs_btree_node *node;
476 __u64 ptr;
477 int level, index, found, ret;
478
479 node = nilfs_btree_get_root(btree);
480 level = nilfs_btree_node_get_level(btree, node);
481 if ((level < minlevel) ||
482 (nilfs_btree_node_get_nchildren(btree, node) <= 0))
483 return -ENOENT;
484
485 found = nilfs_btree_node_lookup(btree, node, key, &index);
486 ptr = nilfs_btree_node_get_ptr(btree, node, index);
487 path[level].bp_bh = NULL;
488 path[level].bp_index = index;
489
490 for (level--; level >= minlevel; level--) {
491 ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr,
492 &path[level].bp_bh);
493 if (ret < 0)
494 return ret;
495 node = nilfs_btree_get_nonroot_node(btree, path, level);
496 BUG_ON(level != nilfs_btree_node_get_level(btree, node));
497 if (!found)
498 found = nilfs_btree_node_lookup(btree, node, key,
499 &index);
500 else
501 index = 0;
502 if (index < nilfs_btree_node_nchildren_max(btree, node))
503 ptr = nilfs_btree_node_get_ptr(btree, node, index);
504 else {
505 WARN_ON(found || level != NILFS_BTREE_LEVEL_NODE_MIN);
506 /* insert */
507 ptr = NILFS_BMAP_INVALID_PTR;
508 }
509 path[level].bp_index = index;
510 }
511 if (!found)
512 return -ENOENT;
513
514 if (ptrp != NULL)
515 *ptrp = ptr;
516
517 return 0;
518}
519
520static int nilfs_btree_do_lookup_last(const struct nilfs_btree *btree,
521 struct nilfs_btree_path *path,
522 __u64 *keyp, __u64 *ptrp)
523{
524 struct nilfs_btree_node *node;
525 __u64 ptr;
526 int index, level, ret;
527
528 node = nilfs_btree_get_root(btree);
529 index = nilfs_btree_node_get_nchildren(btree, node) - 1;
530 if (index < 0)
531 return -ENOENT;
532 level = nilfs_btree_node_get_level(btree, node);
533 ptr = nilfs_btree_node_get_ptr(btree, node, index);
534 path[level].bp_bh = NULL;
535 path[level].bp_index = index;
536
537 for (level--; level > 0; level--) {
538 ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr,
539 &path[level].bp_bh);
540 if (ret < 0)
541 return ret;
542 node = nilfs_btree_get_nonroot_node(btree, path, level);
543 BUG_ON(level != nilfs_btree_node_get_level(btree, node));
544 index = nilfs_btree_node_get_nchildren(btree, node) - 1;
545 ptr = nilfs_btree_node_get_ptr(btree, node, index);
546 path[level].bp_index = index;
547 }
548
549 if (keyp != NULL)
550 *keyp = nilfs_btree_node_get_key(btree, node, index);
551 if (ptrp != NULL)
552 *ptrp = ptr;
553
554 return 0;
555}
556
557static int nilfs_btree_lookup(const struct nilfs_bmap *bmap,
558 __u64 key, int level, __u64 *ptrp)
559{
560 struct nilfs_btree *btree;
561 struct nilfs_btree_path *path;
562 __u64 ptr;
563 int ret;
564
565 btree = (struct nilfs_btree *)bmap;
566 path = nilfs_btree_alloc_path(btree);
567 if (path == NULL)
568 return -ENOMEM;
569 nilfs_btree_init_path(btree, path);
570
571 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level);
572
573 if (ptrp != NULL)
574 *ptrp = ptr;
575
576 nilfs_btree_clear_path(btree, path);
577 nilfs_btree_free_path(btree, path);
578
579 return ret;
580}
581
582static void nilfs_btree_promote_key(struct nilfs_btree *btree,
583 struct nilfs_btree_path *path,
584 int level, __u64 key)
585{
586 if (level < nilfs_btree_height(btree) - 1) {
587 do {
588 lock_buffer(path[level].bp_bh);
589 nilfs_btree_node_set_key(
590 btree,
591 nilfs_btree_get_nonroot_node(
592 btree, path, level),
593 path[level].bp_index, key);
594 if (!buffer_dirty(path[level].bp_bh))
595 nilfs_btnode_mark_dirty(path[level].bp_bh);
596 unlock_buffer(path[level].bp_bh);
597 } while ((path[level].bp_index == 0) &&
598 (++level < nilfs_btree_height(btree) - 1));
599 }
600
601 /* root */
602 if (level == nilfs_btree_height(btree) - 1) {
603 nilfs_btree_node_set_key(btree,
604 nilfs_btree_get_root(btree),
605 path[level].bp_index, key);
606 }
607}
608
609static void nilfs_btree_do_insert(struct nilfs_btree *btree,
610 struct nilfs_btree_path *path,
611 int level, __u64 *keyp, __u64 *ptrp)
612{
613 struct nilfs_btree_node *node;
614
615 if (level < nilfs_btree_height(btree) - 1) {
616 lock_buffer(path[level].bp_bh);
617 node = nilfs_btree_get_nonroot_node(btree, path, level);
618 nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
619 path[level].bp_index);
620 if (!buffer_dirty(path[level].bp_bh))
621 nilfs_btnode_mark_dirty(path[level].bp_bh);
622 unlock_buffer(path[level].bp_bh);
623
624 if (path[level].bp_index == 0)
625 nilfs_btree_promote_key(btree, path, level + 1,
626 nilfs_btree_node_get_key(
627 btree, node, 0));
628 } else {
629 node = nilfs_btree_get_root(btree);
630 nilfs_btree_node_insert(btree, node, *keyp, *ptrp,
631 path[level].bp_index);
632 }
633}
634
635static void nilfs_btree_carry_left(struct nilfs_btree *btree,
636 struct nilfs_btree_path *path,
637 int level, __u64 *keyp, __u64 *ptrp)
638{
639 struct nilfs_btree_node *node, *left;
640 int nchildren, lnchildren, n, move;
641
642 lock_buffer(path[level].bp_bh);
643 lock_buffer(path[level].bp_sib_bh);
644
645 node = nilfs_btree_get_nonroot_node(btree, path, level);
646 left = nilfs_btree_get_sib_node(btree, path, level);
647 nchildren = nilfs_btree_node_get_nchildren(btree, node);
648 lnchildren = nilfs_btree_node_get_nchildren(btree, left);
649 move = 0;
650
651 n = (nchildren + lnchildren + 1) / 2 - lnchildren;
652 if (n > path[level].bp_index) {
653 /* move insert point */
654 n--;
655 move = 1;
656 }
657
658 nilfs_btree_node_move_left(btree, left, node, n);
659
660 if (!buffer_dirty(path[level].bp_bh))
661 nilfs_btnode_mark_dirty(path[level].bp_bh);
662 if (!buffer_dirty(path[level].bp_sib_bh))
663 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
664
665 unlock_buffer(path[level].bp_bh);
666 unlock_buffer(path[level].bp_sib_bh);
667
668 nilfs_btree_promote_key(btree, path, level + 1,
669 nilfs_btree_node_get_key(btree, node, 0));
670
671 if (move) {
672 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
673 path[level].bp_bh = path[level].bp_sib_bh;
674 path[level].bp_sib_bh = NULL;
675 path[level].bp_index += lnchildren;
676 path[level + 1].bp_index--;
677 } else {
678 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
679 path[level].bp_sib_bh = NULL;
680 path[level].bp_index -= n;
681 }
682
683 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
684}
685
686static void nilfs_btree_carry_right(struct nilfs_btree *btree,
687 struct nilfs_btree_path *path,
688 int level, __u64 *keyp, __u64 *ptrp)
689{
690 struct nilfs_btree_node *node, *right;
691 int nchildren, rnchildren, n, move;
692
693 lock_buffer(path[level].bp_bh);
694 lock_buffer(path[level].bp_sib_bh);
695
696 node = nilfs_btree_get_nonroot_node(btree, path, level);
697 right = nilfs_btree_get_sib_node(btree, path, level);
698 nchildren = nilfs_btree_node_get_nchildren(btree, node);
699 rnchildren = nilfs_btree_node_get_nchildren(btree, right);
700 move = 0;
701
702 n = (nchildren + rnchildren + 1) / 2 - rnchildren;
703 if (n > nchildren - path[level].bp_index) {
704 /* move insert point */
705 n--;
706 move = 1;
707 }
708
709 nilfs_btree_node_move_right(btree, node, right, n);
710
711 if (!buffer_dirty(path[level].bp_bh))
712 nilfs_btnode_mark_dirty(path[level].bp_bh);
713 if (!buffer_dirty(path[level].bp_sib_bh))
714 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
715
716 unlock_buffer(path[level].bp_bh);
717 unlock_buffer(path[level].bp_sib_bh);
718
719 path[level + 1].bp_index++;
720 nilfs_btree_promote_key(btree, path, level + 1,
721 nilfs_btree_node_get_key(btree, right, 0));
722 path[level + 1].bp_index--;
723
724 if (move) {
725 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
726 path[level].bp_bh = path[level].bp_sib_bh;
727 path[level].bp_sib_bh = NULL;
728 path[level].bp_index -=
729 nilfs_btree_node_get_nchildren(btree, node);
730 path[level + 1].bp_index++;
731 } else {
732 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
733 path[level].bp_sib_bh = NULL;
734 }
735
736 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
737}
738
739static void nilfs_btree_split(struct nilfs_btree *btree,
740 struct nilfs_btree_path *path,
741 int level, __u64 *keyp, __u64 *ptrp)
742{
743 struct nilfs_btree_node *node, *right;
744 __u64 newkey;
745 __u64 newptr;
746 int nchildren, n, move;
747
748 lock_buffer(path[level].bp_bh);
749 lock_buffer(path[level].bp_sib_bh);
750
751 node = nilfs_btree_get_nonroot_node(btree, path, level);
752 right = nilfs_btree_get_sib_node(btree, path, level);
753 nchildren = nilfs_btree_node_get_nchildren(btree, node);
754 move = 0;
755
756 n = (nchildren + 1) / 2;
757 if (n > nchildren - path[level].bp_index) {
758 n--;
759 move = 1;
760 }
761
762 nilfs_btree_node_move_right(btree, node, right, n);
763
764 if (!buffer_dirty(path[level].bp_bh))
765 nilfs_btnode_mark_dirty(path[level].bp_bh);
766 if (!buffer_dirty(path[level].bp_sib_bh))
767 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
768
769 unlock_buffer(path[level].bp_bh);
770 unlock_buffer(path[level].bp_sib_bh);
771
772 newkey = nilfs_btree_node_get_key(btree, right, 0);
773 newptr = path[level].bp_newreq.bpr_ptr;
774
775 if (move) {
776 path[level].bp_index -=
777 nilfs_btree_node_get_nchildren(btree, node);
778 nilfs_btree_node_insert(btree, right, *keyp, *ptrp,
779 path[level].bp_index);
780
781 *keyp = nilfs_btree_node_get_key(btree, right, 0);
782 *ptrp = path[level].bp_newreq.bpr_ptr;
783
784 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_bh);
785 path[level].bp_bh = path[level].bp_sib_bh;
786 path[level].bp_sib_bh = NULL;
787 } else {
788 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
789
790 *keyp = nilfs_btree_node_get_key(btree, right, 0);
791 *ptrp = path[level].bp_newreq.bpr_ptr;
792
793 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
794 path[level].bp_sib_bh = NULL;
795 }
796
797 path[level + 1].bp_index++;
798}
799
800static void nilfs_btree_grow(struct nilfs_btree *btree,
801 struct nilfs_btree_path *path,
802 int level, __u64 *keyp, __u64 *ptrp)
803{
804 struct nilfs_btree_node *root, *child;
805 int n;
806
807 lock_buffer(path[level].bp_sib_bh);
808
809 root = nilfs_btree_get_root(btree);
810 child = nilfs_btree_get_sib_node(btree, path, level);
811
812 n = nilfs_btree_node_get_nchildren(btree, root);
813
814 nilfs_btree_node_move_right(btree, root, child, n);
815 nilfs_btree_node_set_level(btree, root, level + 1);
816
817 if (!buffer_dirty(path[level].bp_sib_bh))
818 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
819
820 unlock_buffer(path[level].bp_sib_bh);
821
822 path[level].bp_bh = path[level].bp_sib_bh;
823 path[level].bp_sib_bh = NULL;
824
825 nilfs_btree_do_insert(btree, path, level, keyp, ptrp);
826
827 *keyp = nilfs_btree_node_get_key(btree, child, 0);
828 *ptrp = path[level].bp_newreq.bpr_ptr;
829}
830
831static __u64 nilfs_btree_find_near(const struct nilfs_btree *btree,
832 const struct nilfs_btree_path *path)
833{
834 struct nilfs_btree_node *node;
835 int level;
836
837 if (path == NULL)
838 return NILFS_BMAP_INVALID_PTR;
839
840 /* left sibling */
841 level = NILFS_BTREE_LEVEL_NODE_MIN;
842 if (path[level].bp_index > 0) {
843 node = nilfs_btree_get_node(btree, path, level);
844 return nilfs_btree_node_get_ptr(btree, node,
845 path[level].bp_index - 1);
846 }
847
848 /* parent */
849 level = NILFS_BTREE_LEVEL_NODE_MIN + 1;
850 if (level <= nilfs_btree_height(btree) - 1) {
851 node = nilfs_btree_get_node(btree, path, level);
852 return nilfs_btree_node_get_ptr(btree, node,
853 path[level].bp_index);
854 }
855
856 return NILFS_BMAP_INVALID_PTR;
857}
858
859static __u64 nilfs_btree_find_target_v(const struct nilfs_btree *btree,
860 const struct nilfs_btree_path *path,
861 __u64 key)
862{
863 __u64 ptr;
864
865 ptr = nilfs_bmap_find_target_seq(&btree->bt_bmap, key);
866 if (ptr != NILFS_BMAP_INVALID_PTR)
867 /* sequential access */
868 return ptr;
869 else {
870 ptr = nilfs_btree_find_near(btree, path);
871 if (ptr != NILFS_BMAP_INVALID_PTR)
872 /* near */
873 return ptr;
874 }
875 /* block group */
876 return nilfs_bmap_find_target_in_group(&btree->bt_bmap);
877}
878
879static void nilfs_btree_set_target_v(struct nilfs_btree *btree, __u64 key,
880 __u64 ptr)
881{
882 btree->bt_bmap.b_last_allocated_key = key;
883 btree->bt_bmap.b_last_allocated_ptr = ptr;
884}
885
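/*
 * Insertion is done in two phases.  nilfs_btree_prepare_insert() walks
 * up from the leaf level and, for each node that is full, selects a
 * rebalancing operation (carry left, carry right, split, or grow at
 * the root), pre-allocating any new pointers and node blocks; it stops
 * as soon as it reaches a level with room.  nilfs_btree_commit_insert()
 * then replays the recorded operations bottom-up, so a failed prepare
 * leaves the tree untouched.
 */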
886static int nilfs_btree_prepare_insert(struct nilfs_btree *btree,
887 struct nilfs_btree_path *path,
888 int *levelp, __u64 key, __u64 ptr,
889 struct nilfs_bmap_stats *stats)
890{
891 struct buffer_head *bh;
892 struct nilfs_btree_node *node, *parent, *sib;
893 __u64 sibptr;
894 int pindex, level, ret;
895
896 stats->bs_nblocks = 0;
897 level = NILFS_BTREE_LEVEL_DATA;
898
899 /* allocate a new ptr for data block */
900 if (btree->bt_ops->btop_find_target != NULL)
901 path[level].bp_newreq.bpr_ptr =
902 btree->bt_ops->btop_find_target(btree, path, key);
903
904 ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
905 &btree->bt_bmap, &path[level].bp_newreq);
906 if (ret < 0)
907 goto err_out_data;
908
909 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
910 level < nilfs_btree_height(btree) - 1;
911 level++) {
912 node = nilfs_btree_get_nonroot_node(btree, path, level);
913 if (nilfs_btree_node_get_nchildren(btree, node) <
914 nilfs_btree_node_nchildren_max(btree, node)) {
915 path[level].bp_op = nilfs_btree_do_insert;
916 stats->bs_nblocks++;
917 goto out;
918 }
919
920 parent = nilfs_btree_get_node(btree, path, level + 1);
921 pindex = path[level + 1].bp_index;
922
923 /* left sibling */
924 if (pindex > 0) {
925 sibptr = nilfs_btree_node_get_ptr(btree, parent,
926 pindex - 1);
927 ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
928 &bh);
929 if (ret < 0)
930 goto err_out_child_node;
931 sib = (struct nilfs_btree_node *)bh->b_data;
932 if (nilfs_btree_node_get_nchildren(btree, sib) <
933 nilfs_btree_node_nchildren_max(btree, sib)) {
934 path[level].bp_sib_bh = bh;
935 path[level].bp_op = nilfs_btree_carry_left;
936 stats->bs_nblocks++;
937 goto out;
938 } else
939 nilfs_bmap_put_block(&btree->bt_bmap, bh);
940 }
941
942 /* right sibling */
943 if (pindex <
944 nilfs_btree_node_get_nchildren(btree, parent) - 1) {
945 sibptr = nilfs_btree_node_get_ptr(btree, parent,
946 pindex + 1);
947 ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
948 &bh);
949 if (ret < 0)
950 goto err_out_child_node;
951 sib = (struct nilfs_btree_node *)bh->b_data;
952 if (nilfs_btree_node_get_nchildren(btree, sib) <
953 nilfs_btree_node_nchildren_max(btree, sib)) {
954 path[level].bp_sib_bh = bh;
955 path[level].bp_op = nilfs_btree_carry_right;
956 stats->bs_nblocks++;
957 goto out;
958 } else
959 nilfs_bmap_put_block(&btree->bt_bmap, bh);
960 }
961
962 /* split */
963 path[level].bp_newreq.bpr_ptr =
964 path[level - 1].bp_newreq.bpr_ptr + 1;
965 ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
966 &btree->bt_bmap, &path[level].bp_newreq);
967 if (ret < 0)
968 goto err_out_child_node;
969 ret = nilfs_bmap_get_new_block(&btree->bt_bmap,
970 path[level].bp_newreq.bpr_ptr,
971 &bh);
972 if (ret < 0)
973 goto err_out_curr_node;
974
975 stats->bs_nblocks++;
976
977 lock_buffer(bh);
978 nilfs_btree_node_init(btree,
979 (struct nilfs_btree_node *)bh->b_data,
980 0, level, 0, NULL, NULL);
981 unlock_buffer(bh);
982 path[level].bp_sib_bh = bh;
983 path[level].bp_op = nilfs_btree_split;
984 }
985
986 /* root */
987 node = nilfs_btree_get_root(btree);
988 if (nilfs_btree_node_get_nchildren(btree, node) <
989 nilfs_btree_node_nchildren_max(btree, node)) {
990 path[level].bp_op = nilfs_btree_do_insert;
991 stats->bs_nblocks++;
992 goto out;
993 }
994
995 /* grow */
996 path[level].bp_newreq.bpr_ptr = path[level - 1].bp_newreq.bpr_ptr + 1;
997 ret = btree->bt_bmap.b_pops->bpop_prepare_alloc_ptr(
998 &btree->bt_bmap, &path[level].bp_newreq);
999 if (ret < 0)
1000 goto err_out_child_node;
1001 ret = nilfs_bmap_get_new_block(&btree->bt_bmap,
1002 path[level].bp_newreq.bpr_ptr, &bh);
1003 if (ret < 0)
1004 goto err_out_curr_node;
1005
1006 lock_buffer(bh);
1007 nilfs_btree_node_init(btree, (struct nilfs_btree_node *)bh->b_data,
1008 0, level, 0, NULL, NULL);
1009 unlock_buffer(bh);
1010 path[level].bp_sib_bh = bh;
1011 path[level].bp_op = nilfs_btree_grow;
1012
1013 level++;
1014 path[level].bp_op = nilfs_btree_do_insert;
1015
1016 /* a newly-created node block and a data block are added */
1017 stats->bs_nblocks += 2;
1018
1019 /* success */
1020 out:
1021 *levelp = level;
1022 return ret;
1023
1024 /* error */
1025 err_out_curr_node:
1026 btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap,
1027 &path[level].bp_newreq);
1028 err_out_child_node:
1029 for (level--; level > NILFS_BTREE_LEVEL_DATA; level--) {
1030 nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh);
1031 btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(
1032 &btree->bt_bmap, &path[level].bp_newreq);
1033
1034 }
1035
1036 btree->bt_bmap.b_pops->bpop_abort_alloc_ptr(&btree->bt_bmap,
1037 &path[level].bp_newreq);
1038 err_out_data:
1039 *levelp = level;
1040 stats->bs_nblocks = 0;
1041 return ret;
1042}
1043
1044static void nilfs_btree_commit_insert(struct nilfs_btree *btree,
1045 struct nilfs_btree_path *path,
1046 int maxlevel, __u64 key, __u64 ptr)
1047{
1048 int level;
1049
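	/*
	 * At this point ptr still holds the data buffer head passed in
	 * by the caller (see the matching comment in
	 * nilfs_btree_commit_convert_and_insert() below).
	 */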
1050 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
1051 ptr = path[NILFS_BTREE_LEVEL_DATA].bp_newreq.bpr_ptr;
1052 if (btree->bt_ops->btop_set_target != NULL)
1053 btree->bt_ops->btop_set_target(btree, key, ptr);
1054
1055 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
1056 if (btree->bt_bmap.b_pops->bpop_commit_alloc_ptr != NULL) {
1057 btree->bt_bmap.b_pops->bpop_commit_alloc_ptr(
1058 &btree->bt_bmap, &path[level - 1].bp_newreq);
1059 }
1060 path[level].bp_op(btree, path, level, &key, &ptr);
1061 }
1062
1063 if (!nilfs_bmap_dirty(&btree->bt_bmap))
1064 nilfs_bmap_set_dirty(&btree->bt_bmap);
1065}
1066
1067static int nilfs_btree_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
1068{
1069 struct nilfs_btree *btree;
1070 struct nilfs_btree_path *path;
1071 struct nilfs_bmap_stats stats;
1072 int level, ret;
1073
1074 btree = (struct nilfs_btree *)bmap;
1075 path = nilfs_btree_alloc_path(btree);
1076 if (path == NULL)
1077 return -ENOMEM;
1078 nilfs_btree_init_path(btree, path);
1079
1080 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1081 NILFS_BTREE_LEVEL_NODE_MIN);
1082 if (ret != -ENOENT) {
1083 if (ret == 0)
1084 ret = -EEXIST;
1085 goto out;
1086 }
1087
1088 ret = nilfs_btree_prepare_insert(btree, path, &level, key, ptr, &stats);
1089 if (ret < 0)
1090 goto out;
1091 nilfs_btree_commit_insert(btree, path, level, key, ptr);
1092 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
1093
1094 out:
1095 nilfs_btree_clear_path(btree, path);
1096 nilfs_btree_free_path(btree, path);
1097 return ret;
1098}
1099
1100static void nilfs_btree_do_delete(struct nilfs_btree *btree,
1101 struct nilfs_btree_path *path,
1102 int level, __u64 *keyp, __u64 *ptrp)
1103{
1104 struct nilfs_btree_node *node;
1105
1106 if (level < nilfs_btree_height(btree) - 1) {
1107 lock_buffer(path[level].bp_bh);
1108 node = nilfs_btree_get_nonroot_node(btree, path, level);
1109 nilfs_btree_node_delete(btree, node, keyp, ptrp,
1110 path[level].bp_index);
1111 if (!buffer_dirty(path[level].bp_bh))
1112 nilfs_btnode_mark_dirty(path[level].bp_bh);
1113 unlock_buffer(path[level].bp_bh);
1114 if (path[level].bp_index == 0)
1115 nilfs_btree_promote_key(btree, path, level + 1,
1116 nilfs_btree_node_get_key(btree, node, 0));
1117 } else {
1118 node = nilfs_btree_get_root(btree);
1119 nilfs_btree_node_delete(btree, node, keyp, ptrp,
1120 path[level].bp_index);
1121 }
1122}
1123
1124static void nilfs_btree_borrow_left(struct nilfs_btree *btree,
1125 struct nilfs_btree_path *path,
1126 int level, __u64 *keyp, __u64 *ptrp)
1127{
1128 struct nilfs_btree_node *node, *left;
1129 int nchildren, lnchildren, n;
1130
1131 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1132
1133 lock_buffer(path[level].bp_bh);
1134 lock_buffer(path[level].bp_sib_bh);
1135
1136 node = nilfs_btree_get_nonroot_node(btree, path, level);
1137 left = nilfs_btree_get_sib_node(btree, path, level);
1138 nchildren = nilfs_btree_node_get_nchildren(btree, node);
1139 lnchildren = nilfs_btree_node_get_nchildren(btree, left);
1140
1141 n = (nchildren + lnchildren) / 2 - nchildren;
1142
1143 nilfs_btree_node_move_right(btree, left, node, n);
1144
1145 if (!buffer_dirty(path[level].bp_bh))
1146 nilfs_btnode_mark_dirty(path[level].bp_bh);
1147 if (!buffer_dirty(path[level].bp_sib_bh))
1148 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
1149
1150 unlock_buffer(path[level].bp_bh);
1151 unlock_buffer(path[level].bp_sib_bh);
1152
1153 nilfs_btree_promote_key(btree, path, level + 1,
1154 nilfs_btree_node_get_key(btree, node, 0));
1155
1156 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
1157 path[level].bp_sib_bh = NULL;
1158 path[level].bp_index += n;
1159}
1160
1161static void nilfs_btree_borrow_right(struct nilfs_btree *btree,
1162 struct nilfs_btree_path *path,
1163 int level, __u64 *keyp, __u64 *ptrp)
1164{
1165 struct nilfs_btree_node *node, *right;
1166 int nchildren, rnchildren, n;
1167
1168 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1169
1170 lock_buffer(path[level].bp_bh);
1171 lock_buffer(path[level].bp_sib_bh);
1172
1173 node = nilfs_btree_get_nonroot_node(btree, path, level);
1174 right = nilfs_btree_get_sib_node(btree, path, level);
1175 nchildren = nilfs_btree_node_get_nchildren(btree, node);
1176 rnchildren = nilfs_btree_node_get_nchildren(btree, right);
1177
1178 n = (nchildren + rnchildren) / 2 - nchildren;
1179
1180 nilfs_btree_node_move_left(btree, node, right, n);
1181
1182 if (!buffer_dirty(path[level].bp_bh))
1183 nilfs_btnode_mark_dirty(path[level].bp_bh);
1184 if (!buffer_dirty(path[level].bp_sib_bh))
1185 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
1186
1187 unlock_buffer(path[level].bp_bh);
1188 unlock_buffer(path[level].bp_sib_bh);
1189
1190 path[level + 1].bp_index++;
1191 nilfs_btree_promote_key(btree, path, level + 1,
1192 nilfs_btree_node_get_key(btree, right, 0));
1193 path[level + 1].bp_index--;
1194
1195 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
1196 path[level].bp_sib_bh = NULL;
1197}
1198
1199static void nilfs_btree_concat_left(struct nilfs_btree *btree,
1200 struct nilfs_btree_path *path,
1201 int level, __u64 *keyp, __u64 *ptrp)
1202{
1203 struct nilfs_btree_node *node, *left;
1204 int n;
1205
1206 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1207
1208 lock_buffer(path[level].bp_bh);
1209 lock_buffer(path[level].bp_sib_bh);
1210
1211 node = nilfs_btree_get_nonroot_node(btree, path, level);
1212 left = nilfs_btree_get_sib_node(btree, path, level);
1213
1214 n = nilfs_btree_node_get_nchildren(btree, node);
1215
1216 nilfs_btree_node_move_left(btree, left, node, n);
1217
1218 if (!buffer_dirty(path[level].bp_sib_bh))
1219 nilfs_btnode_mark_dirty(path[level].bp_sib_bh);
1220
1221 unlock_buffer(path[level].bp_bh);
1222 unlock_buffer(path[level].bp_sib_bh);
1223
1224 nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh);
1225 path[level].bp_bh = path[level].bp_sib_bh;
1226 path[level].bp_sib_bh = NULL;
1227 path[level].bp_index += nilfs_btree_node_get_nchildren(btree, left);
1228}
1229
1230static void nilfs_btree_concat_right(struct nilfs_btree *btree,
1231 struct nilfs_btree_path *path,
1232 int level, __u64 *keyp, __u64 *ptrp)
1233{
1234 struct nilfs_btree_node *node, *right;
1235 int n;
1236
1237 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1238
1239 lock_buffer(path[level].bp_bh);
1240 lock_buffer(path[level].bp_sib_bh);
1241
1242 node = nilfs_btree_get_nonroot_node(btree, path, level);
1243 right = nilfs_btree_get_sib_node(btree, path, level);
1244
1245 n = nilfs_btree_node_get_nchildren(btree, right);
1246
1247 nilfs_btree_node_move_left(btree, node, right, n);
1248
1249 if (!buffer_dirty(path[level].bp_bh))
1250 nilfs_btnode_mark_dirty(path[level].bp_bh);
1251
1252 unlock_buffer(path[level].bp_bh);
1253 unlock_buffer(path[level].bp_sib_bh);
1254
1255 nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_sib_bh);
1256 path[level].bp_sib_bh = NULL;
1257 path[level + 1].bp_index++;
1258}
1259
1260static void nilfs_btree_shrink(struct nilfs_btree *btree,
1261 struct nilfs_btree_path *path,
1262 int level, __u64 *keyp, __u64 *ptrp)
1263{
1264 struct nilfs_btree_node *root, *child;
1265 int n;
1266
1267 nilfs_btree_do_delete(btree, path, level, keyp, ptrp);
1268
1269 lock_buffer(path[level].bp_bh);
1270 root = nilfs_btree_get_root(btree);
1271 child = nilfs_btree_get_nonroot_node(btree, path, level);
1272
1273 nilfs_btree_node_delete(btree, root, NULL, NULL, 0);
1274 nilfs_btree_node_set_level(btree, root, level);
1275 n = nilfs_btree_node_get_nchildren(btree, child);
1276 nilfs_btree_node_move_left(btree, root, child, n);
1277 unlock_buffer(path[level].bp_bh);
1278
1279 nilfs_bmap_delete_block(&btree->bt_bmap, path[level].bp_bh);
1280 path[level].bp_bh = NULL;
1281}
1282
1283
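/*
 * Deletion mirrors insertion: nilfs_btree_prepare_delete() records a
 * per-level operation (plain delete, borrow from a sibling, concatenate
 * with a sibling, or shrink the tree when the root is left with a lone
 * child), and nilfs_btree_commit_delete() replays them bottom-up.
 */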
1284static int nilfs_btree_prepare_delete(struct nilfs_btree *btree,
1285 struct nilfs_btree_path *path,
1286 int *levelp,
1287 struct nilfs_bmap_stats *stats)
1288{
1289 struct buffer_head *bh;
1290 struct nilfs_btree_node *node, *parent, *sib;
1291 __u64 sibptr;
1292 int pindex, level, ret;
1293
1294 ret = 0;
1295 stats->bs_nblocks = 0;
1296 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
1297 level < nilfs_btree_height(btree) - 1;
1298 level++) {
1299 node = nilfs_btree_get_nonroot_node(btree, path, level);
1300 path[level].bp_oldreq.bpr_ptr =
1301 nilfs_btree_node_get_ptr(btree, node,
1302 path[level].bp_index);
1303 if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
1304 ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr(
1305 &btree->bt_bmap, &path[level].bp_oldreq);
1306 if (ret < 0)
1307 goto err_out_child_node;
1308 }
1309
1310 if (nilfs_btree_node_get_nchildren(btree, node) >
1311 nilfs_btree_node_nchildren_min(btree, node)) {
1312 path[level].bp_op = nilfs_btree_do_delete;
1313 stats->bs_nblocks++;
1314 goto out;
1315 }
1316
1317 parent = nilfs_btree_get_node(btree, path, level + 1);
1318 pindex = path[level + 1].bp_index;
1319
1320 if (pindex > 0) {
1321 /* left sibling */
1322 sibptr = nilfs_btree_node_get_ptr(btree, parent,
1323 pindex - 1);
1324 ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
1325 &bh);
1326 if (ret < 0)
1327 goto err_out_curr_node;
1328 sib = (struct nilfs_btree_node *)bh->b_data;
1329 if (nilfs_btree_node_get_nchildren(btree, sib) >
1330 nilfs_btree_node_nchildren_min(btree, sib)) {
1331 path[level].bp_sib_bh = bh;
1332 path[level].bp_op = nilfs_btree_borrow_left;
1333 stats->bs_nblocks++;
1334 goto out;
1335 } else {
1336 path[level].bp_sib_bh = bh;
1337 path[level].bp_op = nilfs_btree_concat_left;
1338 stats->bs_nblocks++;
1339 /* continue; */
1340 }
1341 } else if (pindex <
1342 nilfs_btree_node_get_nchildren(btree, parent) - 1) {
1343 /* right sibling */
1344 sibptr = nilfs_btree_node_get_ptr(btree, parent,
1345 pindex + 1);
1346 ret = nilfs_bmap_get_block(&btree->bt_bmap, sibptr,
1347 &bh);
1348 if (ret < 0)
1349 goto err_out_curr_node;
1350 sib = (struct nilfs_btree_node *)bh->b_data;
1351 if (nilfs_btree_node_get_nchildren(btree, sib) >
1352 nilfs_btree_node_nchildren_min(btree, sib)) {
1353 path[level].bp_sib_bh = bh;
1354 path[level].bp_op = nilfs_btree_borrow_right;
1355 stats->bs_nblocks++;
1356 goto out;
1357 } else {
1358 path[level].bp_sib_bh = bh;
1359 path[level].bp_op = nilfs_btree_concat_right;
1360 stats->bs_nblocks++;
1361 /* continue; */
1362 }
1363 } else {
1364 /* no siblings */
1365 /* the only child of the root node */
1366 WARN_ON(level != nilfs_btree_height(btree) - 2);
1367 if (nilfs_btree_node_get_nchildren(btree, node) - 1 <=
1368 NILFS_BTREE_ROOT_NCHILDREN_MAX) {
1369 path[level].bp_op = nilfs_btree_shrink;
1370 stats->bs_nblocks += 2;
1371 } else {
1372 path[level].bp_op = nilfs_btree_do_delete;
1373 stats->bs_nblocks++;
1374 }
1375
1376 goto out;
1377
1378 }
1379 }
1380
1381 node = nilfs_btree_get_root(btree);
1382 path[level].bp_oldreq.bpr_ptr =
1383 nilfs_btree_node_get_ptr(btree, node, path[level].bp_index);
1384 if (btree->bt_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
1385 ret = btree->bt_bmap.b_pops->bpop_prepare_end_ptr(
1386 &btree->bt_bmap, &path[level].bp_oldreq);
1387 if (ret < 0)
1388 goto err_out_child_node;
1389 }
1390 /* child of the root node is deleted */
1391 path[level].bp_op = nilfs_btree_do_delete;
1392 stats->bs_nblocks++;
1393
1394 /* success */
1395 out:
1396 *levelp = level;
1397 return ret;
1398
1399 /* error */
1400 err_out_curr_node:
1401 if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL)
1402 btree->bt_bmap.b_pops->bpop_abort_end_ptr(
1403 &btree->bt_bmap, &path[level].bp_oldreq);
1404 err_out_child_node:
1405 for (level--; level >= NILFS_BTREE_LEVEL_NODE_MIN; level--) {
1406 nilfs_bmap_put_block(&btree->bt_bmap, path[level].bp_sib_bh);
1407 if (btree->bt_bmap.b_pops->bpop_abort_end_ptr != NULL)
1408 btree->bt_bmap.b_pops->bpop_abort_end_ptr(
1409 &btree->bt_bmap, &path[level].bp_oldreq);
1410 }
1411 *levelp = level;
1412 stats->bs_nblocks = 0;
1413 return ret;
1414}
1415
1416static void nilfs_btree_commit_delete(struct nilfs_btree *btree,
1417 struct nilfs_btree_path *path,
1418 int maxlevel)
1419{
1420 int level;
1421
1422 for (level = NILFS_BTREE_LEVEL_NODE_MIN; level <= maxlevel; level++) {
1423 if (btree->bt_bmap.b_pops->bpop_commit_end_ptr != NULL)
1424 btree->bt_bmap.b_pops->bpop_commit_end_ptr(
1425 &btree->bt_bmap, &path[level].bp_oldreq);
1426 path[level].bp_op(btree, path, level, NULL, NULL);
1427 }
1428
1429 if (!nilfs_bmap_dirty(&btree->bt_bmap))
1430 nilfs_bmap_set_dirty(&btree->bt_bmap);
1431}
1432
1433static int nilfs_btree_delete(struct nilfs_bmap *bmap, __u64 key)
1434
1435{
1436 struct nilfs_btree *btree;
1437 struct nilfs_btree_path *path;
1438 struct nilfs_bmap_stats stats;
1439 int level, ret;
1440
1441 btree = (struct nilfs_btree *)bmap;
1442 path = nilfs_btree_alloc_path(btree);
1443 if (path == NULL)
1444 return -ENOMEM;
1445 nilfs_btree_init_path(btree, path);
1446 ret = nilfs_btree_do_lookup(btree, path, key, NULL,
1447 NILFS_BTREE_LEVEL_NODE_MIN);
1448 if (ret < 0)
1449 goto out;
1450
1451 ret = nilfs_btree_prepare_delete(btree, path, &level, &stats);
1452 if (ret < 0)
1453 goto out;
1454 nilfs_btree_commit_delete(btree, path, level);
1455 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
1456
1457out:
1458 nilfs_btree_clear_path(btree, path);
1459 nilfs_btree_free_path(btree, path);
1460 return ret;
1461}
1462
1463static int nilfs_btree_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
1464{
1465 struct nilfs_btree *btree;
1466 struct nilfs_btree_path *path;
1467 int ret;
1468
1469 btree = (struct nilfs_btree *)bmap;
1470 path = nilfs_btree_alloc_path(btree);
1471 if (path == NULL)
1472 return -ENOMEM;
1473 nilfs_btree_init_path(btree, path);
1474
1475 ret = nilfs_btree_do_lookup_last(btree, path, keyp, NULL);
1476
1477 nilfs_btree_clear_path(btree, path);
1478 nilfs_btree_free_path(btree, path);
1479
1480 return ret;
1481}
1482
1483static int nilfs_btree_check_delete(struct nilfs_bmap *bmap, __u64 key)
1484{
1485 struct buffer_head *bh;
1486 struct nilfs_btree *btree;
1487 struct nilfs_btree_node *root, *node;
1488 __u64 maxkey, nextmaxkey;
1489 __u64 ptr;
1490 int nchildren, ret;
1491
1492 btree = (struct nilfs_btree *)bmap;
1493 root = nilfs_btree_get_root(btree);
1494 switch (nilfs_btree_height(btree)) {
1495 case 2:
1496 bh = NULL;
1497 node = root;
1498 break;
1499 case 3:
1500 nchildren = nilfs_btree_node_get_nchildren(btree, root);
1501 if (nchildren > 1)
1502 return 0;
1503 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
1504 ret = nilfs_bmap_get_block(bmap, ptr, &bh);
1505 if (ret < 0)
1506 return ret;
1507 node = (struct nilfs_btree_node *)bh->b_data;
1508 break;
1509 default:
1510 return 0;
1511 }
1512
1513 nchildren = nilfs_btree_node_get_nchildren(btree, node);
1514 maxkey = nilfs_btree_node_get_key(btree, node, nchildren - 1);
1515 nextmaxkey = (nchildren > 1) ?
1516 nilfs_btree_node_get_key(btree, node, nchildren - 2) : 0;
1517 if (bh != NULL)
1518 nilfs_bmap_put_block(bmap, bh);
1519
1520 return (maxkey == key) && (nextmaxkey < bmap->b_low);
1521}
1522
1523static int nilfs_btree_gather_data(struct nilfs_bmap *bmap,
1524 __u64 *keys, __u64 *ptrs, int nitems)
1525{
1526 struct buffer_head *bh;
1527 struct nilfs_btree *btree;
1528 struct nilfs_btree_node *node, *root;
1529 __le64 *dkeys;
1530 __le64 *dptrs;
1531 __u64 ptr;
1532 int nchildren, i, ret;
1533
1534 btree = (struct nilfs_btree *)bmap;
1535 root = nilfs_btree_get_root(btree);
1536 switch (nilfs_btree_height(btree)) {
1537 case 2:
1538 bh = NULL;
1539 node = root;
1540 break;
1541 case 3:
1542 nchildren = nilfs_btree_node_get_nchildren(btree, root);
1543 WARN_ON(nchildren > 1);
1544 ptr = nilfs_btree_node_get_ptr(btree, root, nchildren - 1);
1545 ret = nilfs_bmap_get_block(bmap, ptr, &bh);
1546 if (ret < 0)
1547 return ret;
1548 node = (struct nilfs_btree_node *)bh->b_data;
1549 break;
1550 default:
1551 node = NULL;
1552 return -EINVAL;
1553 }
1554
1555 nchildren = nilfs_btree_node_get_nchildren(btree, node);
1556 if (nchildren < nitems)
1557 nitems = nchildren;
1558 dkeys = nilfs_btree_node_dkeys(btree, node);
1559 dptrs = nilfs_btree_node_dptrs(btree, node);
1560 for (i = 0; i < nitems; i++) {
1561 keys[i] = nilfs_bmap_dkey_to_key(dkeys[i]);
1562 ptrs[i] = nilfs_bmap_dptr_to_ptr(dptrs[i]);
1563 }
1564
1565 if (bh != NULL)
1566 nilfs_bmap_put_block(bmap, bh);
1567
1568 return nitems;
1569}
1570
1571static int
1572nilfs_btree_prepare_convert_and_insert(struct nilfs_bmap *bmap, __u64 key,
1573 union nilfs_bmap_ptr_req *dreq,
1574 union nilfs_bmap_ptr_req *nreq,
1575 struct buffer_head **bhp,
1576 struct nilfs_bmap_stats *stats)
1577{
1578 struct buffer_head *bh;
1579 struct nilfs_btree *btree;
1580 int ret;
1581
1582 btree = (struct nilfs_btree *)bmap;
1583 stats->bs_nblocks = 0;
1584
1585	/* for the data block */
1586	/* no near ptr can be looked up since the bmap is still being converted */
1587 if (btree->bt_ops->btop_find_target != NULL)
1588 dreq->bpr_ptr
1589 = btree->bt_ops->btop_find_target(btree, NULL, key);
1590 ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, dreq);
1591 if (ret < 0)
1592 return ret;
1593
1594 *bhp = NULL;
1595 stats->bs_nblocks++;
1596 if (nreq != NULL) {
1597 nreq->bpr_ptr = dreq->bpr_ptr + 1;
1598 ret = bmap->b_pops->bpop_prepare_alloc_ptr(bmap, nreq);
1599 if (ret < 0)
1600 goto err_out_dreq;
1601
1602 ret = nilfs_bmap_get_new_block(bmap, nreq->bpr_ptr, &bh);
1603 if (ret < 0)
1604 goto err_out_nreq;
1605
1606 *bhp = bh;
1607 stats->bs_nblocks++;
1608 }
1609
1610 /* success */
1611 return 0;
1612
1613 /* error */
1614 err_out_nreq:
1615 bmap->b_pops->bpop_abort_alloc_ptr(bmap, nreq);
1616 err_out_dreq:
1617 bmap->b_pops->bpop_abort_alloc_ptr(bmap, dreq);
1618 stats->bs_nblocks = 0;
1619	return ret;
1620}
1622
1623static void
1624nilfs_btree_commit_convert_and_insert(struct nilfs_bmap *bmap,
1625 __u64 key, __u64 ptr,
1626 const __u64 *keys, const __u64 *ptrs,
1627 int n, __u64 low, __u64 high,
1628 union nilfs_bmap_ptr_req *dreq,
1629 union nilfs_bmap_ptr_req *nreq,
1630 struct buffer_head *bh)
1631{
1632 struct nilfs_btree *btree;
1633 struct nilfs_btree_node *node;
1634 __u64 tmpptr;
1635
1636 /* free resources */
1637 if (bmap->b_ops->bop_clear != NULL)
1638 bmap->b_ops->bop_clear(bmap);
1639
1640 /* ptr must be a pointer to a buffer head. */
1641 set_buffer_nilfs_volatile((struct buffer_head *)((unsigned long)ptr));
1642
1643 /* convert and insert */
1644 btree = (struct nilfs_btree *)bmap;
1645 nilfs_btree_init(bmap, low, high);
1646 if (nreq != NULL) {
1647 if (bmap->b_pops->bpop_commit_alloc_ptr != NULL) {
1648 bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq);
1649 bmap->b_pops->bpop_commit_alloc_ptr(bmap, nreq);
1650 }
1651
1652 /* create child node at level 1 */
1653 lock_buffer(bh);
1654 node = (struct nilfs_btree_node *)bh->b_data;
1655 nilfs_btree_node_init(btree, node, 0, 1, n, keys, ptrs);
1656 nilfs_btree_node_insert(btree, node,
1657 key, dreq->bpr_ptr, n);
1658 if (!buffer_dirty(bh))
1659 nilfs_btnode_mark_dirty(bh);
1660 if (!nilfs_bmap_dirty(bmap))
1661 nilfs_bmap_set_dirty(bmap);
1662
1663 unlock_buffer(bh);
1664 nilfs_bmap_put_block(bmap, bh);
1665
1666 /* create root node at level 2 */
1667 node = nilfs_btree_get_root(btree);
1668 tmpptr = nreq->bpr_ptr;
1669 nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
1670 2, 1, &keys[0], &tmpptr);
1671 } else {
1672 if (bmap->b_pops->bpop_commit_alloc_ptr != NULL)
1673 bmap->b_pops->bpop_commit_alloc_ptr(bmap, dreq);
1674
1675 /* create root node at level 1 */
1676 node = nilfs_btree_get_root(btree);
1677 nilfs_btree_node_init(btree, node, NILFS_BTREE_NODE_ROOT,
1678 1, n, keys, ptrs);
1679 nilfs_btree_node_insert(btree, node,
1680 key, dreq->bpr_ptr, n);
1681 if (!nilfs_bmap_dirty(bmap))
1682 nilfs_bmap_set_dirty(bmap);
1683 }
1684
1685 if (btree->bt_ops->btop_set_target != NULL)
1686 btree->bt_ops->btop_set_target(btree, key, dreq->bpr_ptr);
1687}
1688
1689/**
1690 * nilfs_btree_convert_and_insert - convert a direct bmap to a B-tree and insert
1691 * @bmap: bmap to be converted
1692 * @key: key of the new entry to insert
1693 * @ptr: block pointer of the new entry
1694 * @keys: keys of the entries gathered from the direct mapping
1695 * @ptrs: block pointers of the entries gathered from the direct mapping
1696 * @n: number of entries in @keys and @ptrs
1697 * @low: lower key bound recorded in bmap->b_low
1698 * @high: upper key bound recorded in bmap->b_high
1699 */
1700int nilfs_btree_convert_and_insert(struct nilfs_bmap *bmap,
1701 __u64 key, __u64 ptr,
1702 const __u64 *keys, const __u64 *ptrs,
1703 int n, __u64 low, __u64 high)
1704{
1705 struct buffer_head *bh;
1706 union nilfs_bmap_ptr_req dreq, nreq, *di, *ni;
1707 struct nilfs_bmap_stats stats;
1708 int ret;
1709
1710 if (n + 1 <= NILFS_BTREE_ROOT_NCHILDREN_MAX) {
1711 di = &dreq;
1712 ni = NULL;
1713 } else if ((n + 1) <= NILFS_BTREE_NODE_NCHILDREN_MAX(
1714 1 << bmap->b_inode->i_blkbits)) {
1715 di = &dreq;
1716 ni = &nreq;
1717 } else {
1718 di = NULL;
1719 ni = NULL;
1720 BUG();
1721 }
1722
1723 ret = nilfs_btree_prepare_convert_and_insert(bmap, key, di, ni, &bh,
1724 &stats);
1725 if (ret < 0)
1726 return ret;
1727 nilfs_btree_commit_convert_and_insert(bmap, key, ptr, keys, ptrs, n,
1728 low, high, di, ni, bh);
1729 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
1730 return 0;
1731}
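/* Worked example (illustrative): the three-way branch above decides
 * how many blocks the conversion needs. If the n existing entries
 * plus the new one fit in the in-inode root, only the data block
 * pointer (dreq) is prepared; if they fit in one level-1 node, an
 * extra node block (nreq) is prepared as well; anything larger cannot
 * originate from a direct bmap, hence the BUG(). With 4096-byte
 * blocks, NILFS_BTREE_NODE_NCHILDREN_MAX(4096) = (4096 - 8 - 8) / 16
 * = 255, so the second case always has room.
 */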
1732
1733static int nilfs_btree_propagate_p(struct nilfs_btree *btree,
1734 struct nilfs_btree_path *path,
1735 int level,
1736 struct buffer_head *bh)
1737{
1738 while ((++level < nilfs_btree_height(btree) - 1) &&
1739 !buffer_dirty(path[level].bp_bh))
1740 nilfs_btnode_mark_dirty(path[level].bp_bh);
1741
1742 return 0;
1743}
1744
1745static int nilfs_btree_prepare_update_v(struct nilfs_btree *btree,
1746 struct nilfs_btree_path *path,
1747 int level)
1748{
1749 struct nilfs_btree_node *parent;
1750 int ret;
1751
1752 parent = nilfs_btree_get_node(btree, path, level + 1);
1753 path[level].bp_oldreq.bpr_ptr =
1754 nilfs_btree_node_get_ptr(btree, parent,
1755 path[level + 1].bp_index);
1756 path[level].bp_newreq.bpr_ptr = path[level].bp_oldreq.bpr_ptr + 1;
1757 ret = nilfs_bmap_prepare_update(&btree->bt_bmap,
1758 &path[level].bp_oldreq,
1759 &path[level].bp_newreq);
1760 if (ret < 0)
1761 return ret;
1762
1763 if (buffer_nilfs_node(path[level].bp_bh)) {
1764 path[level].bp_ctxt.oldkey = path[level].bp_oldreq.bpr_ptr;
1765 path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr;
1766 path[level].bp_ctxt.bh = path[level].bp_bh;
1767 ret = nilfs_btnode_prepare_change_key(
1768 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
1769 &path[level].bp_ctxt);
1770 if (ret < 0) {
1771 nilfs_bmap_abort_update(&btree->bt_bmap,
1772 &path[level].bp_oldreq,
1773 &path[level].bp_newreq);
1774 return ret;
1775 }
1776 }
1777
1778 return 0;
1779}
1780
1781static void nilfs_btree_commit_update_v(struct nilfs_btree *btree,
1782 struct nilfs_btree_path *path,
1783 int level)
1784{
1785 struct nilfs_btree_node *parent;
1786
1787 nilfs_bmap_commit_update(&btree->bt_bmap,
1788 &path[level].bp_oldreq,
1789 &path[level].bp_newreq);
1790
1791 if (buffer_nilfs_node(path[level].bp_bh)) {
1792 nilfs_btnode_commit_change_key(
1793 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
1794 &path[level].bp_ctxt);
1795 path[level].bp_bh = path[level].bp_ctxt.bh;
1796 }
1797 set_buffer_nilfs_volatile(path[level].bp_bh);
1798
1799 parent = nilfs_btree_get_node(btree, path, level + 1);
1800 nilfs_btree_node_set_ptr(btree, parent, path[level + 1].bp_index,
1801 path[level].bp_newreq.bpr_ptr);
1802}
1803
1804static void nilfs_btree_abort_update_v(struct nilfs_btree *btree,
1805 struct nilfs_btree_path *path,
1806 int level)
1807{
1808 nilfs_bmap_abort_update(&btree->bt_bmap,
1809 &path[level].bp_oldreq,
1810 &path[level].bp_newreq);
1811 if (buffer_nilfs_node(path[level].bp_bh))
1812 nilfs_btnode_abort_change_key(
1813 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
1814 &path[level].bp_ctxt);
1815}
1816
1817static int nilfs_btree_prepare_propagate_v(struct nilfs_btree *btree,
1818 struct nilfs_btree_path *path,
1819 int minlevel,
1820 int *maxlevelp)
1821{
1822 int level, ret;
1823
1824 level = minlevel;
1825 if (!buffer_nilfs_volatile(path[level].bp_bh)) {
1826 ret = nilfs_btree_prepare_update_v(btree, path, level);
1827 if (ret < 0)
1828 return ret;
1829 }
1830 while ((++level < nilfs_btree_height(btree) - 1) &&
1831 !buffer_dirty(path[level].bp_bh)) {
1832
1833 WARN_ON(buffer_nilfs_volatile(path[level].bp_bh));
1834 ret = nilfs_btree_prepare_update_v(btree, path, level);
1835 if (ret < 0)
1836 goto out;
1837 }
1838
1839 /* success */
1840 *maxlevelp = level - 1;
1841 return 0;
1842
1843 /* error */
1844 out:
1845 while (--level > minlevel)
1846 nilfs_btree_abort_update_v(btree, path, level);
1847 if (!buffer_nilfs_volatile(path[level].bp_bh))
1848 nilfs_btree_abort_update_v(btree, path, level);
1849 return ret;
1850}
1851
1852static void nilfs_btree_commit_propagate_v(struct nilfs_btree *btree,
1853 struct nilfs_btree_path *path,
1854 int minlevel,
1855 int maxlevel,
1856 struct buffer_head *bh)
1857{
1858 int level;
1859
1860 if (!buffer_nilfs_volatile(path[minlevel].bp_bh))
1861 nilfs_btree_commit_update_v(btree, path, minlevel);
1862
1863 for (level = minlevel + 1; level <= maxlevel; level++)
1864 nilfs_btree_commit_update_v(btree, path, level);
1865}
1866
1867static int nilfs_btree_propagate_v(struct nilfs_btree *btree,
1868 struct nilfs_btree_path *path,
1869 int level,
1870 struct buffer_head *bh)
1871{
1872 int maxlevel, ret;
1873 struct nilfs_btree_node *parent;
1874 __u64 ptr;
1875
1876 get_bh(bh);
1877 path[level].bp_bh = bh;
1878 ret = nilfs_btree_prepare_propagate_v(btree, path, level, &maxlevel);
1879 if (ret < 0)
1880 goto out;
1881
1882 if (buffer_nilfs_volatile(path[level].bp_bh)) {
1883 parent = nilfs_btree_get_node(btree, path, level + 1);
1884 ptr = nilfs_btree_node_get_ptr(btree, parent,
1885 path[level + 1].bp_index);
1886 ret = nilfs_bmap_mark_dirty(&btree->bt_bmap, ptr);
1887 if (ret < 0)
1888 goto out;
1889 }
1890
1891 nilfs_btree_commit_propagate_v(btree, path, level, maxlevel, bh);
1892
1893 out:
1894 brelse(path[level].bp_bh);
1895 path[level].bp_bh = NULL;
1896 return ret;
1897}
1898
1899static int nilfs_btree_propagate(const struct nilfs_bmap *bmap,
1900 struct buffer_head *bh)
1901{
1902 struct nilfs_btree *btree;
1903 struct nilfs_btree_path *path;
1904 struct nilfs_btree_node *node;
1905 __u64 key;
1906 int level, ret;
1907
1908 WARN_ON(!buffer_dirty(bh));
1909
1910 btree = (struct nilfs_btree *)bmap;
1911 path = nilfs_btree_alloc_path(btree);
1912 if (path == NULL)
1913 return -ENOMEM;
1914 nilfs_btree_init_path(btree, path);
1915
1916 if (buffer_nilfs_node(bh)) {
1917 node = (struct nilfs_btree_node *)bh->b_data;
1918 key = nilfs_btree_node_get_key(btree, node, 0);
1919 level = nilfs_btree_node_get_level(btree, node);
1920 } else {
1921 key = nilfs_bmap_data_get_key(bmap, bh);
1922 level = NILFS_BTREE_LEVEL_DATA;
1923 }
1924
1925 ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1);
1926 if (ret < 0) {
1927 if (unlikely(ret == -ENOENT))
1928			printk(KERN_CRIT "%s: key = %llu, level = %d\n",
1929 __func__, (unsigned long long)key, level);
1930 goto out;
1931 }
1932
1933 ret = btree->bt_ops->btop_propagate(btree, path, level, bh);
1934
1935 out:
1936 nilfs_btree_clear_path(btree, path);
1937 nilfs_btree_free_path(btree, path);
1938
1939 return ret;
1940}
1941
1942static int nilfs_btree_propagate_gc(const struct nilfs_bmap *bmap,
1943 struct buffer_head *bh)
1944{
1945 return nilfs_bmap_mark_dirty(bmap, bh->b_blocknr);
1946}
1947
1948static void nilfs_btree_add_dirty_buffer(struct nilfs_btree *btree,
1949 struct list_head *lists,
1950 struct buffer_head *bh)
1951{
1952 struct list_head *head;
1953 struct buffer_head *cbh;
1954 struct nilfs_btree_node *node, *cnode;
1955 __u64 key, ckey;
1956 int level;
1957
1958 get_bh(bh);
1959 node = (struct nilfs_btree_node *)bh->b_data;
1960 key = nilfs_btree_node_get_key(btree, node, 0);
1961 level = nilfs_btree_node_get_level(btree, node);
1962 list_for_each(head, &lists[level]) {
1963 cbh = list_entry(head, struct buffer_head, b_assoc_buffers);
1964 cnode = (struct nilfs_btree_node *)cbh->b_data;
1965 ckey = nilfs_btree_node_get_key(btree, cnode, 0);
1966 if (key < ckey)
1967 break;
1968 }
1969 list_add_tail(&bh->b_assoc_buffers, head);
1970}
1971
1972static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *bmap,
1973 struct list_head *listp)
1974{
1975 struct nilfs_btree *btree = (struct nilfs_btree *)bmap;
1976 struct address_space *btcache = &NILFS_BMAP_I(bmap)->i_btnode_cache;
1977 struct list_head lists[NILFS_BTREE_LEVEL_MAX];
1978 struct pagevec pvec;
1979 struct buffer_head *bh, *head;
1980 pgoff_t index = 0;
1981 int level, i;
1982
1983 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
1984 level < NILFS_BTREE_LEVEL_MAX;
1985 level++)
1986 INIT_LIST_HEAD(&lists[level]);
1987
1988 pagevec_init(&pvec, 0);
1989
1990 while (pagevec_lookup_tag(&pvec, btcache, &index, PAGECACHE_TAG_DIRTY,
1991 PAGEVEC_SIZE)) {
1992 for (i = 0; i < pagevec_count(&pvec); i++) {
1993 bh = head = page_buffers(pvec.pages[i]);
1994 do {
1995 if (buffer_dirty(bh))
1996 nilfs_btree_add_dirty_buffer(btree,
1997 lists, bh);
1998 } while ((bh = bh->b_this_page) != head);
1999 }
2000 pagevec_release(&pvec);
2001 cond_resched();
2002 }
2003
2004 for (level = NILFS_BTREE_LEVEL_NODE_MIN;
2005 level < NILFS_BTREE_LEVEL_MAX;
2006 level++)
2007 list_splice(&lists[level], listp->prev);
2008}
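/* Illustrative note: the function above drains the btnode cache's
 * dirty pages into per-level lists, keeping each list sorted by the
 * node's first key (see nilfs_btree_add_dirty_buffer), and finally
 * splices the levels in ascending order so that child nodes always
 * precede their parents on the caller's list.
 */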
2009
2010static int nilfs_btree_assign_p(struct nilfs_btree *btree,
2011 struct nilfs_btree_path *path,
2012 int level,
2013 struct buffer_head **bh,
2014 sector_t blocknr,
2015 union nilfs_binfo *binfo)
2016{
2017 struct nilfs_btree_node *parent;
2018 __u64 key;
2019 __u64 ptr;
2020 int ret;
2021
2022 parent = nilfs_btree_get_node(btree, path, level + 1);
2023 ptr = nilfs_btree_node_get_ptr(btree, parent,
2024 path[level + 1].bp_index);
2025 if (buffer_nilfs_node(*bh)) {
2026 path[level].bp_ctxt.oldkey = ptr;
2027 path[level].bp_ctxt.newkey = blocknr;
2028 path[level].bp_ctxt.bh = *bh;
2029 ret = nilfs_btnode_prepare_change_key(
2030 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
2031 &path[level].bp_ctxt);
2032 if (ret < 0)
2033 return ret;
2034 nilfs_btnode_commit_change_key(
2035 &NILFS_BMAP_I(&btree->bt_bmap)->i_btnode_cache,
2036 &path[level].bp_ctxt);
2037 *bh = path[level].bp_ctxt.bh;
2038 }
2039
2040 nilfs_btree_node_set_ptr(btree, parent,
2041 path[level + 1].bp_index, blocknr);
2042
2043 key = nilfs_btree_node_get_key(btree, parent,
2044 path[level + 1].bp_index);
2045 /* on-disk format */
2046 binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key);
2047 binfo->bi_dat.bi_level = level;
2048
2049 return 0;
2050}
2051
2052static int nilfs_btree_assign_v(struct nilfs_btree *btree,
2053 struct nilfs_btree_path *path,
2054 int level,
2055 struct buffer_head **bh,
2056 sector_t blocknr,
2057 union nilfs_binfo *binfo)
2058{
2059 struct nilfs_btree_node *parent;
2060 __u64 key;
2061 __u64 ptr;
2062 union nilfs_bmap_ptr_req req;
2063 int ret;
2064
2065 parent = nilfs_btree_get_node(btree, path, level + 1);
2066 ptr = nilfs_btree_node_get_ptr(btree, parent,
2067 path[level + 1].bp_index);
2068 req.bpr_ptr = ptr;
2069 ret = btree->bt_bmap.b_pops->bpop_prepare_start_ptr(&btree->bt_bmap,
2070 &req);
2071 if (ret < 0)
2072 return ret;
2073 btree->bt_bmap.b_pops->bpop_commit_start_ptr(&btree->bt_bmap,
2074 &req, blocknr);
2075
2076 key = nilfs_btree_node_get_key(btree, parent,
2077 path[level + 1].bp_index);
2078 /* on-disk format */
2079 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
2080 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
2081
2082 return 0;
2083}
2084
2085static int nilfs_btree_assign(struct nilfs_bmap *bmap,
2086 struct buffer_head **bh,
2087 sector_t blocknr,
2088 union nilfs_binfo *binfo)
2089{
2090 struct nilfs_btree *btree;
2091 struct nilfs_btree_path *path;
2092 struct nilfs_btree_node *node;
2093 __u64 key;
2094 int level, ret;
2095
2096 btree = (struct nilfs_btree *)bmap;
2097 path = nilfs_btree_alloc_path(btree);
2098 if (path == NULL)
2099 return -ENOMEM;
2100 nilfs_btree_init_path(btree, path);
2101
2102 if (buffer_nilfs_node(*bh)) {
2103 node = (struct nilfs_btree_node *)(*bh)->b_data;
2104 key = nilfs_btree_node_get_key(btree, node, 0);
2105 level = nilfs_btree_node_get_level(btree, node);
2106 } else {
2107 key = nilfs_bmap_data_get_key(bmap, *bh);
2108 level = NILFS_BTREE_LEVEL_DATA;
2109 }
2110
2111 ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1);
2112 if (ret < 0) {
2113 WARN_ON(ret == -ENOENT);
2114 goto out;
2115 }
2116
2117 ret = btree->bt_ops->btop_assign(btree, path, level, bh,
2118 blocknr, binfo);
2119
2120 out:
2121 nilfs_btree_clear_path(btree, path);
2122 nilfs_btree_free_path(btree, path);
2123
2124 return ret;
2125}
2126
2127static int nilfs_btree_assign_gc(struct nilfs_bmap *bmap,
2128 struct buffer_head **bh,
2129 sector_t blocknr,
2130 union nilfs_binfo *binfo)
2131{
2132 struct nilfs_btree *btree;
2133 struct nilfs_btree_node *node;
2134 __u64 key;
2135 int ret;
2136
2137 btree = (struct nilfs_btree *)bmap;
2138 ret = nilfs_bmap_move_v(bmap, (*bh)->b_blocknr, blocknr);
2139 if (ret < 0)
2140 return ret;
2141
2142 if (buffer_nilfs_node(*bh)) {
2143 node = (struct nilfs_btree_node *)(*bh)->b_data;
2144 key = nilfs_btree_node_get_key(btree, node, 0);
2145 } else
2146 key = nilfs_bmap_data_get_key(bmap, *bh);
2147
2148 /* on-disk format */
2149 binfo->bi_v.bi_vblocknr = cpu_to_le64((*bh)->b_blocknr);
2150 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
2151
2152 return 0;
2153}
2154
2155static int nilfs_btree_mark(struct nilfs_bmap *bmap, __u64 key, int level)
2156{
2157 struct buffer_head *bh;
2158 struct nilfs_btree *btree;
2159 struct nilfs_btree_path *path;
2160 __u64 ptr;
2161 int ret;
2162
2163 btree = (struct nilfs_btree *)bmap;
2164 path = nilfs_btree_alloc_path(btree);
2165 if (path == NULL)
2166 return -ENOMEM;
2167 nilfs_btree_init_path(btree, path);
2168
2169 ret = nilfs_btree_do_lookup(btree, path, key, &ptr, level + 1);
2170 if (ret < 0) {
2171 WARN_ON(ret == -ENOENT);
2172 goto out;
2173 }
2174 ret = nilfs_bmap_get_block(&btree->bt_bmap, ptr, &bh);
2175 if (ret < 0) {
2176 WARN_ON(ret == -ENOENT);
2177 goto out;
2178 }
2179
2180 if (!buffer_dirty(bh))
2181 nilfs_btnode_mark_dirty(bh);
2182 nilfs_bmap_put_block(&btree->bt_bmap, bh);
2183 if (!nilfs_bmap_dirty(&btree->bt_bmap))
2184 nilfs_bmap_set_dirty(&btree->bt_bmap);
2185
2186 out:
2187 nilfs_btree_clear_path(btree, path);
2188 nilfs_btree_free_path(btree, path);
2189 return ret;
2190}
2191
2192static const struct nilfs_bmap_operations nilfs_btree_ops = {
2193 .bop_lookup = nilfs_btree_lookup,
2194 .bop_insert = nilfs_btree_insert,
2195 .bop_delete = nilfs_btree_delete,
2196 .bop_clear = NULL,
2197
2198 .bop_propagate = nilfs_btree_propagate,
2199
2200 .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers,
2201
2202 .bop_assign = nilfs_btree_assign,
2203 .bop_mark = nilfs_btree_mark,
2204
2205 .bop_last_key = nilfs_btree_last_key,
2206 .bop_check_insert = NULL,
2207 .bop_check_delete = nilfs_btree_check_delete,
2208 .bop_gather_data = nilfs_btree_gather_data,
2209};
2210
2211static const struct nilfs_bmap_operations nilfs_btree_ops_gc = {
2212 .bop_lookup = NULL,
2213 .bop_insert = NULL,
2214 .bop_delete = NULL,
2215 .bop_clear = NULL,
2216
2217 .bop_propagate = nilfs_btree_propagate_gc,
2218
2219 .bop_lookup_dirty_buffers = nilfs_btree_lookup_dirty_buffers,
2220
2221 .bop_assign = nilfs_btree_assign_gc,
2222 .bop_mark = NULL,
2223
2224 .bop_last_key = NULL,
2225 .bop_check_insert = NULL,
2226 .bop_check_delete = NULL,
2227 .bop_gather_data = NULL,
2228};
2229
2230static const struct nilfs_btree_operations nilfs_btree_ops_v = {
2231 .btop_find_target = nilfs_btree_find_target_v,
2232 .btop_set_target = nilfs_btree_set_target_v,
2233 .btop_propagate = nilfs_btree_propagate_v,
2234 .btop_assign = nilfs_btree_assign_v,
2235};
2236
2237static const struct nilfs_btree_operations nilfs_btree_ops_p = {
2238 .btop_find_target = NULL,
2239 .btop_set_target = NULL,
2240 .btop_propagate = nilfs_btree_propagate_p,
2241 .btop_assign = nilfs_btree_assign_p,
2242};
2243
2244int nilfs_btree_init(struct nilfs_bmap *bmap, __u64 low, __u64 high)
2245{
2246 struct nilfs_btree *btree;
2247
2248 btree = (struct nilfs_btree *)bmap;
2249 bmap->b_ops = &nilfs_btree_ops;
2250 bmap->b_low = low;
2251 bmap->b_high = high;
2252 switch (bmap->b_inode->i_ino) {
2253 case NILFS_DAT_INO:
2254 btree->bt_ops = &nilfs_btree_ops_p;
2255 break;
2256 default:
2257 btree->bt_ops = &nilfs_btree_ops_v;
2258 break;
2259 }
2260
2261 return 0;
2262}
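/* Illustrative note: the DAT file itself maps keys straight to
 * physical block numbers, so it is wired to the _p operations; every
 * other inode goes through virtual block numbers translated by the
 * DAT, hence the _v operations in the default case above.
 */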
2263
2264void nilfs_btree_init_gc(struct nilfs_bmap *bmap)
2265{
2266 bmap->b_low = NILFS_BMAP_LARGE_LOW;
2267 bmap->b_high = NILFS_BMAP_LARGE_HIGH;
2268 bmap->b_ops = &nilfs_btree_ops_gc;
2269}
diff --git a/fs/nilfs2/btree.h b/fs/nilfs2/btree.h
new file mode 100644
index 000000000000..4766deb52fb1
--- /dev/null
+++ b/fs/nilfs2/btree.h
@@ -0,0 +1,117 @@
1/*
2 * btree.h - NILFS B-tree.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_BTREE_H
24#define _NILFS_BTREE_H
25
26#include <linux/types.h>
27#include <linux/buffer_head.h>
28#include <linux/list.h>
29#include <linux/nilfs2_fs.h>
30#include "btnode.h"
31#include "bmap.h"
32
33struct nilfs_btree;
34struct nilfs_btree_path;
35
36/**
37 * struct nilfs_btree_operations - B-tree operation table
38 */
39struct nilfs_btree_operations {
40 __u64 (*btop_find_target)(const struct nilfs_btree *,
41 const struct nilfs_btree_path *, __u64);
42 void (*btop_set_target)(struct nilfs_btree *, __u64, __u64);
43
44 struct the_nilfs *(*btop_get_nilfs)(struct nilfs_btree *);
45
46 int (*btop_propagate)(struct nilfs_btree *,
47 struct nilfs_btree_path *,
48 int,
49 struct buffer_head *);
50 int (*btop_assign)(struct nilfs_btree *,
51 struct nilfs_btree_path *,
52 int,
53 struct buffer_head **,
54 sector_t,
55 union nilfs_binfo *);
56};
57
58/**
59 * struct nilfs_btree_node - B-tree node
60 * @bn_flags: flags
61 * @bn_level: level
62 * @bn_nchildren: number of children
63 * @bn_pad: padding
64 */
65struct nilfs_btree_node {
66 __u8 bn_flags;
67 __u8 bn_level;
68 __le16 bn_nchildren;
69 __le32 bn_pad;
70};
71
72/* flags */
73#define NILFS_BTREE_NODE_ROOT 0x01
74
75/* level */
76#define NILFS_BTREE_LEVEL_DATA 0
77#define NILFS_BTREE_LEVEL_NODE_MIN (NILFS_BTREE_LEVEL_DATA + 1)
78#define NILFS_BTREE_LEVEL_MAX 14
79
80/**
81 * struct nilfs_btree - B-tree structure
82 * @bt_bmap: bmap base structure
83 * @bt_ops: B-tree operation table
84 */
85struct nilfs_btree {
86 struct nilfs_bmap bt_bmap;
87
88 /* B-tree-specific members */
89 const struct nilfs_btree_operations *bt_ops;
90};
91
92
93#define NILFS_BTREE_ROOT_SIZE NILFS_BMAP_SIZE
94#define NILFS_BTREE_ROOT_NCHILDREN_MAX \
95 ((NILFS_BTREE_ROOT_SIZE - sizeof(struct nilfs_btree_node)) / \
96 (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */)))
97#define NILFS_BTREE_ROOT_NCHILDREN_MIN 0
98#define NILFS_BTREE_NODE_EXTRA_PAD_SIZE (sizeof(__le64))
99#define NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize) \
100 (((nodesize) - sizeof(struct nilfs_btree_node) - \
101 NILFS_BTREE_NODE_EXTRA_PAD_SIZE) / \
102 (sizeof(__le64 /* dkey */) + sizeof(__le64 /* dptr */)))
103#define NILFS_BTREE_NODE_NCHILDREN_MIN(nodesize) \
104 ((NILFS_BTREE_NODE_NCHILDREN_MAX(nodesize) - 1) / 2 + 1)
105#define NILFS_BTREE_KEY_MIN ((__u64)0)
106#define NILFS_BTREE_KEY_MAX (~(__u64)0)
107
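/*
 * Worked example (illustrative): sizeof(struct nilfs_btree_node) is 8
 * bytes and the extra pad is another 8, so for a 4096-byte node
 * NILFS_BTREE_NODE_NCHILDREN_MAX(4096) = (4096 - 8 - 8) / 16 = 255,
 * and NILFS_BTREE_NODE_NCHILDREN_MIN(4096) = (255 - 1) / 2 + 1 = 128.
 */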
108
109int nilfs_btree_path_cache_init(void);
110void nilfs_btree_path_cache_destroy(void);
111int nilfs_btree_init(struct nilfs_bmap *, __u64, __u64);
112int nilfs_btree_convert_and_insert(struct nilfs_bmap *, __u64, __u64,
113 const __u64 *, const __u64 *,
114 int, __u64, __u64);
115void nilfs_btree_init_gc(struct nilfs_bmap *);
116
117#endif /* _NILFS_BTREE_H */
diff --git a/fs/nilfs2/cpfile.c b/fs/nilfs2/cpfile.c
new file mode 100644
index 000000000000..e90b60dfced9
--- /dev/null
+++ b/fs/nilfs2/cpfile.c
@@ -0,0 +1,925 @@
1/*
2 * cpfile.c - NILFS checkpoint file.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/kernel.h>
24#include <linux/fs.h>
25#include <linux/string.h>
26#include <linux/buffer_head.h>
27#include <linux/errno.h>
28#include <linux/nilfs2_fs.h>
29#include "mdt.h"
30#include "cpfile.h"
31
32
33static inline unsigned long
34nilfs_cpfile_checkpoints_per_block(const struct inode *cpfile)
35{
36 return NILFS_MDT(cpfile)->mi_entries_per_block;
37}
38
39/* block number from the beginning of the file */
40static unsigned long
41nilfs_cpfile_get_blkoff(const struct inode *cpfile, __u64 cno)
42{
43 __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1;
44 do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
45 return (unsigned long)tcno;
46}
47
48/* offset in block */
49static unsigned long
50nilfs_cpfile_get_offset(const struct inode *cpfile, __u64 cno)
51{
52 __u64 tcno = cno + NILFS_MDT(cpfile)->mi_first_entry_offset - 1;
53 return do_div(tcno, nilfs_cpfile_checkpoints_per_block(cpfile));
54}
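/* Example (illustrative): assuming mi_first_entry_offset == 1 and,
 * say, 64 checkpoints per block, checkpoint number cno lives in file
 * block cno / 64 at in-block index cno % 64; the two helpers above
 * compute exactly this quotient and remainder through do_div().
 */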
55
56static unsigned long
57nilfs_cpfile_checkpoints_in_block(const struct inode *cpfile,
58 __u64 curr,
59 __u64 max)
60{
61 return min_t(__u64,
62 nilfs_cpfile_checkpoints_per_block(cpfile) -
63 nilfs_cpfile_get_offset(cpfile, curr),
64 max - curr);
65}
66
67static inline int nilfs_cpfile_is_in_first(const struct inode *cpfile,
68 __u64 cno)
69{
70 return nilfs_cpfile_get_blkoff(cpfile, cno) == 0;
71}
72
73static unsigned int
74nilfs_cpfile_block_add_valid_checkpoints(const struct inode *cpfile,
75 struct buffer_head *bh,
76 void *kaddr,
77 unsigned int n)
78{
79 struct nilfs_checkpoint *cp = kaddr + bh_offset(bh);
80 unsigned int count;
81
82 count = le32_to_cpu(cp->cp_checkpoints_count) + n;
83 cp->cp_checkpoints_count = cpu_to_le32(count);
84 return count;
85}
86
87static unsigned int
88nilfs_cpfile_block_sub_valid_checkpoints(const struct inode *cpfile,
89 struct buffer_head *bh,
90 void *kaddr,
91 unsigned int n)
92{
93 struct nilfs_checkpoint *cp = kaddr + bh_offset(bh);
94 unsigned int count;
95
96 WARN_ON(le32_to_cpu(cp->cp_checkpoints_count) < n);
97 count = le32_to_cpu(cp->cp_checkpoints_count) - n;
98 cp->cp_checkpoints_count = cpu_to_le32(count);
99 return count;
100}
101
102static inline struct nilfs_cpfile_header *
103nilfs_cpfile_block_get_header(const struct inode *cpfile,
104 struct buffer_head *bh,
105 void *kaddr)
106{
107 return kaddr + bh_offset(bh);
108}
109
110static struct nilfs_checkpoint *
111nilfs_cpfile_block_get_checkpoint(const struct inode *cpfile, __u64 cno,
112 struct buffer_head *bh,
113 void *kaddr)
114{
115 return kaddr + bh_offset(bh) + nilfs_cpfile_get_offset(cpfile, cno) *
116 NILFS_MDT(cpfile)->mi_entry_size;
117}
118
119static void nilfs_cpfile_block_init(struct inode *cpfile,
120 struct buffer_head *bh,
121 void *kaddr)
122{
123 struct nilfs_checkpoint *cp = kaddr + bh_offset(bh);
124 size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
125 int n = nilfs_cpfile_checkpoints_per_block(cpfile);
126
127 while (n-- > 0) {
128 nilfs_checkpoint_set_invalid(cp);
129 cp = (void *)cp + cpsz;
130 }
131}
132
133static inline int nilfs_cpfile_get_header_block(struct inode *cpfile,
134 struct buffer_head **bhp)
135{
136 return nilfs_mdt_get_block(cpfile, 0, 0, NULL, bhp);
137}
138
139static inline int nilfs_cpfile_get_checkpoint_block(struct inode *cpfile,
140 __u64 cno,
141 int create,
142 struct buffer_head **bhp)
143{
144 return nilfs_mdt_get_block(cpfile,
145 nilfs_cpfile_get_blkoff(cpfile, cno),
146 create, nilfs_cpfile_block_init, bhp);
147}
148
149static inline int nilfs_cpfile_delete_checkpoint_block(struct inode *cpfile,
150 __u64 cno)
151{
152 return nilfs_mdt_delete_block(cpfile,
153 nilfs_cpfile_get_blkoff(cpfile, cno));
154}
155
156/**
157 * nilfs_cpfile_get_checkpoint - get a checkpoint
158 * @cpfile: inode of checkpoint file
159 * @cno: checkpoint number
160 * @create: create flag
161 * @cpp: pointer to a checkpoint
162 * @bhp: pointer to a buffer head
163 *
164 * Description: nilfs_cpfile_get_checkpoint() acquires the checkpoint
165 * specified by @cno. A new checkpoint will be created if @cno is the current
166 * checkpoint number and @create is nonzero.
167 *
168 * Return Value: On success, 0 is returned, and the checkpoint and the
169 * buffer head of the buffer on which the checkpoint is located are stored in
170 * the place pointed by @cpp and @bhp, respectively. On error, one of the
171 * following negative error codes is returned.
172 *
173 * %-EIO - I/O error.
174 *
175 * %-ENOMEM - Insufficient amount of memory available.
176 *
177 * %-ENOENT - No such checkpoint.
178 *
179 * %-EINVAL - invalid checkpoint.
180 */
181int nilfs_cpfile_get_checkpoint(struct inode *cpfile,
182 __u64 cno,
183 int create,
184 struct nilfs_checkpoint **cpp,
185 struct buffer_head **bhp)
186{
187 struct buffer_head *header_bh, *cp_bh;
188 struct nilfs_cpfile_header *header;
189 struct nilfs_checkpoint *cp;
190 void *kaddr;
191 int ret;
192
193 if (unlikely(cno < 1 || cno > nilfs_mdt_cno(cpfile) ||
194 (cno < nilfs_mdt_cno(cpfile) && create)))
195 return -EINVAL;
196
197 down_write(&NILFS_MDT(cpfile)->mi_sem);
198
199 ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
200 if (ret < 0)
201 goto out_sem;
202 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, create, &cp_bh);
203 if (ret < 0)
204 goto out_header;
205 kaddr = kmap(cp_bh->b_page);
206 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
207 if (nilfs_checkpoint_invalid(cp)) {
208 if (!create) {
209 kunmap(cp_bh->b_page);
210 brelse(cp_bh);
211 ret = -ENOENT;
212 goto out_header;
213 }
214 /* a newly-created checkpoint */
215 nilfs_checkpoint_clear_invalid(cp);
216 if (!nilfs_cpfile_is_in_first(cpfile, cno))
217 nilfs_cpfile_block_add_valid_checkpoints(cpfile, cp_bh,
218 kaddr, 1);
219 nilfs_mdt_mark_buffer_dirty(cp_bh);
220
221 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
222 header = nilfs_cpfile_block_get_header(cpfile, header_bh,
223 kaddr);
224 le64_add_cpu(&header->ch_ncheckpoints, 1);
225 kunmap_atomic(kaddr, KM_USER0);
226 nilfs_mdt_mark_buffer_dirty(header_bh);
227 nilfs_mdt_mark_dirty(cpfile);
228 }
229
230 if (cpp != NULL)
231 *cpp = cp;
232 *bhp = cp_bh;
233
234 out_header:
235 brelse(header_bh);
236
237 out_sem:
238 up_write(&NILFS_MDT(cpfile)->mi_sem);
239 return ret;
240}
241
242/**
243 * nilfs_cpfile_put_checkpoint - put a checkpoint
244 * @cpfile: inode of checkpoint file
245 * @cno: checkpoint number
246 * @bh: buffer head
247 *
248 * Description: nilfs_cpfile_put_checkpoint() releases the checkpoint
249 * specified by @cno. @bh must be the buffer head which has been returned by
250 * a previous call to nilfs_cpfile_get_checkpoint() with @cno.
251 */
252void nilfs_cpfile_put_checkpoint(struct inode *cpfile, __u64 cno,
253 struct buffer_head *bh)
254{
255 kunmap(bh->b_page);
256 brelse(bh);
257}
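/* Usage sketch (illustrative only): a typical caller pairs the two
 * functions above:
 *
 *	struct nilfs_checkpoint *cp;
 *	struct buffer_head *bh;
 *	int err;
 *
 *	err = nilfs_cpfile_get_checkpoint(cpfile, cno, 0, &cp, &bh);
 *	if (!err) {
 *		... read or update *cp here ...
 *		nilfs_cpfile_put_checkpoint(cpfile, cno, bh);
 *	}
 */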
258
259/**
260 * nilfs_cpfile_delete_checkpoints - delete checkpoints
261 * @cpfile: inode of checkpoint file
262 * @start: start checkpoint number
263 * @end: end checkpoint number
264 *
265 * Description: nilfs_cpfile_delete_checkpoints() deletes the checkpoints in
266 * the period from @start to @end, excluding @end itself. The checkpoints
267 * which have been already deleted are ignored.
268 *
269 * Return Value: On success, 0 is returned. On error, one of the following
270 * negative error codes is returned.
271 *
272 * %-EIO - I/O error.
273 *
274 * %-ENOMEM - Insufficient amount of memory available.
275 *
276 * %-EINVAL - invalid checkpoints.
277 */
278int nilfs_cpfile_delete_checkpoints(struct inode *cpfile,
279 __u64 start,
280 __u64 end)
281{
282 struct buffer_head *header_bh, *cp_bh;
283 struct nilfs_cpfile_header *header;
284 struct nilfs_checkpoint *cp;
285 size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
286 __u64 cno;
287 void *kaddr;
288 unsigned long tnicps;
289 int ret, ncps, nicps, count, i;
290
291 if (unlikely(start == 0 || start > end)) {
292 printk(KERN_ERR "%s: invalid range of checkpoint numbers: "
293 "[%llu, %llu)\n", __func__,
294 (unsigned long long)start, (unsigned long long)end);
295 return -EINVAL;
296 }
297
298 /* cannot delete the latest checkpoint */
299 if (start == nilfs_mdt_cno(cpfile) - 1)
300 return -EPERM;
301
302 down_write(&NILFS_MDT(cpfile)->mi_sem);
303
304 ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
305 if (ret < 0)
306 goto out_sem;
307 tnicps = 0;
308
309 for (cno = start; cno < end; cno += ncps) {
310 ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, end);
311 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
312 if (ret < 0) {
313 if (ret != -ENOENT)
314 goto out_sem;
315 /* skip hole */
316 ret = 0;
317 continue;
318 }
319
320 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
321 cp = nilfs_cpfile_block_get_checkpoint(
322 cpfile, cno, cp_bh, kaddr);
323 nicps = 0;
324 for (i = 0; i < ncps; i++, cp = (void *)cp + cpsz) {
325 WARN_ON(nilfs_checkpoint_snapshot(cp));
326 if (!nilfs_checkpoint_invalid(cp)) {
327 nilfs_checkpoint_set_invalid(cp);
328 nicps++;
329 }
330 }
331 if (nicps > 0) {
332 tnicps += nicps;
333 nilfs_mdt_mark_buffer_dirty(cp_bh);
334 nilfs_mdt_mark_dirty(cpfile);
335 if (!nilfs_cpfile_is_in_first(cpfile, cno) &&
336 (count = nilfs_cpfile_block_sub_valid_checkpoints(
337 cpfile, cp_bh, kaddr, nicps)) == 0) {
338 /* make hole */
339 kunmap_atomic(kaddr, KM_USER0);
340 brelse(cp_bh);
341 ret = nilfs_cpfile_delete_checkpoint_block(
342 cpfile, cno);
343 if (ret == 0)
344 continue;
345 printk(KERN_ERR "%s: cannot delete block\n",
346 __func__);
347 goto out_sem;
348 }
349 }
350
351 kunmap_atomic(kaddr, KM_USER0);
352 brelse(cp_bh);
353 }
354
355 if (tnicps > 0) {
356 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
357 header = nilfs_cpfile_block_get_header(cpfile, header_bh,
358 kaddr);
359 le64_add_cpu(&header->ch_ncheckpoints, -(u64)tnicps);
360 nilfs_mdt_mark_buffer_dirty(header_bh);
361 nilfs_mdt_mark_dirty(cpfile);
362 kunmap_atomic(kaddr, KM_USER0);
363 }
364 brelse(header_bh);
365
366 out_sem:
367 up_write(&NILFS_MDT(cpfile)->mi_sem);
368 return ret;
369}
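/* Example (illustrative): nilfs_cpfile_delete_checkpoints(cpfile, 2, 5)
 * invalidates checkpoints 2, 3 and 4 but not 5, and punches whole
 * blocks out of the file wherever no valid checkpoint remains.
 */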
370
371static void nilfs_cpfile_checkpoint_to_cpinfo(struct inode *cpfile,
372 struct nilfs_checkpoint *cp,
373 struct nilfs_cpinfo *ci)
374{
375 ci->ci_flags = le32_to_cpu(cp->cp_flags);
376 ci->ci_cno = le64_to_cpu(cp->cp_cno);
377 ci->ci_create = le64_to_cpu(cp->cp_create);
378 ci->ci_nblk_inc = le64_to_cpu(cp->cp_nblk_inc);
379 ci->ci_inodes_count = le64_to_cpu(cp->cp_inodes_count);
380 ci->ci_blocks_count = le64_to_cpu(cp->cp_blocks_count);
381 ci->ci_next = le64_to_cpu(cp->cp_snapshot_list.ssl_next);
382}
383
384static ssize_t nilfs_cpfile_do_get_cpinfo(struct inode *cpfile, __u64 *cnop,
385 struct nilfs_cpinfo *ci, size_t nci)
386{
387 struct nilfs_checkpoint *cp;
388 struct buffer_head *bh;
389 size_t cpsz = NILFS_MDT(cpfile)->mi_entry_size;
390 __u64 cur_cno = nilfs_mdt_cno(cpfile), cno = *cnop;
391 void *kaddr;
392 int n, ret;
393 int ncps, i;
394
395 if (cno == 0)
396 return -ENOENT; /* checkpoint number 0 is invalid */
397 down_read(&NILFS_MDT(cpfile)->mi_sem);
398
399 for (n = 0; cno < cur_cno && n < nci; cno += ncps) {
400 ncps = nilfs_cpfile_checkpoints_in_block(cpfile, cno, cur_cno);
401 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
402 if (ret < 0) {
403 if (ret != -ENOENT)
404 goto out;
405 continue; /* skip hole */
406 }
407
408 kaddr = kmap_atomic(bh->b_page, KM_USER0);
409 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
410 for (i = 0; i < ncps && n < nci; i++, cp = (void *)cp + cpsz) {
411 if (!nilfs_checkpoint_invalid(cp))
412 nilfs_cpfile_checkpoint_to_cpinfo(
413 cpfile, cp, &ci[n++]);
414 }
415 kunmap_atomic(kaddr, KM_USER0);
416 brelse(bh);
417 }
418
419 ret = n;
420 if (n > 0)
421 *cnop = ci[n - 1].ci_cno + 1;
422
423 out:
424 up_read(&NILFS_MDT(cpfile)->mi_sem);
425 return ret;
426}
427
428static ssize_t nilfs_cpfile_do_get_ssinfo(struct inode *cpfile, __u64 *cnop,
429 struct nilfs_cpinfo *ci, size_t nci)
430{
431 struct buffer_head *bh;
432 struct nilfs_cpfile_header *header;
433 struct nilfs_checkpoint *cp;
434 __u64 curr = *cnop, next;
435 unsigned long curr_blkoff, next_blkoff;
436 void *kaddr;
437 int n = 0, ret;
438
439 down_read(&NILFS_MDT(cpfile)->mi_sem);
440
441 if (curr == 0) {
442 ret = nilfs_cpfile_get_header_block(cpfile, &bh);
443 if (ret < 0)
444 goto out;
445 kaddr = kmap_atomic(bh->b_page, KM_USER0);
446 header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
447 curr = le64_to_cpu(header->ch_snapshot_list.ssl_next);
448 kunmap_atomic(kaddr, KM_USER0);
449 brelse(bh);
450 if (curr == 0) {
451 ret = 0;
452 goto out;
453 }
454 } else if (unlikely(curr == ~(__u64)0)) {
455 ret = 0;
456 goto out;
457 }
458
459 curr_blkoff = nilfs_cpfile_get_blkoff(cpfile, curr);
460 ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr, 0, &bh);
461 if (unlikely(ret < 0)) {
462 if (ret == -ENOENT)
463 ret = 0; /* No snapshots (started from a hole block) */
464 goto out;
465 }
466 kaddr = kmap_atomic(bh->b_page, KM_USER0);
467 while (n < nci) {
468 cp = nilfs_cpfile_block_get_checkpoint(cpfile, curr, bh, kaddr);
469 curr = ~(__u64)0; /* Terminator */
470 if (unlikely(nilfs_checkpoint_invalid(cp) ||
471 !nilfs_checkpoint_snapshot(cp)))
472 break;
473 nilfs_cpfile_checkpoint_to_cpinfo(cpfile, cp, &ci[n++]);
474 next = le64_to_cpu(cp->cp_snapshot_list.ssl_next);
475 if (next == 0)
476			break; /* reached the end of the snapshot list */
477
478 next_blkoff = nilfs_cpfile_get_blkoff(cpfile, next);
479 if (curr_blkoff != next_blkoff) {
480 kunmap_atomic(kaddr, KM_USER0);
481 brelse(bh);
482 ret = nilfs_cpfile_get_checkpoint_block(cpfile, next,
483 0, &bh);
484 if (unlikely(ret < 0)) {
485 WARN_ON(ret == -ENOENT);
486 goto out;
487 }
488 kaddr = kmap_atomic(bh->b_page, KM_USER0);
489 }
490 curr = next;
491 curr_blkoff = next_blkoff;
492 }
493 kunmap_atomic(kaddr, KM_USER0);
494 brelse(bh);
495 *cnop = curr;
496 ret = n;
497
498 out:
499 up_read(&NILFS_MDT(cpfile)->mi_sem);
500 return ret;
501}
502
503/**
504 * nilfs_cpfile_get_cpinfo - get information on checkpoints
505 * @cpfile: inode of checkpoint file
506 * @cnop: place to pass in and store the next checkpoint number
507 * @mode: NILFS_CHECKPOINT or NILFS_SNAPSHOT
508 * @ci: array of checkpoint info structures to fill in
509 * @nci: maximum number of entries @ci can hold
510 */
511ssize_t nilfs_cpfile_get_cpinfo(struct inode *cpfile, __u64 *cnop, int mode,
512 struct nilfs_cpinfo *ci, size_t nci)
513{
514 switch (mode) {
515 case NILFS_CHECKPOINT:
516 return nilfs_cpfile_do_get_cpinfo(cpfile, cnop, ci, nci);
517 case NILFS_SNAPSHOT:
518 return nilfs_cpfile_do_get_ssinfo(cpfile, cnop, ci, nci);
519 default:
520 return -EINVAL;
521 }
522}
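/* Usage sketch (illustrative only): callers iterate in batches and
 * resume from the checkpoint number stored back through @cnop:
 *
 *	struct nilfs_cpinfo ci[16];
 *	__u64 cno = 1;	(checkpoint numbers start at 1)
 *	ssize_t n;
 *
 *	while ((n = nilfs_cpfile_get_cpinfo(cpfile, &cno,
 *					    NILFS_CHECKPOINT, ci, 16)) > 0)
 *		... process n entries ...
 */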
523
524/**
525 * nilfs_cpfile_delete_checkpoint - delete a single checkpoint
526 * @cpfile: inode of checkpoint file
527 * @cno: checkpoint number to be deleted
528 */
529int nilfs_cpfile_delete_checkpoint(struct inode *cpfile, __u64 cno)
530{
531 struct nilfs_cpinfo ci;
532 __u64 tcno = cno;
533 ssize_t nci;
534 int ret;
535
536 nci = nilfs_cpfile_do_get_cpinfo(cpfile, &tcno, &ci, 1);
537 if (nci < 0)
538 return nci;
539 else if (nci == 0 || ci.ci_cno != cno)
540 return -ENOENT;
541
542 /* cannot delete the latest checkpoint nor snapshots */
543 ret = nilfs_cpinfo_snapshot(&ci);
544 if (ret < 0)
545 return ret;
546 else if (ret > 0 || cno == nilfs_mdt_cno(cpfile) - 1)
547 return -EPERM;
548
549 return nilfs_cpfile_delete_checkpoints(cpfile, cno, cno + 1);
550}
551
552static struct nilfs_snapshot_list *
553nilfs_cpfile_block_get_snapshot_list(const struct inode *cpfile,
554 __u64 cno,
555 struct buffer_head *bh,
556 void *kaddr)
557{
558 struct nilfs_cpfile_header *header;
559 struct nilfs_checkpoint *cp;
560 struct nilfs_snapshot_list *list;
561
562 if (cno != 0) {
563 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
564 list = &cp->cp_snapshot_list;
565 } else {
566 header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
567 list = &header->ch_snapshot_list;
568 }
569 return list;
570}
571
572static int nilfs_cpfile_set_snapshot(struct inode *cpfile, __u64 cno)
573{
574 struct buffer_head *header_bh, *curr_bh, *prev_bh, *cp_bh;
575 struct nilfs_cpfile_header *header;
576 struct nilfs_checkpoint *cp;
577 struct nilfs_snapshot_list *list;
578 __u64 curr, prev;
579 unsigned long curr_blkoff, prev_blkoff;
580 void *kaddr;
581 int ret;
582
583 if (cno == 0)
584 return -ENOENT; /* checkpoint number 0 is invalid */
585 down_write(&NILFS_MDT(cpfile)->mi_sem);
586
587 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
588 if (ret < 0)
589 goto out_sem;
590 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
591 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
592 if (nilfs_checkpoint_invalid(cp)) {
593 ret = -ENOENT;
594 kunmap_atomic(kaddr, KM_USER0);
595 goto out_cp;
596 }
597 if (nilfs_checkpoint_snapshot(cp)) {
598 ret = 0;
599 kunmap_atomic(kaddr, KM_USER0);
600 goto out_cp;
601 }
602 kunmap_atomic(kaddr, KM_USER0);
603
604 ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
605 if (ret < 0)
606 goto out_cp;
607 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
608 header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
609 list = &header->ch_snapshot_list;
610 curr_bh = header_bh;
611 get_bh(curr_bh);
612 curr = 0;
613 curr_blkoff = 0;
614 prev = le64_to_cpu(list->ssl_prev);
615 while (prev > cno) {
616 prev_blkoff = nilfs_cpfile_get_blkoff(cpfile, prev);
617 curr = prev;
618 if (curr_blkoff != prev_blkoff) {
619 kunmap_atomic(kaddr, KM_USER0);
620 brelse(curr_bh);
621 ret = nilfs_cpfile_get_checkpoint_block(cpfile, curr,
622 0, &curr_bh);
623 if (ret < 0)
624 goto out_header;
625 kaddr = kmap_atomic(curr_bh->b_page, KM_USER0);
626 }
627 curr_blkoff = prev_blkoff;
628 cp = nilfs_cpfile_block_get_checkpoint(
629 cpfile, curr, curr_bh, kaddr);
630 list = &cp->cp_snapshot_list;
631 prev = le64_to_cpu(list->ssl_prev);
632 }
633 kunmap_atomic(kaddr, KM_USER0);
634
635 if (prev != 0) {
636 ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0,
637 &prev_bh);
638 if (ret < 0)
639 goto out_curr;
640 } else {
641 prev_bh = header_bh;
642 get_bh(prev_bh);
643 }
644
645 kaddr = kmap_atomic(curr_bh->b_page, KM_USER0);
646 list = nilfs_cpfile_block_get_snapshot_list(
647 cpfile, curr, curr_bh, kaddr);
648 list->ssl_prev = cpu_to_le64(cno);
649 kunmap_atomic(kaddr, KM_USER0);
650
651 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
652 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
653 cp->cp_snapshot_list.ssl_next = cpu_to_le64(curr);
654 cp->cp_snapshot_list.ssl_prev = cpu_to_le64(prev);
655 nilfs_checkpoint_set_snapshot(cp);
656 kunmap_atomic(kaddr, KM_USER0);
657
658 kaddr = kmap_atomic(prev_bh->b_page, KM_USER0);
659 list = nilfs_cpfile_block_get_snapshot_list(
660 cpfile, prev, prev_bh, kaddr);
661 list->ssl_next = cpu_to_le64(cno);
662 kunmap_atomic(kaddr, KM_USER0);
663
664 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
665 header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
666 le64_add_cpu(&header->ch_nsnapshots, 1);
667 kunmap_atomic(kaddr, KM_USER0);
668
669 nilfs_mdt_mark_buffer_dirty(prev_bh);
670 nilfs_mdt_mark_buffer_dirty(curr_bh);
671 nilfs_mdt_mark_buffer_dirty(cp_bh);
672 nilfs_mdt_mark_buffer_dirty(header_bh);
673 nilfs_mdt_mark_dirty(cpfile);
674
675 brelse(prev_bh);
676
677 out_curr:
678 brelse(curr_bh);
679
680 out_header:
681 brelse(header_bh);
682
683 out_cp:
684 brelse(cp_bh);
685
686 out_sem:
687 up_write(&NILFS_MDT(cpfile)->mi_sem);
688 return ret;
689}
690
691static int nilfs_cpfile_clear_snapshot(struct inode *cpfile, __u64 cno)
692{
693 struct buffer_head *header_bh, *next_bh, *prev_bh, *cp_bh;
694 struct nilfs_cpfile_header *header;
695 struct nilfs_checkpoint *cp;
696 struct nilfs_snapshot_list *list;
697 __u64 next, prev;
698 void *kaddr;
699 int ret;
700
701 if (cno == 0)
702 return -ENOENT; /* checkpoint number 0 is invalid */
703 down_write(&NILFS_MDT(cpfile)->mi_sem);
704
705 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &cp_bh);
706 if (ret < 0)
707 goto out_sem;
708 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
709 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
710 if (nilfs_checkpoint_invalid(cp)) {
711 ret = -ENOENT;
712 kunmap_atomic(kaddr, KM_USER0);
713 goto out_cp;
714 }
715 if (!nilfs_checkpoint_snapshot(cp)) {
716 ret = 0;
717 kunmap_atomic(kaddr, KM_USER0);
718 goto out_cp;
719 }
720
721 list = &cp->cp_snapshot_list;
722 next = le64_to_cpu(list->ssl_next);
723 prev = le64_to_cpu(list->ssl_prev);
724 kunmap_atomic(kaddr, KM_USER0);
725
726 ret = nilfs_cpfile_get_header_block(cpfile, &header_bh);
727 if (ret < 0)
728 goto out_cp;
729 if (next != 0) {
730 ret = nilfs_cpfile_get_checkpoint_block(cpfile, next, 0,
731 &next_bh);
732 if (ret < 0)
733 goto out_header;
734 } else {
735 next_bh = header_bh;
736 get_bh(next_bh);
737 }
738 if (prev != 0) {
739 ret = nilfs_cpfile_get_checkpoint_block(cpfile, prev, 0,
740 &prev_bh);
741 if (ret < 0)
742 goto out_next;
743 } else {
744 prev_bh = header_bh;
745 get_bh(prev_bh);
746 }
747
748 kaddr = kmap_atomic(next_bh->b_page, KM_USER0);
749 list = nilfs_cpfile_block_get_snapshot_list(
750 cpfile, next, next_bh, kaddr);
751 list->ssl_prev = cpu_to_le64(prev);
752 kunmap_atomic(kaddr, KM_USER0);
753
754 kaddr = kmap_atomic(prev_bh->b_page, KM_USER0);
755 list = nilfs_cpfile_block_get_snapshot_list(
756 cpfile, prev, prev_bh, kaddr);
757 list->ssl_next = cpu_to_le64(next);
758 kunmap_atomic(kaddr, KM_USER0);
759
760 kaddr = kmap_atomic(cp_bh->b_page, KM_USER0);
761 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, cp_bh, kaddr);
762 cp->cp_snapshot_list.ssl_next = cpu_to_le64(0);
763 cp->cp_snapshot_list.ssl_prev = cpu_to_le64(0);
764 nilfs_checkpoint_clear_snapshot(cp);
765 kunmap_atomic(kaddr, KM_USER0);
766
767 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
768 header = nilfs_cpfile_block_get_header(cpfile, header_bh, kaddr);
769 le64_add_cpu(&header->ch_nsnapshots, -1);
770 kunmap_atomic(kaddr, KM_USER0);
771
772 nilfs_mdt_mark_buffer_dirty(next_bh);
773 nilfs_mdt_mark_buffer_dirty(prev_bh);
774 nilfs_mdt_mark_buffer_dirty(cp_bh);
775 nilfs_mdt_mark_buffer_dirty(header_bh);
776 nilfs_mdt_mark_dirty(cpfile);
777
778 brelse(prev_bh);
779
780 out_next:
781 brelse(next_bh);
782
783 out_header:
784 brelse(header_bh);
785
786 out_cp:
787 brelse(cp_bh);
788
789 out_sem:
790 up_write(&NILFS_MDT(cpfile)->mi_sem);
791 return ret;
792}
793
794/**
795 * nilfs_cpfile_is_snapshot - test whether a checkpoint is a snapshot
796 * @cpfile: inode of checkpoint file
797 * @cno: checkpoint number
798 *
799 * Description: checks whether the checkpoint specified by @cno is a snapshot.
800 *
801 * Return Value: On success, 1 is returned if the checkpoint specified by
802 * @cno is a snapshot, or 0 if not. On error, one of the following negative
803 * error codes is returned.
804 *
805 * %-EIO - I/O error.
806 *
807 * %-ENOMEM - Insufficient amount of memory available.
808 *
809 * %-ENOENT - No such checkpoint.
810 */
811int nilfs_cpfile_is_snapshot(struct inode *cpfile, __u64 cno)
812{
813 struct buffer_head *bh;
814 struct nilfs_checkpoint *cp;
815 void *kaddr;
816 int ret;
817
818 if (cno == 0)
819 return -ENOENT; /* checkpoint number 0 is invalid */
820 down_read(&NILFS_MDT(cpfile)->mi_sem);
821
822 ret = nilfs_cpfile_get_checkpoint_block(cpfile, cno, 0, &bh);
823 if (ret < 0)
824 goto out;
825 kaddr = kmap_atomic(bh->b_page, KM_USER0);
826 cp = nilfs_cpfile_block_get_checkpoint(cpfile, cno, bh, kaddr);
827 ret = nilfs_checkpoint_snapshot(cp);
828 kunmap_atomic(kaddr, KM_USER0);
829 brelse(bh);
830
831 out:
832 up_read(&NILFS_MDT(cpfile)->mi_sem);
833 return ret;
834}
835
836/**
837 * nilfs_cpfile_change_cpmode - change checkpoint mode
838 * @cpfile: inode of checkpoint file
839 * @cno: checkpoint number
840 * @mode: new mode of checkpoint
841 *
842 * Description: nilfs_cpfile_change_cpmode() changes the mode of the
843 * checkpoint specified by @cno to @mode, NILFS_CHECKPOINT or NILFS_SNAPSHOT.
844 *
845 * Return Value: On success, 0 is returned. On error, one of the following
846 * negative error codes is returned.
847 *
848 * %-EIO - I/O error.
849 *
850 * %-ENOMEM - Insufficient amount of memory available.
851 *
852 * %-ENOENT - No such checkpoint.
853 */
854int nilfs_cpfile_change_cpmode(struct inode *cpfile, __u64 cno, int mode)
855{
856 struct the_nilfs *nilfs;
857 int ret;
858
859 nilfs = NILFS_MDT(cpfile)->mi_nilfs;
860
861 switch (mode) {
862 case NILFS_CHECKPOINT:
863 /*
864 * Check for protecting existing snapshot mounts:
865 * bd_mount_sem is used to make this operation atomic and
866 * exclusive with a new mount job. Though it doesn't cover
867 * umount, it's enough for the purpose.
868 */
869 down(&nilfs->ns_bdev->bd_mount_sem);
870 if (nilfs_checkpoint_is_mounted(nilfs, cno, 1)) {
871 /* Current implementation does not have to protect
872 plain read-only mounts since they are exclusive
873 with a read/write mount and are protected from the
874 cleaner. */
875 ret = -EBUSY;
876 } else
877 ret = nilfs_cpfile_clear_snapshot(cpfile, cno);
878 up(&nilfs->ns_bdev->bd_mount_sem);
879 return ret;
880 case NILFS_SNAPSHOT:
881 return nilfs_cpfile_set_snapshot(cpfile, cno);
882 default:
883 return -EINVAL;
884 }
885}
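/* Example (illustrative): turning checkpoint 42 into a snapshot and
 * back:
 *
 *	err = nilfs_cpfile_change_cpmode(cpfile, 42, NILFS_SNAPSHOT);
 *	...
 *	err = nilfs_cpfile_change_cpmode(cpfile, 42, NILFS_CHECKPOINT);
 *
 * The second call may fail with -EBUSY while the snapshot is mounted.
 */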
886
887/**
888 * nilfs_cpfile_get_stat - get checkpoint statistics
889 * @cpfile: inode of checkpoint file
890 * @stat: pointer to a structure of checkpoint statistics
891 *
892 * Description: nilfs_cpfile_get_stat() returns information about checkpoints.
893 *
894 * Return Value: On success, 0 is returned, and checkpoints information is
895 * stored in the place pointed by @stat. On error, one of the following
896 * negative error codes is returned.
897 *
898 * %-EIO - I/O error.
899 *
900 * %-ENOMEM - Insufficient amount of memory available.
901 */
902int nilfs_cpfile_get_stat(struct inode *cpfile, struct nilfs_cpstat *cpstat)
903{
904 struct buffer_head *bh;
905 struct nilfs_cpfile_header *header;
906 void *kaddr;
907 int ret;
908
909 down_read(&NILFS_MDT(cpfile)->mi_sem);
910
911 ret = nilfs_cpfile_get_header_block(cpfile, &bh);
912 if (ret < 0)
913 goto out_sem;
914 kaddr = kmap_atomic(bh->b_page, KM_USER0);
915 header = nilfs_cpfile_block_get_header(cpfile, bh, kaddr);
916 cpstat->cs_cno = nilfs_mdt_cno(cpfile);
917 cpstat->cs_ncps = le64_to_cpu(header->ch_ncheckpoints);
918 cpstat->cs_nsss = le64_to_cpu(header->ch_nsnapshots);
919 kunmap_atomic(kaddr, KM_USER0);
920 brelse(bh);
921
922 out_sem:
923 up_read(&NILFS_MDT(cpfile)->mi_sem);
924 return ret;
925}
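/* Usage sketch (illustrative only):
 *
 *	struct nilfs_cpstat cpstat;
 *
 *	if (!nilfs_cpfile_get_stat(cpfile, &cpstat))
 *		printk(KERN_DEBUG "cno=%llu ncps=%llu nsss=%llu\n",
 *		       (unsigned long long)cpstat.cs_cno,
 *		       (unsigned long long)cpstat.cs_ncps,
 *		       (unsigned long long)cpstat.cs_nsss);
 */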
diff --git a/fs/nilfs2/cpfile.h b/fs/nilfs2/cpfile.h
new file mode 100644
index 000000000000..1a8a1008c342
--- /dev/null
+++ b/fs/nilfs2/cpfile.h
@@ -0,0 +1,45 @@
1/*
2 * cpfile.h - NILFS checkpoint file.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_CPFILE_H
24#define _NILFS_CPFILE_H
25
26#include <linux/fs.h>
27#include <linux/buffer_head.h>
28#include <linux/nilfs2_fs.h>
29
30#define NILFS_CPFILE_GFP NILFS_MDT_GFP
31
32
33int nilfs_cpfile_get_checkpoint(struct inode *, __u64, int,
34 struct nilfs_checkpoint **,
35 struct buffer_head **);
36void nilfs_cpfile_put_checkpoint(struct inode *, __u64, struct buffer_head *);
37int nilfs_cpfile_delete_checkpoints(struct inode *, __u64, __u64);
38int nilfs_cpfile_delete_checkpoint(struct inode *, __u64);
39int nilfs_cpfile_change_cpmode(struct inode *, __u64, int);
40int nilfs_cpfile_is_snapshot(struct inode *, __u64);
41int nilfs_cpfile_get_stat(struct inode *, struct nilfs_cpstat *);
42ssize_t nilfs_cpfile_get_cpinfo(struct inode *, __u64 *, int,
43 struct nilfs_cpinfo *, size_t);
44
45#endif /* _NILFS_CPFILE_H */
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
new file mode 100644
index 000000000000..bb8a5818e7f1
--- /dev/null
+++ b/fs/nilfs2/dat.c
@@ -0,0 +1,430 @@
1/*
2 * dat.c - NILFS disk address translation.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/types.h>
24#include <linux/buffer_head.h>
25#include <linux/string.h>
26#include <linux/errno.h>
27#include "nilfs.h"
28#include "mdt.h"
29#include "alloc.h"
30#include "dat.h"
31
32
33#define NILFS_CNO_MIN ((__u64)1)
34#define NILFS_CNO_MAX (~(__u64)0)
35
36static int nilfs_dat_prepare_entry(struct inode *dat,
37 struct nilfs_palloc_req *req, int create)
38{
39 return nilfs_palloc_get_entry_block(dat, req->pr_entry_nr,
40 create, &req->pr_entry_bh);
41}
42
43static void nilfs_dat_commit_entry(struct inode *dat,
44 struct nilfs_palloc_req *req)
45{
46 nilfs_mdt_mark_buffer_dirty(req->pr_entry_bh);
47 nilfs_mdt_mark_dirty(dat);
48 brelse(req->pr_entry_bh);
49}
50
51static void nilfs_dat_abort_entry(struct inode *dat,
52 struct nilfs_palloc_req *req)
53{
54 brelse(req->pr_entry_bh);
55}
56
57int nilfs_dat_prepare_alloc(struct inode *dat, struct nilfs_palloc_req *req)
58{
59 int ret;
60
61 ret = nilfs_palloc_prepare_alloc_entry(dat, req);
62 if (ret < 0)
63 return ret;
64
65 ret = nilfs_dat_prepare_entry(dat, req, 1);
66 if (ret < 0)
67 nilfs_palloc_abort_alloc_entry(dat, req);
68
69 return ret;
70}
71
72void nilfs_dat_commit_alloc(struct inode *dat, struct nilfs_palloc_req *req)
73{
74 struct nilfs_dat_entry *entry;
75 void *kaddr;
76
77 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
78 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
79 req->pr_entry_bh, kaddr);
80 entry->de_start = cpu_to_le64(NILFS_CNO_MIN);
81 entry->de_end = cpu_to_le64(NILFS_CNO_MAX);
82 entry->de_blocknr = cpu_to_le64(0);
83 kunmap_atomic(kaddr, KM_USER0);
84
85 nilfs_palloc_commit_alloc_entry(dat, req);
86 nilfs_dat_commit_entry(dat, req);
87}
88
89void nilfs_dat_abort_alloc(struct inode *dat, struct nilfs_palloc_req *req)
90{
91 nilfs_dat_abort_entry(dat, req);
92 nilfs_palloc_abort_alloc_entry(dat, req);
93}
94
95int nilfs_dat_prepare_free(struct inode *dat, struct nilfs_palloc_req *req)
96{
97 int ret;
98
99 ret = nilfs_palloc_prepare_free_entry(dat, req);
100 if (ret < 0)
101 return ret;
102 ret = nilfs_dat_prepare_entry(dat, req, 0);
103 if (ret < 0) {
104 nilfs_palloc_abort_free_entry(dat, req);
105 return ret;
106 }
107 return 0;
108}
109
110void nilfs_dat_commit_free(struct inode *dat, struct nilfs_palloc_req *req)
111{
112 struct nilfs_dat_entry *entry;
113 void *kaddr;
114
115 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
116 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
117 req->pr_entry_bh, kaddr);
118 entry->de_start = cpu_to_le64(NILFS_CNO_MIN);
119 entry->de_end = cpu_to_le64(NILFS_CNO_MIN);
120 entry->de_blocknr = cpu_to_le64(0);
121 kunmap_atomic(kaddr, KM_USER0);
122
123 nilfs_dat_commit_entry(dat, req);
124 nilfs_palloc_commit_free_entry(dat, req);
125}
126
127void nilfs_dat_abort_free(struct inode *dat, struct nilfs_palloc_req *req)
128{
129 nilfs_dat_abort_entry(dat, req);
130 nilfs_palloc_abort_free_entry(dat, req);
131}
132
133int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req)
134{
135 int ret;
136
137 ret = nilfs_dat_prepare_entry(dat, req, 0);
138 WARN_ON(ret == -ENOENT);
139 return ret;
140}
141
142void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
143 sector_t blocknr)
144{
145 struct nilfs_dat_entry *entry;
146 void *kaddr;
147
148 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
149 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
150 req->pr_entry_bh, kaddr);
151 entry->de_start = cpu_to_le64(nilfs_mdt_cno(dat));
152 if (entry->de_blocknr != cpu_to_le64(0) ||
153 entry->de_end != cpu_to_le64(NILFS_CNO_MAX)) {
154 printk(KERN_CRIT
155 "%s: vbn = %llu, start = %llu, end = %llu, pbn = %llu\n",
156 __func__, (unsigned long long)req->pr_entry_nr,
157 (unsigned long long)le64_to_cpu(entry->de_start),
158 (unsigned long long)le64_to_cpu(entry->de_end),
159 (unsigned long long)le64_to_cpu(entry->de_blocknr));
160 }
161 entry->de_blocknr = cpu_to_le64(blocknr);
162 kunmap_atomic(kaddr, KM_USER0);
163
164 nilfs_dat_commit_entry(dat, req);
165}
166
167void nilfs_dat_abort_start(struct inode *dat, struct nilfs_palloc_req *req)
168{
169 nilfs_dat_abort_entry(dat, req);
170}
171
172int nilfs_dat_prepare_end(struct inode *dat, struct nilfs_palloc_req *req)
173{
174 struct nilfs_dat_entry *entry;
175 __u64 start;
176 sector_t blocknr;
177 void *kaddr;
178 int ret;
179
180 ret = nilfs_dat_prepare_entry(dat, req, 0);
181 if (ret < 0) {
182 WARN_ON(ret == -ENOENT);
183 return ret;
184 }
185
186 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
187 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
188 req->pr_entry_bh, kaddr);
189 start = le64_to_cpu(entry->de_start);
190 blocknr = le64_to_cpu(entry->de_blocknr);
191 kunmap_atomic(kaddr, KM_USER0);
192
193 if (blocknr == 0) {
194 ret = nilfs_palloc_prepare_free_entry(dat, req);
195 if (ret < 0) {
196 nilfs_dat_abort_entry(dat, req);
197 return ret;
198 }
199 }
200
201 return 0;
202}
203
204void nilfs_dat_commit_end(struct inode *dat, struct nilfs_palloc_req *req,
205 int dead)
206{
207 struct nilfs_dat_entry *entry;
208 __u64 start, end;
209 sector_t blocknr;
210 void *kaddr;
211
212 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
213 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
214 req->pr_entry_bh, kaddr);
215 end = start = le64_to_cpu(entry->de_start);
216 if (!dead) {
217 end = nilfs_mdt_cno(dat);
218 WARN_ON(start > end);
219 }
220 entry->de_end = cpu_to_le64(end);
221 blocknr = le64_to_cpu(entry->de_blocknr);
222 kunmap_atomic(kaddr, KM_USER0);
223
224 if (blocknr == 0)
225 nilfs_dat_commit_free(dat, req);
226 else
227 nilfs_dat_commit_entry(dat, req);
228}
229
230void nilfs_dat_abort_end(struct inode *dat, struct nilfs_palloc_req *req)
231{
232 struct nilfs_dat_entry *entry;
233 __u64 start;
234 sector_t blocknr;
235 void *kaddr;
236
237 kaddr = kmap_atomic(req->pr_entry_bh->b_page, KM_USER0);
238 entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
239 req->pr_entry_bh, kaddr);
240 start = le64_to_cpu(entry->de_start);
241 blocknr = le64_to_cpu(entry->de_blocknr);
242 kunmap_atomic(kaddr, KM_USER0);
243
244 if (start == nilfs_mdt_cno(dat) && blocknr == 0)
245 nilfs_palloc_abort_free_entry(dat, req);
246 nilfs_dat_abort_entry(dat, req);
247}
248
249/**
250 * nilfs_dat_mark_dirty - mark the DAT entry block for a virtual block dirty
251 * @dat: DAT file inode
252 * @vblocknr: virtual block number
253 *
254 * Description: nilfs_dat_mark_dirty() looks up the DAT entry block that
255 * holds @vblocknr and marks both the buffer and the DAT inode dirty.
256 * Return Value: On success, 0 is returned. On error, one of the following
257 * negative error codes is returned.
258 *
259 * %-EIO - I/O error.
260 *
261 * %-ENOMEM - Insufficient amount of memory available.
262 */
263int nilfs_dat_mark_dirty(struct inode *dat, __u64 vblocknr)
264{
265 struct nilfs_palloc_req req;
266 int ret;
267
268 req.pr_entry_nr = vblocknr;
269 ret = nilfs_dat_prepare_entry(dat, &req, 0);
270 if (ret == 0)
271 nilfs_dat_commit_entry(dat, &req);
272 return ret;
273}
274
275/**
276 * nilfs_dat_freev - free virtual block numbers
277 * @dat: DAT file inode
278 * @vblocknrs: array of virtual block numbers
279 * @nitems: number of virtual block numbers
280 *
281 * Description: nilfs_dat_freev() frees the virtual block numbers specified by
282 * @vblocknrs and @nitems.
283 *
284 * Return Value: On success, 0 is returned. On error, one of the following
285 * negative error codes is returned.
286 *
287 * %-EIO - I/O error.
288 *
289 * %-ENOMEM - Insufficient amount of memory available.
290 *
291 * %-ENOENT - One of the virtual block numbers has not been allocated.
292 */
293int nilfs_dat_freev(struct inode *dat, __u64 *vblocknrs, size_t nitems)
294{
295 return nilfs_palloc_freev(dat, vblocknrs, nitems);
296}
297
298/**
299 * nilfs_dat_move - change a block number
300 * @dat: DAT file inode
301 * @vblocknr: virtual block number
302 * @blocknr: block number
303 *
304 * Description: nilfs_dat_move() changes the block number associated with
305 * @vblocknr to @blocknr.
306 *
307 * Return Value: On success, 0 is returned. On error, one of the following
308 * negative error codes is returned.
309 *
310 * %-EIO - I/O error.
311 *
312 * %-ENOMEM - Insufficient amount of memory available.
313 */
314int nilfs_dat_move(struct inode *dat, __u64 vblocknr, sector_t blocknr)
315{
316 struct buffer_head *entry_bh;
317 struct nilfs_dat_entry *entry;
318 void *kaddr;
319 int ret;
320
321 ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh);
322 if (ret < 0)
323 return ret;
324 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
325 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
326 if (unlikely(entry->de_blocknr == cpu_to_le64(0))) {
327 printk(KERN_CRIT "%s: vbn = %llu, [%llu, %llu)\n", __func__,
328 (unsigned long long)vblocknr,
329 (unsigned long long)le64_to_cpu(entry->de_start),
330 (unsigned long long)le64_to_cpu(entry->de_end));
331 kunmap_atomic(kaddr, KM_USER0);
332 brelse(entry_bh);
333 return -EINVAL;
334 }
335 WARN_ON(blocknr == 0);
336 entry->de_blocknr = cpu_to_le64(blocknr);
337 kunmap_atomic(kaddr, KM_USER0);
338
339 nilfs_mdt_mark_buffer_dirty(entry_bh);
340 nilfs_mdt_mark_dirty(dat);
341
342 brelse(entry_bh);
343
344 return 0;
345}
346
347/**
348 * nilfs_dat_translate - translate a virtual block number to a block number
349 * @dat: DAT file inode
350 * @vblocknr: virtual block number
351 * @blocknrp: pointer to a block number
352 *
353 * Description: nilfs_dat_translate() maps the virtual block number @vblocknr
354 * to the corresponding block number.
355 *
356 * Return Value: On success, 0 is returned and the block number associated
357 * with @vblocknr is stored in the place pointed by @blocknrp. On error, one
358 * of the following negative error codes is returned.
359 *
360 * %-EIO - I/O error.
361 *
362 * %-ENOMEM - Insufficient amount of memory available.
363 *
364 * %-ENOENT - A block number associated with @vblocknr does not exist.
365 */
366int nilfs_dat_translate(struct inode *dat, __u64 vblocknr, sector_t *blocknrp)
367{
368 struct buffer_head *entry_bh;
369 struct nilfs_dat_entry *entry;
370 sector_t blocknr;
371 void *kaddr;
372 int ret;
373
374 ret = nilfs_palloc_get_entry_block(dat, vblocknr, 0, &entry_bh);
375 if (ret < 0)
376 return ret;
377
378 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
379 entry = nilfs_palloc_block_get_entry(dat, vblocknr, entry_bh, kaddr);
380 blocknr = le64_to_cpu(entry->de_blocknr);
381 if (blocknr == 0) {
382 ret = -ENOENT;
383 goto out;
384 }
385 if (blocknrp != NULL)
386 *blocknrp = blocknr;
387
388 out:
389 kunmap_atomic(kaddr, KM_USER0);
390 brelse(entry_bh);
391 return ret;
392}
393
394ssize_t nilfs_dat_get_vinfo(struct inode *dat, struct nilfs_vinfo *vinfo,
395 size_t nvi)
396{
397 struct buffer_head *entry_bh;
398 struct nilfs_dat_entry *entry;
399 __u64 first, last;
400 void *kaddr;
401 unsigned long entries_per_block = NILFS_MDT(dat)->mi_entries_per_block;
402 int i, j, n, ret;
403
404 for (i = 0; i < nvi; i += n) {
405 ret = nilfs_palloc_get_entry_block(dat, vinfo[i].vi_vblocknr,
406 0, &entry_bh);
407 if (ret < 0)
408 return ret;
409 kaddr = kmap_atomic(entry_bh->b_page, KM_USER0);
410		/* first and last virtual block numbers in this block */
411 first = vinfo[i].vi_vblocknr;
412 do_div(first, entries_per_block);
413 first *= entries_per_block;
414 last = first + entries_per_block - 1;
415 for (j = i, n = 0;
416 j < nvi && vinfo[j].vi_vblocknr >= first &&
417 vinfo[j].vi_vblocknr <= last;
418 j++, n++) {
419 entry = nilfs_palloc_block_get_entry(
420 dat, vinfo[j].vi_vblocknr, entry_bh, kaddr);
421 vinfo[j].vi_start = le64_to_cpu(entry->de_start);
422 vinfo[j].vi_end = le64_to_cpu(entry->de_end);
423 vinfo[j].vi_blocknr = le64_to_cpu(entry->de_blocknr);
424 }
425 kunmap_atomic(kaddr, KM_USER0);
426 brelse(entry_bh);
427 }
428
429 return nvi;
430}
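The prepare/commit/abort triplets above give each DAT entry a two-phase update discipline: an entry records the checkpoint interval [de_start, de_end) during which its virtual block number is live, plus the physical block currently backing it. Below is a minimal, self-contained userspace sketch of that interval semantics; the types and helper names (dat_entry, dat_alloc, and so on) are illustrative stand-ins, not the kernel's own.

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

#define CNO_MIN ((uint64_t)1)		/* mirrors NILFS_CNO_MIN */
#define CNO_MAX (~(uint64_t)0)		/* mirrors NILFS_CNO_MAX */

struct dat_entry {			/* stand-in for struct nilfs_dat_entry */
	uint64_t start;			/* first checkpoint where the vbn is live */
	uint64_t end;			/* first checkpoint where it is dead */
	uint64_t blocknr;		/* current physical block, 0 = unassigned */
};

/* as in nilfs_dat_commit_alloc: a fresh entry spans [CNO_MIN, CNO_MAX) */
static void dat_alloc(struct dat_entry *e)
{
	e->start = CNO_MIN;
	e->end = CNO_MAX;
	e->blocknr = 0;
}

/* as in nilfs_dat_commit_start: bind a physical block from checkpoint cno on */
static void dat_start(struct dat_entry *e, uint64_t cno, uint64_t pbn)
{
	e->start = cno;
	e->blocknr = pbn;
}

/* as in nilfs_dat_commit_end: close the interval; a dead entry gets end == start */
static void dat_end(struct dat_entry *e, uint64_t cno, int dead)
{
	e->end = dead ? e->start : cno;
}

/* as in nilfs_dat_translate: blocknr == 0 means -ENOENT in the kernel code */
static int dat_translate(const struct dat_entry *e, uint64_t *pbn)
{
	if (e->blocknr == 0)
		return -1;
	*pbn = e->blocknr;
	return 0;
}

int main(void)
{
	struct dat_entry e;
	uint64_t pbn;

	dat_alloc(&e);
	dat_start(&e, 5, 1234);		/* block written at checkpoint 5 */
	dat_end(&e, 9, 0);		/* superseded at checkpoint 9 */
	if (dat_translate(&e, &pbn) == 0)
		printf("live in [%" PRIu64 ", %" PRIu64 "), pbn %" PRIu64 "\n",
		       e.start, e.end, pbn);
	return 0;
}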
diff --git a/fs/nilfs2/dat.h b/fs/nilfs2/dat.h
new file mode 100644
index 000000000000..d9560654a4b7
--- /dev/null
+++ b/fs/nilfs2/dat.h
@@ -0,0 +1,52 @@
1/*
2 * dat.h - NILFS disk address translation.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_DAT_H
24#define _NILFS_DAT_H
25
26#include <linux/types.h>
27#include <linux/buffer_head.h>
28#include <linux/fs.h>
29
30#define NILFS_DAT_GFP NILFS_MDT_GFP
31
32struct nilfs_palloc_req;
33
34int nilfs_dat_translate(struct inode *, __u64, sector_t *);
35
36int nilfs_dat_prepare_alloc(struct inode *, struct nilfs_palloc_req *);
37void nilfs_dat_commit_alloc(struct inode *, struct nilfs_palloc_req *);
38void nilfs_dat_abort_alloc(struct inode *, struct nilfs_palloc_req *);
39int nilfs_dat_prepare_start(struct inode *, struct nilfs_palloc_req *);
40void nilfs_dat_commit_start(struct inode *, struct nilfs_palloc_req *,
41 sector_t);
42void nilfs_dat_abort_start(struct inode *, struct nilfs_palloc_req *);
43int nilfs_dat_prepare_end(struct inode *, struct nilfs_palloc_req *);
44void nilfs_dat_commit_end(struct inode *, struct nilfs_palloc_req *, int);
45void nilfs_dat_abort_end(struct inode *, struct nilfs_palloc_req *);
46
47int nilfs_dat_mark_dirty(struct inode *, __u64);
48int nilfs_dat_freev(struct inode *, __u64 *, size_t);
49int nilfs_dat_move(struct inode *, __u64, sector_t);
50ssize_t nilfs_dat_get_vinfo(struct inode *, struct nilfs_vinfo *, size_t);
51
52#endif /* _NILFS_DAT_H */
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
new file mode 100644
index 000000000000..54100acc1102
--- /dev/null
+++ b/fs/nilfs2/dir.c
@@ -0,0 +1,711 @@
1/*
2 * dir.c - NILFS directory entry operations
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>
21 */
22/*
23 * linux/fs/ext2/dir.c
24 *
25 * Copyright (C) 1992, 1993, 1994, 1995
26 * Remy Card (card@masi.ibp.fr)
27 * Laboratoire MASI - Institut Blaise Pascal
28 * Universite Pierre et Marie Curie (Paris VI)
29 *
30 * from
31 *
32 * linux/fs/minix/dir.c
33 *
34 * Copyright (C) 1991, 1992 Linus Torvalds
35 *
36 * ext2 directory handling functions
37 *
38 * Big-endian to little-endian byte-swapping/bitmaps by
39 * David S. Miller (davem@caip.rutgers.edu), 1995
40 *
41 * All code that works with directory layout had been switched to pagecache
42 * and moved here. AV
43 */
44
45#include <linux/pagemap.h>
46#include <linux/smp_lock.h>
47#include "nilfs.h"
48#include "page.h"
49
50/*
51 * nilfs uses block-sized chunks. Arguably, sector-sized ones would be
52 * more robust, but we have what we have
53 */
54static inline unsigned nilfs_chunk_size(struct inode *inode)
55{
56 return inode->i_sb->s_blocksize;
57}
58
59static inline void nilfs_put_page(struct page *page)
60{
61 kunmap(page);
62 page_cache_release(page);
63}
64
65static inline unsigned long dir_pages(struct inode *inode)
66{
67 return (inode->i_size+PAGE_CACHE_SIZE-1)>>PAGE_CACHE_SHIFT;
68}
69
70/*
71 * Return the offset into page `page_nr' of the last valid
72 * byte in that page, plus one.
73 */
74static unsigned nilfs_last_byte(struct inode *inode, unsigned long page_nr)
75{
76 unsigned last_byte = inode->i_size;
77
78 last_byte -= page_nr << PAGE_CACHE_SHIFT;
79 if (last_byte > PAGE_CACHE_SIZE)
80 last_byte = PAGE_CACHE_SIZE;
81 return last_byte;
82}
83
84static int nilfs_prepare_chunk_uninterruptible(struct page *page,
85 struct address_space *mapping,
86 unsigned from, unsigned to)
87{
88 loff_t pos = page_offset(page) + from;
89 return block_write_begin(NULL, mapping, pos, to - from,
90 AOP_FLAG_UNINTERRUPTIBLE, &page,
91 NULL, nilfs_get_block);
92}
93
94static int nilfs_prepare_chunk(struct page *page,
95 struct address_space *mapping,
96 unsigned from, unsigned to)
97{
98 loff_t pos = page_offset(page) + from;
99 return block_write_begin(NULL, mapping, pos, to - from, 0, &page,
100 NULL, nilfs_get_block);
101}
102
103static int nilfs_commit_chunk(struct page *page,
104 struct address_space *mapping,
105 unsigned from, unsigned to)
106{
107 struct inode *dir = mapping->host;
108 struct nilfs_sb_info *sbi = NILFS_SB(dir->i_sb);
109 loff_t pos = page_offset(page) + from;
110 unsigned len = to - from;
111 unsigned nr_dirty, copied;
112 int err;
113
114 nr_dirty = nilfs_page_count_clean_buffers(page, from, to);
115 copied = block_write_end(NULL, mapping, pos, len, len, page, NULL);
116 if (pos + copied > dir->i_size) {
117 i_size_write(dir, pos + copied);
118 mark_inode_dirty(dir);
119 }
120 if (IS_DIRSYNC(dir))
121 nilfs_set_transaction_flag(NILFS_TI_SYNC);
122 err = nilfs_set_file_dirty(sbi, dir, nr_dirty);
123 unlock_page(page);
124 return err;
125}
126
127static void nilfs_check_page(struct page *page)
128{
129 struct inode *dir = page->mapping->host;
130 struct super_block *sb = dir->i_sb;
131 unsigned chunk_size = nilfs_chunk_size(dir);
132 char *kaddr = page_address(page);
133 unsigned offs, rec_len;
134 unsigned limit = PAGE_CACHE_SIZE;
135 struct nilfs_dir_entry *p;
136 char *error;
137
138 if ((dir->i_size >> PAGE_CACHE_SHIFT) == page->index) {
139 limit = dir->i_size & ~PAGE_CACHE_MASK;
140 if (limit & (chunk_size - 1))
141 goto Ebadsize;
142 if (!limit)
143 goto out;
144 }
145 for (offs = 0; offs <= limit - NILFS_DIR_REC_LEN(1); offs += rec_len) {
146 p = (struct nilfs_dir_entry *)(kaddr + offs);
147 rec_len = le16_to_cpu(p->rec_len);
148
149 if (rec_len < NILFS_DIR_REC_LEN(1))
150 goto Eshort;
151 if (rec_len & 3)
152 goto Ealign;
153 if (rec_len < NILFS_DIR_REC_LEN(p->name_len))
154 goto Enamelen;
155 if (((offs + rec_len - 1) ^ offs) & ~(chunk_size-1))
156 goto Espan;
157 }
158 if (offs != limit)
159 goto Eend;
160out:
161 SetPageChecked(page);
162 return;
163
164 /* Too bad, we had an error */
165
166Ebadsize:
167 nilfs_error(sb, "nilfs_check_page",
168 "size of directory #%lu is not a multiple of chunk size",
169 dir->i_ino
170 );
171 goto fail;
172Eshort:
173 error = "rec_len is smaller than minimal";
174 goto bad_entry;
175Ealign:
176 error = "unaligned directory entry";
177 goto bad_entry;
178Enamelen:
179 error = "rec_len is too small for name_len";
180 goto bad_entry;
181Espan:
182 error = "directory entry across blocks";
183bad_entry:
184 nilfs_error(sb, "nilfs_check_page", "bad entry in directory #%lu: %s - "
185 "offset=%lu, inode=%lu, rec_len=%d, name_len=%d",
186 dir->i_ino, error, (page->index<<PAGE_CACHE_SHIFT)+offs,
187 (unsigned long) le64_to_cpu(p->inode),
188 rec_len, p->name_len);
189 goto fail;
190Eend:
191 p = (struct nilfs_dir_entry *)(kaddr + offs);
192 nilfs_error(sb, "nilfs_check_page",
193 "entry in directory #%lu spans the page boundary"
194 "offset=%lu, inode=%lu",
195 dir->i_ino, (page->index<<PAGE_CACHE_SHIFT)+offs,
196 (unsigned long) le64_to_cpu(p->inode));
197fail:
198 SetPageChecked(page);
199 SetPageError(page);
200}
201
202static struct page *nilfs_get_page(struct inode *dir, unsigned long n)
203{
204 struct address_space *mapping = dir->i_mapping;
205 struct page *page = read_cache_page(mapping, n,
206 (filler_t *)mapping->a_ops->readpage, NULL);
207 if (!IS_ERR(page)) {
208 wait_on_page_locked(page);
209 kmap(page);
210 if (!PageUptodate(page))
211 goto fail;
212 if (!PageChecked(page))
213 nilfs_check_page(page);
214 if (PageError(page))
215 goto fail;
216 }
217 return page;
218
219fail:
220 nilfs_put_page(page);
221 return ERR_PTR(-EIO);
222}
223
224/*
225 * NOTE! unlike strncmp, nilfs_match returns 1 for success, 0 for failure.
226 *
227 * len <= NILFS_NAME_LEN and de != NULL are guaranteed by caller.
228 */
229static int
230nilfs_match(int len, const char * const name, struct nilfs_dir_entry *de)
231{
232 if (len != de->name_len)
233 return 0;
234 if (!de->inode)
235 return 0;
236 return !memcmp(name, de->name, len);
237}
238
239/*
240 * p is at least 6 bytes before the end of page
241 */
242static struct nilfs_dir_entry *nilfs_next_entry(struct nilfs_dir_entry *p)
243{
244 return (struct nilfs_dir_entry *)((char *)p + le16_to_cpu(p->rec_len));
245}
246
247static unsigned char
248nilfs_filetype_table[NILFS_FT_MAX] = {
249 [NILFS_FT_UNKNOWN] = DT_UNKNOWN,
250 [NILFS_FT_REG_FILE] = DT_REG,
251 [NILFS_FT_DIR] = DT_DIR,
252 [NILFS_FT_CHRDEV] = DT_CHR,
253 [NILFS_FT_BLKDEV] = DT_BLK,
254 [NILFS_FT_FIFO] = DT_FIFO,
255 [NILFS_FT_SOCK] = DT_SOCK,
256 [NILFS_FT_SYMLINK] = DT_LNK,
257};
258
259#define S_SHIFT 12
260static unsigned char
261nilfs_type_by_mode[S_IFMT >> S_SHIFT] = {
262 [S_IFREG >> S_SHIFT] = NILFS_FT_REG_FILE,
263 [S_IFDIR >> S_SHIFT] = NILFS_FT_DIR,
264 [S_IFCHR >> S_SHIFT] = NILFS_FT_CHRDEV,
265 [S_IFBLK >> S_SHIFT] = NILFS_FT_BLKDEV,
266 [S_IFIFO >> S_SHIFT] = NILFS_FT_FIFO,
267 [S_IFSOCK >> S_SHIFT] = NILFS_FT_SOCK,
268 [S_IFLNK >> S_SHIFT] = NILFS_FT_SYMLINK,
269};
270
271static void nilfs_set_de_type(struct nilfs_dir_entry *de, struct inode *inode)
272{
273 mode_t mode = inode->i_mode;
274
275 de->file_type = nilfs_type_by_mode[(mode & S_IFMT)>>S_SHIFT];
276}
277
278static int nilfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
279{
280 loff_t pos = filp->f_pos;
281 struct inode *inode = filp->f_dentry->d_inode;
282 struct super_block *sb = inode->i_sb;
283 unsigned int offset = pos & ~PAGE_CACHE_MASK;
284 unsigned long n = pos >> PAGE_CACHE_SHIFT;
285 unsigned long npages = dir_pages(inode);
286/* unsigned chunk_mask = ~(nilfs_chunk_size(inode)-1); */
287 unsigned char *types = NULL;
288 int ret;
289
290 if (pos > inode->i_size - NILFS_DIR_REC_LEN(1))
291 goto success;
292
293 types = nilfs_filetype_table;
294
295 for ( ; n < npages; n++, offset = 0) {
296 char *kaddr, *limit;
297 struct nilfs_dir_entry *de;
298 struct page *page = nilfs_get_page(inode, n);
299
300 if (IS_ERR(page)) {
301 nilfs_error(sb, __func__, "bad page in #%lu",
302 inode->i_ino);
303 filp->f_pos += PAGE_CACHE_SIZE - offset;
304 ret = -EIO;
305 goto done;
306 }
307 kaddr = page_address(page);
308 de = (struct nilfs_dir_entry *)(kaddr + offset);
309 limit = kaddr + nilfs_last_byte(inode, n) -
310 NILFS_DIR_REC_LEN(1);
311 for ( ; (char *)de <= limit; de = nilfs_next_entry(de)) {
312 if (de->rec_len == 0) {
313 nilfs_error(sb, __func__,
314 "zero-length directory entry");
315 ret = -EIO;
316 nilfs_put_page(page);
317 goto done;
318 }
319 if (de->inode) {
320 int over;
321 unsigned char d_type = DT_UNKNOWN;
322
323 if (types && de->file_type < NILFS_FT_MAX)
324 d_type = types[de->file_type];
325
326 offset = (char *)de - kaddr;
327 over = filldir(dirent, de->name, de->name_len,
328 (n<<PAGE_CACHE_SHIFT) | offset,
329 le64_to_cpu(de->inode), d_type);
330 if (over) {
331 nilfs_put_page(page);
332 goto success;
333 }
334 }
335 filp->f_pos += le16_to_cpu(de->rec_len);
336 }
337 nilfs_put_page(page);
338 }
339
340success:
341 ret = 0;
342done:
343 return ret;
344}
345
346/*
347 * nilfs_find_entry()
348 *
349 * finds an entry in the specified directory with the wanted name. It
350 * returns the page in which the entry was found, and the entry itself
351 * (as a parameter - res_dir). Page is returned mapped and unlocked.
352 * Entry is guaranteed to be valid.
353 */
354struct nilfs_dir_entry *
355nilfs_find_entry(struct inode *dir, struct dentry *dentry,
356 struct page **res_page)
357{
358 const char *name = dentry->d_name.name;
359 int namelen = dentry->d_name.len;
360 unsigned reclen = NILFS_DIR_REC_LEN(namelen);
361 unsigned long start, n;
362 unsigned long npages = dir_pages(dir);
363 struct page *page = NULL;
364 struct nilfs_inode_info *ei = NILFS_I(dir);
365 struct nilfs_dir_entry *de;
366
367 if (npages == 0)
368 goto out;
369
370 /* OFFSET_CACHE */
371 *res_page = NULL;
372
373 start = ei->i_dir_start_lookup;
374 if (start >= npages)
375 start = 0;
376 n = start;
377 do {
378 char *kaddr;
379 page = nilfs_get_page(dir, n);
380 if (!IS_ERR(page)) {
381 kaddr = page_address(page);
382 de = (struct nilfs_dir_entry *)kaddr;
383 kaddr += nilfs_last_byte(dir, n) - reclen;
384 while ((char *) de <= kaddr) {
385 if (de->rec_len == 0) {
386 nilfs_error(dir->i_sb, __func__,
387 "zero-length directory entry");
388 nilfs_put_page(page);
389 goto out;
390 }
391 if (nilfs_match(namelen, name, de))
392 goto found;
393 de = nilfs_next_entry(de);
394 }
395 nilfs_put_page(page);
396 }
397 if (++n >= npages)
398 n = 0;
399 /* next page is past the blocks we've got */
400 if (unlikely(n > (dir->i_blocks >> (PAGE_CACHE_SHIFT - 9)))) {
401 nilfs_error(dir->i_sb, __func__,
402 "dir %lu size %lld exceeds block cout %llu",
403 dir->i_ino, dir->i_size,
404 (unsigned long long)dir->i_blocks);
405 goto out;
406 }
407 } while (n != start);
408out:
409 return NULL;
410
411found:
412 *res_page = page;
413 ei->i_dir_start_lookup = n;
414 return de;
415}
416
417struct nilfs_dir_entry *nilfs_dotdot(struct inode *dir, struct page **p)
418{
419 struct page *page = nilfs_get_page(dir, 0);
420 struct nilfs_dir_entry *de = NULL;
421
422 if (!IS_ERR(page)) {
423 de = nilfs_next_entry(
424 (struct nilfs_dir_entry *)page_address(page));
425 *p = page;
426 }
427 return de;
428}
429
430ino_t nilfs_inode_by_name(struct inode *dir, struct dentry *dentry)
431{
432 ino_t res = 0;
433 struct nilfs_dir_entry *de;
434 struct page *page;
435
436 de = nilfs_find_entry(dir, dentry, &page);
437 if (de) {
438 res = le64_to_cpu(de->inode);
439 kunmap(page);
440 page_cache_release(page);
441 }
442 return res;
443}
444
445/* Releases the page */
446void nilfs_set_link(struct inode *dir, struct nilfs_dir_entry *de,
447 struct page *page, struct inode *inode)
448{
449 unsigned from = (char *) de - (char *) page_address(page);
450 unsigned to = from + le16_to_cpu(de->rec_len);
451 struct address_space *mapping = page->mapping;
452 int err;
453
454 lock_page(page);
455 err = nilfs_prepare_chunk_uninterruptible(page, mapping, from, to);
456 BUG_ON(err);
457 de->inode = cpu_to_le64(inode->i_ino);
458 nilfs_set_de_type(de, inode);
459 err = nilfs_commit_chunk(page, mapping, from, to);
460 nilfs_put_page(page);
461 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
462/* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
463 mark_inode_dirty(dir);
464}
465
466/*
467 * Parent is locked.
468 */
469int nilfs_add_link(struct dentry *dentry, struct inode *inode)
470{
471 struct inode *dir = dentry->d_parent->d_inode;
472 const char *name = dentry->d_name.name;
473 int namelen = dentry->d_name.len;
474 unsigned chunk_size = nilfs_chunk_size(dir);
475 unsigned reclen = NILFS_DIR_REC_LEN(namelen);
476 unsigned short rec_len, name_len;
477 struct page *page = NULL;
478 struct nilfs_dir_entry *de;
479 unsigned long npages = dir_pages(dir);
480 unsigned long n;
481 char *kaddr;
482 unsigned from, to;
483 int err;
484
485 /*
486 * We take care of directory expansion in the same loop.
487 * This code plays outside i_size, so it locks the page
488 * to protect that region.
489 */
490 for (n = 0; n <= npages; n++) {
491 char *dir_end;
492
493 page = nilfs_get_page(dir, n);
494 err = PTR_ERR(page);
495 if (IS_ERR(page))
496 goto out;
497 lock_page(page);
498 kaddr = page_address(page);
499 dir_end = kaddr + nilfs_last_byte(dir, n);
500 de = (struct nilfs_dir_entry *)kaddr;
501 kaddr += PAGE_CACHE_SIZE - reclen;
502 while ((char *)de <= kaddr) {
503 if ((char *)de == dir_end) {
504 /* We hit i_size */
505 name_len = 0;
506 rec_len = chunk_size;
507 de->rec_len = cpu_to_le16(chunk_size);
508 de->inode = 0;
509 goto got_it;
510 }
511 if (de->rec_len == 0) {
512 nilfs_error(dir->i_sb, __func__,
513 "zero-length directory entry");
514 err = -EIO;
515 goto out_unlock;
516 }
517 err = -EEXIST;
518 if (nilfs_match(namelen, name, de))
519 goto out_unlock;
520 name_len = NILFS_DIR_REC_LEN(de->name_len);
521 rec_len = le16_to_cpu(de->rec_len);
522 if (!de->inode && rec_len >= reclen)
523 goto got_it;
524 if (rec_len >= name_len + reclen)
525 goto got_it;
526 de = (struct nilfs_dir_entry *)((char *)de + rec_len);
527 }
528 unlock_page(page);
529 nilfs_put_page(page);
530 }
531 BUG();
532 return -EINVAL;
533
534got_it:
535 from = (char *)de - (char *)page_address(page);
536 to = from + rec_len;
537 err = nilfs_prepare_chunk(page, page->mapping, from, to);
538 if (err)
539 goto out_unlock;
540 if (de->inode) {
541 struct nilfs_dir_entry *de1;
542
543 de1 = (struct nilfs_dir_entry *)((char *)de + name_len);
544 de1->rec_len = cpu_to_le16(rec_len - name_len);
545 de->rec_len = cpu_to_le16(name_len);
546 de = de1;
547 }
548 de->name_len = namelen;
549 memcpy(de->name, name, namelen);
550 de->inode = cpu_to_le64(inode->i_ino);
551 nilfs_set_de_type(de, inode);
552 err = nilfs_commit_chunk(page, page->mapping, from, to);
553 dir->i_mtime = dir->i_ctime = CURRENT_TIME;
554/* NILFS_I(dir)->i_flags &= ~NILFS_BTREE_FL; */
555 mark_inode_dirty(dir);
556 /* OFFSET_CACHE */
557out_put:
558 nilfs_put_page(page);
559out:
560 return err;
561out_unlock:
562 unlock_page(page);
563 goto out_put;
564}
565
566/*
567 * nilfs_delete_entry deletes a directory entry by merging it with the
568 * previous entry. Page is up-to-date. Releases the page.
569 */
570int nilfs_delete_entry(struct nilfs_dir_entry *dir, struct page *page)
571{
572 struct address_space *mapping = page->mapping;
573 struct inode *inode = mapping->host;
574 char *kaddr = page_address(page);
575 unsigned from = ((char *)dir - kaddr) & ~(nilfs_chunk_size(inode) - 1);
576 unsigned to = ((char *)dir - kaddr) + le16_to_cpu(dir->rec_len);
577 struct nilfs_dir_entry *pde = NULL;
578 struct nilfs_dir_entry *de = (struct nilfs_dir_entry *)(kaddr + from);
579 int err;
580
581 while ((char *)de < (char *)dir) {
582 if (de->rec_len == 0) {
583 nilfs_error(inode->i_sb, __func__,
584 "zero-length directory entry");
585 err = -EIO;
586 goto out;
587 }
588 pde = de;
589 de = nilfs_next_entry(de);
590 }
591 if (pde)
592 from = (char *)pde - (char *)page_address(page);
593 lock_page(page);
594 err = nilfs_prepare_chunk(page, mapping, from, to);
595 BUG_ON(err);
596 if (pde)
597 pde->rec_len = cpu_to_le16(to - from);
598 dir->inode = 0;
599 err = nilfs_commit_chunk(page, mapping, from, to);
600 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
601/* NILFS_I(inode)->i_flags &= ~NILFS_BTREE_FL; */
602 mark_inode_dirty(inode);
603out:
604 nilfs_put_page(page);
605 return err;
606}
607
608/*
609 * Set the first fragment of directory.
610 */
611int nilfs_make_empty(struct inode *inode, struct inode *parent)
612{
613 struct address_space *mapping = inode->i_mapping;
614 struct page *page = grab_cache_page(mapping, 0);
615 unsigned chunk_size = nilfs_chunk_size(inode);
616 struct nilfs_dir_entry *de;
617 int err;
618 void *kaddr;
619
620 if (!page)
621 return -ENOMEM;
622
623 err = nilfs_prepare_chunk(page, mapping, 0, chunk_size);
624 if (unlikely(err)) {
625 unlock_page(page);
626 goto fail;
627 }
628 kaddr = kmap_atomic(page, KM_USER0);
629 memset(kaddr, 0, chunk_size);
630 de = (struct nilfs_dir_entry *)kaddr;
631 de->name_len = 1;
632 de->rec_len = cpu_to_le16(NILFS_DIR_REC_LEN(1));
633 memcpy(de->name, ".\0\0", 4);
634 de->inode = cpu_to_le64(inode->i_ino);
635 nilfs_set_de_type(de, inode);
636
637 de = (struct nilfs_dir_entry *)(kaddr + NILFS_DIR_REC_LEN(1));
638 de->name_len = 2;
639 de->rec_len = cpu_to_le16(chunk_size - NILFS_DIR_REC_LEN(1));
640 de->inode = cpu_to_le64(parent->i_ino);
641 memcpy(de->name, "..\0", 4);
642 nilfs_set_de_type(de, inode);
643 kunmap_atomic(kaddr, KM_USER0);
644 err = nilfs_commit_chunk(page, mapping, 0, chunk_size);
645fail:
646 page_cache_release(page);
647 return err;
648}
649
650/*
651 * routine to check that the specified directory is empty (for rmdir)
652 */
653int nilfs_empty_dir(struct inode *inode)
654{
655 struct page *page = NULL;
656 unsigned long i, npages = dir_pages(inode);
657
658 for (i = 0; i < npages; i++) {
659 char *kaddr;
660 struct nilfs_dir_entry *de;
661
662 page = nilfs_get_page(inode, i);
663 if (IS_ERR(page))
664 continue;
665
666 kaddr = page_address(page);
667 de = (struct nilfs_dir_entry *)kaddr;
668 kaddr += nilfs_last_byte(inode, i) - NILFS_DIR_REC_LEN(1);
669
670 while ((char *)de <= kaddr) {
671 if (de->rec_len == 0) {
672 nilfs_error(inode->i_sb, __func__,
673 "zero-length directory entry "
674 "(kaddr=%p, de=%p)\n", kaddr, de);
675 goto not_empty;
676 }
677 if (de->inode != 0) {
678 /* check for . and .. */
679 if (de->name[0] != '.')
680 goto not_empty;
681 if (de->name_len > 2)
682 goto not_empty;
683 if (de->name_len < 2) {
684 if (de->inode !=
685 cpu_to_le64(inode->i_ino))
686 goto not_empty;
687 } else if (de->name[1] != '.')
688 goto not_empty;
689 }
690 de = nilfs_next_entry(de);
691 }
692 nilfs_put_page(page);
693 }
694 return 1;
695
696not_empty:
697 nilfs_put_page(page);
698 return 0;
699}
700
701struct file_operations nilfs_dir_operations = {
702 .llseek = generic_file_llseek,
703 .read = generic_read_dir,
704 .readdir = nilfs_readdir,
705 .unlocked_ioctl = nilfs_ioctl,
706#ifdef CONFIG_COMPAT
707 .compat_ioctl = nilfs_ioctl,
708#endif /* CONFIG_COMPAT */
709 .fsync = nilfs_sync_file,
710
711};
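dir.c keeps ext2's chunked directory layout: each block-sized chunk is a chain of variable-length records, rec_len links one record to the next, and deletion merges a record into its predecessor by widening the predecessor's rec_len. The following self-contained sketch models that walk in plain C; the 8-byte record rounding in REC_LEN is an assumption standing in for NILFS_DIR_REC_LEN, whose exact definition lives in a header outside this patch.

#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define CHUNK_SIZE 512			/* stand-in for the fs block size */
/* illustrative record length: 12-byte header + name, rounded up to 8 bytes */
#define REC_LEN(name_len) (((name_len) + 12 + 7) & ~7)

struct dir_entry {			/* stand-in for struct nilfs_dir_entry */
	uint64_t inode;
	uint16_t rec_len;
	uint8_t  name_len;
	uint8_t  file_type;
	char     name[];
};

/* as in nilfs_next_entry: step by rec_len, not by sizeof */
static struct dir_entry *next_entry(struct dir_entry *de)
{
	return (struct dir_entry *)((char *)de + de->rec_len);
}

static void add_entry(char *chunk, unsigned off, uint64_t ino,
		      const char *name, uint16_t rec_len)
{
	struct dir_entry *de = (struct dir_entry *)(chunk + off);

	de->inode = ino;
	de->name_len = (uint8_t)strlen(name);
	de->rec_len = rec_len;
	memcpy(de->name, name, de->name_len);
}

int main(void)
{
	static char chunk[CHUNK_SIZE];
	struct dir_entry *de;

	/* as in nilfs_make_empty(): "." then ".." owning the rest of the chunk */
	add_entry(chunk, 0, 2, ".", REC_LEN(1));
	add_entry(chunk, REC_LEN(1), 1, "..", CHUNK_SIZE - REC_LEN(1));

	for (de = (struct dir_entry *)chunk;
	     (char *)de < chunk + CHUNK_SIZE && de->rec_len;
	     de = next_entry(de))
		printf("ino %llu  '%.*s'  rec_len %u\n",
		       (unsigned long long)de->inode,
		       de->name_len, de->name, de->rec_len);
	return 0;
}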
diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c
new file mode 100644
index 000000000000..c6379e482781
--- /dev/null
+++ b/fs/nilfs2/direct.c
@@ -0,0 +1,436 @@
1/*
2 * direct.c - NILFS direct block pointer.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/errno.h>
24#include "nilfs.h"
25#include "page.h"
26#include "direct.h"
27#include "alloc.h"
28
29static inline __le64 *nilfs_direct_dptrs(const struct nilfs_direct *direct)
30{
31 return (__le64 *)
32 ((struct nilfs_direct_node *)direct->d_bmap.b_u.u_data + 1);
33}
34
35static inline __u64
36nilfs_direct_get_ptr(const struct nilfs_direct *direct, __u64 key)
37{
38 return nilfs_bmap_dptr_to_ptr(*(nilfs_direct_dptrs(direct) + key));
39}
40
41static inline void nilfs_direct_set_ptr(struct nilfs_direct *direct,
42 __u64 key, __u64 ptr)
43{
44 *(nilfs_direct_dptrs(direct) + key) = nilfs_bmap_ptr_to_dptr(ptr);
45}
46
47static int nilfs_direct_lookup(const struct nilfs_bmap *bmap,
48 __u64 key, int level, __u64 *ptrp)
49{
50 struct nilfs_direct *direct;
51 __u64 ptr;
52
53 direct = (struct nilfs_direct *)bmap;
54 if ((key > NILFS_DIRECT_KEY_MAX) ||
55 (level != 1) || /* XXX: use macro for level 1 */
56 ((ptr = nilfs_direct_get_ptr(direct, key)) ==
57 NILFS_BMAP_INVALID_PTR))
58 return -ENOENT;
59
60 if (ptrp != NULL)
61 *ptrp = ptr;
62 return 0;
63}
64
65static __u64
66nilfs_direct_find_target_v(const struct nilfs_direct *direct, __u64 key)
67{
68 __u64 ptr;
69
70 ptr = nilfs_bmap_find_target_seq(&direct->d_bmap, key);
71 if (ptr != NILFS_BMAP_INVALID_PTR)
72 /* sequential access */
73 return ptr;
74 else
75 /* block group */
76 return nilfs_bmap_find_target_in_group(&direct->d_bmap);
77}
78
79static void nilfs_direct_set_target_v(struct nilfs_direct *direct,
80 __u64 key, __u64 ptr)
81{
82 direct->d_bmap.b_last_allocated_key = key;
83 direct->d_bmap.b_last_allocated_ptr = ptr;
84}
85
86static int nilfs_direct_prepare_insert(struct nilfs_direct *direct,
87 __u64 key,
88 union nilfs_bmap_ptr_req *req,
89 struct nilfs_bmap_stats *stats)
90{
91 int ret;
92
93 if (direct->d_ops->dop_find_target != NULL)
94 req->bpr_ptr = direct->d_ops->dop_find_target(direct, key);
95 ret = direct->d_bmap.b_pops->bpop_prepare_alloc_ptr(&direct->d_bmap,
96 req);
97 if (ret < 0)
98 return ret;
99
100 stats->bs_nblocks = 1;
101 return 0;
102}
103
104static void nilfs_direct_commit_insert(struct nilfs_direct *direct,
105 union nilfs_bmap_ptr_req *req,
106 __u64 key, __u64 ptr)
107{
108 struct buffer_head *bh;
109
110 /* ptr must be a pointer to a buffer head. */
111 bh = (struct buffer_head *)((unsigned long)ptr);
112 set_buffer_nilfs_volatile(bh);
113
114 if (direct->d_bmap.b_pops->bpop_commit_alloc_ptr != NULL)
115 direct->d_bmap.b_pops->bpop_commit_alloc_ptr(
116 &direct->d_bmap, req);
117 nilfs_direct_set_ptr(direct, key, req->bpr_ptr);
118
119 if (!nilfs_bmap_dirty(&direct->d_bmap))
120 nilfs_bmap_set_dirty(&direct->d_bmap);
121
122 if (direct->d_ops->dop_set_target != NULL)
123 direct->d_ops->dop_set_target(direct, key, req->bpr_ptr);
124}
125
126static int nilfs_direct_insert(struct nilfs_bmap *bmap, __u64 key, __u64 ptr)
127{
128 struct nilfs_direct *direct;
129 union nilfs_bmap_ptr_req req;
130 struct nilfs_bmap_stats stats;
131 int ret;
132
133 direct = (struct nilfs_direct *)bmap;
134 if (key > NILFS_DIRECT_KEY_MAX)
135 return -ENOENT;
136 if (nilfs_direct_get_ptr(direct, key) != NILFS_BMAP_INVALID_PTR)
137 return -EEXIST;
138
139 ret = nilfs_direct_prepare_insert(direct, key, &req, &stats);
140 if (ret < 0)
141 return ret;
142 nilfs_direct_commit_insert(direct, &req, key, ptr);
143 nilfs_bmap_add_blocks(bmap, stats.bs_nblocks);
144
145 return 0;
146}
147
148static int nilfs_direct_prepare_delete(struct nilfs_direct *direct,
149 union nilfs_bmap_ptr_req *req,
150 __u64 key,
151 struct nilfs_bmap_stats *stats)
152{
153 int ret;
154
155 if (direct->d_bmap.b_pops->bpop_prepare_end_ptr != NULL) {
156 req->bpr_ptr = nilfs_direct_get_ptr(direct, key);
157 ret = direct->d_bmap.b_pops->bpop_prepare_end_ptr(
158 &direct->d_bmap, req);
159 if (ret < 0)
160 return ret;
161 }
162
163 stats->bs_nblocks = 1;
164 return 0;
165}
166
167static void nilfs_direct_commit_delete(struct nilfs_direct *direct,
168 union nilfs_bmap_ptr_req *req,
169 __u64 key)
170{
171 if (direct->d_bmap.b_pops->bpop_commit_end_ptr != NULL)
172 direct->d_bmap.b_pops->bpop_commit_end_ptr(
173 &direct->d_bmap, req);
174 nilfs_direct_set_ptr(direct, key, NILFS_BMAP_INVALID_PTR);
175}
176
177static int nilfs_direct_delete(struct nilfs_bmap *bmap, __u64 key)
178{
179 struct nilfs_direct *direct;
180 union nilfs_bmap_ptr_req req;
181 struct nilfs_bmap_stats stats;
182 int ret;
183
184 direct = (struct nilfs_direct *)bmap;
185 if ((key > NILFS_DIRECT_KEY_MAX) ||
186 nilfs_direct_get_ptr(direct, key) == NILFS_BMAP_INVALID_PTR)
187 return -ENOENT;
188
189 ret = nilfs_direct_prepare_delete(direct, &req, key, &stats);
190 if (ret < 0)
191 return ret;
192 nilfs_direct_commit_delete(direct, &req, key);
193 nilfs_bmap_sub_blocks(bmap, stats.bs_nblocks);
194
195 return 0;
196}
197
198static int nilfs_direct_last_key(const struct nilfs_bmap *bmap, __u64 *keyp)
199{
200 struct nilfs_direct *direct;
201 __u64 key, lastkey;
202
203 direct = (struct nilfs_direct *)bmap;
204 lastkey = NILFS_DIRECT_KEY_MAX + 1;
205 for (key = NILFS_DIRECT_KEY_MIN; key <= NILFS_DIRECT_KEY_MAX; key++)
206 if (nilfs_direct_get_ptr(direct, key) !=
207 NILFS_BMAP_INVALID_PTR)
208 lastkey = key;
209
210 if (lastkey == NILFS_DIRECT_KEY_MAX + 1)
211 return -ENOENT;
212
213 *keyp = lastkey;
214
215 return 0;
216}
217
218static int nilfs_direct_check_insert(const struct nilfs_bmap *bmap, __u64 key)
219{
220 return key > NILFS_DIRECT_KEY_MAX;
221}
222
223static int nilfs_direct_gather_data(struct nilfs_bmap *bmap,
224 __u64 *keys, __u64 *ptrs, int nitems)
225{
226 struct nilfs_direct *direct;
227 __u64 key;
228 __u64 ptr;
229 int n;
230
231 direct = (struct nilfs_direct *)bmap;
232 if (nitems > NILFS_DIRECT_NBLOCKS)
233 nitems = NILFS_DIRECT_NBLOCKS;
234 n = 0;
235 for (key = 0; key < nitems; key++) {
236 ptr = nilfs_direct_get_ptr(direct, key);
237 if (ptr != NILFS_BMAP_INVALID_PTR) {
238 keys[n] = key;
239 ptrs[n] = ptr;
240 n++;
241 }
242 }
243 return n;
244}
245
246int nilfs_direct_delete_and_convert(struct nilfs_bmap *bmap,
247 __u64 key, __u64 *keys, __u64 *ptrs,
248 int n, __u64 low, __u64 high)
249{
250 struct nilfs_direct *direct;
251 __le64 *dptrs;
252 int ret, i, j;
253
254 /* no need to allocate any resource for conversion */
255
256 /* delete */
257 ret = bmap->b_ops->bop_delete(bmap, key);
258 if (ret < 0)
259 return ret;
260
261 /* free resources */
262 if (bmap->b_ops->bop_clear != NULL)
263 bmap->b_ops->bop_clear(bmap);
264
265 /* convert */
266 direct = (struct nilfs_direct *)bmap;
267 dptrs = nilfs_direct_dptrs(direct);
268 for (i = 0, j = 0; i < NILFS_DIRECT_NBLOCKS; i++) {
269 if ((j < n) && (i == keys[j])) {
270 dptrs[i] = (i != key) ?
271 nilfs_bmap_ptr_to_dptr(ptrs[j]) :
272 NILFS_BMAP_INVALID_PTR;
273 j++;
274 } else
275 dptrs[i] = NILFS_BMAP_INVALID_PTR;
276 }
277
278 nilfs_direct_init(bmap, low, high);
279
280 return 0;
281}
282
283static int nilfs_direct_propagate_v(struct nilfs_direct *direct,
284 struct buffer_head *bh)
285{
286 union nilfs_bmap_ptr_req oldreq, newreq;
287 __u64 key;
288 __u64 ptr;
289 int ret;
290
291 key = nilfs_bmap_data_get_key(&direct->d_bmap, bh);
292 ptr = nilfs_direct_get_ptr(direct, key);
293 if (!buffer_nilfs_volatile(bh)) {
294 oldreq.bpr_ptr = ptr;
295 newreq.bpr_ptr = ptr;
296 ret = nilfs_bmap_prepare_update(&direct->d_bmap, &oldreq,
297 &newreq);
298 if (ret < 0)
299 return ret;
300 nilfs_bmap_commit_update(&direct->d_bmap, &oldreq, &newreq);
301 set_buffer_nilfs_volatile(bh);
302 nilfs_direct_set_ptr(direct, key, newreq.bpr_ptr);
303 } else
304 ret = nilfs_bmap_mark_dirty(&direct->d_bmap, ptr);
305
306 return ret;
307}
308
309static int nilfs_direct_propagate(const struct nilfs_bmap *bmap,
310 struct buffer_head *bh)
311{
312 struct nilfs_direct *direct;
313
314 direct = (struct nilfs_direct *)bmap;
315 return (direct->d_ops->dop_propagate != NULL) ?
316 direct->d_ops->dop_propagate(direct, bh) :
317 0;
318}
319
320static int nilfs_direct_assign_v(struct nilfs_direct *direct,
321 __u64 key, __u64 ptr,
322 struct buffer_head **bh,
323 sector_t blocknr,
324 union nilfs_binfo *binfo)
325{
326 union nilfs_bmap_ptr_req req;
327 int ret;
328
329 req.bpr_ptr = ptr;
330 ret = direct->d_bmap.b_pops->bpop_prepare_start_ptr(
331 &direct->d_bmap, &req);
332 if (ret < 0)
333 return ret;
334 direct->d_bmap.b_pops->bpop_commit_start_ptr(&direct->d_bmap,
335 &req, blocknr);
336
337 binfo->bi_v.bi_vblocknr = nilfs_bmap_ptr_to_dptr(ptr);
338 binfo->bi_v.bi_blkoff = nilfs_bmap_key_to_dkey(key);
339
340 return 0;
341}
342
343static int nilfs_direct_assign_p(struct nilfs_direct *direct,
344 __u64 key, __u64 ptr,
345 struct buffer_head **bh,
346 sector_t blocknr,
347 union nilfs_binfo *binfo)
348{
349 nilfs_direct_set_ptr(direct, key, blocknr);
350
351 binfo->bi_dat.bi_blkoff = nilfs_bmap_key_to_dkey(key);
352 binfo->bi_dat.bi_level = 0;
353
354 return 0;
355}
356
357static int nilfs_direct_assign(struct nilfs_bmap *bmap,
358 struct buffer_head **bh,
359 sector_t blocknr,
360 union nilfs_binfo *binfo)
361{
362 struct nilfs_direct *direct;
363 __u64 key;
364 __u64 ptr;
365
366 direct = (struct nilfs_direct *)bmap;
367 key = nilfs_bmap_data_get_key(bmap, *bh);
368 if (unlikely(key > NILFS_DIRECT_KEY_MAX)) {
369 printk(KERN_CRIT "%s: invalid key: %llu\n", __func__,
370 (unsigned long long)key);
371 return -EINVAL;
372 }
373 ptr = nilfs_direct_get_ptr(direct, key);
374 if (unlikely(ptr == NILFS_BMAP_INVALID_PTR)) {
375 printk(KERN_CRIT "%s: invalid pointer: %llu\n", __func__,
376 (unsigned long long)ptr);
377 return -EINVAL;
378 }
379
380 return direct->d_ops->dop_assign(direct, key, ptr, bh,
381 blocknr, binfo);
382}
383
384static const struct nilfs_bmap_operations nilfs_direct_ops = {
385 .bop_lookup = nilfs_direct_lookup,
386 .bop_insert = nilfs_direct_insert,
387 .bop_delete = nilfs_direct_delete,
388 .bop_clear = NULL,
389
390 .bop_propagate = nilfs_direct_propagate,
391
392 .bop_lookup_dirty_buffers = NULL,
393
394 .bop_assign = nilfs_direct_assign,
395 .bop_mark = NULL,
396
397 .bop_last_key = nilfs_direct_last_key,
398 .bop_check_insert = nilfs_direct_check_insert,
399 .bop_check_delete = NULL,
400 .bop_gather_data = nilfs_direct_gather_data,
401};
402
403
404static const struct nilfs_direct_operations nilfs_direct_ops_v = {
405 .dop_find_target = nilfs_direct_find_target_v,
406 .dop_set_target = nilfs_direct_set_target_v,
407 .dop_propagate = nilfs_direct_propagate_v,
408 .dop_assign = nilfs_direct_assign_v,
409};
410
411static const struct nilfs_direct_operations nilfs_direct_ops_p = {
412 .dop_find_target = NULL,
413 .dop_set_target = NULL,
414 .dop_propagate = NULL,
415 .dop_assign = nilfs_direct_assign_p,
416};
417
418int nilfs_direct_init(struct nilfs_bmap *bmap, __u64 low, __u64 high)
419{
420 struct nilfs_direct *direct;
421
422 direct = (struct nilfs_direct *)bmap;
423 bmap->b_ops = &nilfs_direct_ops;
424 bmap->b_low = low;
425 bmap->b_high = high;
426 switch (bmap->b_inode->i_ino) {
427 case NILFS_DAT_INO:
428 direct->d_ops = &nilfs_direct_ops_p;
429 break;
430 default:
431 direct->d_ops = &nilfs_direct_ops_v;
432 break;
433 }
434
435 return 0;
436}
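direct.c selects between two operation tables at init time: files whose blocks go through the DAT use the _v table (virtual block numbers, allocation-target hints, propagation), while the DAT itself, addressed by physical block numbers, uses the mostly-empty _p table. The NULL-checked function-pointer dispatch is the load-bearing idiom; here is a minimal standalone rendering of it, with illustrative names:

#include <stdio.h>

struct mapping;

struct mapping_ops {			/* stand-in for nilfs_direct_operations */
	void (*set_target)(struct mapping *, unsigned long);
};

struct mapping {
	const struct mapping_ops *ops;
	unsigned long last_target;
};

static void set_target_v(struct mapping *m, unsigned long t)
{
	m->last_target = t;		/* virtual-block flavour keeps a hint */
}

static const struct mapping_ops ops_v = { .set_target = set_target_v };
static const struct mapping_ops ops_p = { .set_target = NULL };

int main(void)
{
	struct mapping m = { .ops = &ops_v };	/* swap in &ops_p for the DAT */

	/* callers must NULL-check, exactly as direct.c does before dop_* calls */
	if (m.ops->set_target)
		m.ops->set_target(&m, 42);
	printf("last target %lu\n", m.last_target);
	return 0;
}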
diff --git a/fs/nilfs2/direct.h b/fs/nilfs2/direct.h
new file mode 100644
index 000000000000..45d2c5cda812
--- /dev/null
+++ b/fs/nilfs2/direct.h
@@ -0,0 +1,78 @@
1/*
2 * direct.h - NILFS direct block pointer.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_DIRECT_H
24#define _NILFS_DIRECT_H
25
26#include <linux/types.h>
27#include <linux/buffer_head.h>
28#include "bmap.h"
29
30
31struct nilfs_direct;
32
33/**
34 * struct nilfs_direct_operations - direct mapping operation table
35 */
36struct nilfs_direct_operations {
37 __u64 (*dop_find_target)(const struct nilfs_direct *, __u64);
38 void (*dop_set_target)(struct nilfs_direct *, __u64, __u64);
39 int (*dop_propagate)(struct nilfs_direct *, struct buffer_head *);
40 int (*dop_assign)(struct nilfs_direct *, __u64, __u64,
41 struct buffer_head **, sector_t,
42 union nilfs_binfo *);
43};
44
45/**
46 * struct nilfs_direct_node - direct node
47 * @dn_flags: flags
48 * @pad: padding
49 */
50struct nilfs_direct_node {
51 __u8 dn_flags;
52 __u8 pad[7];
53};
54
55/**
56 * struct nilfs_direct - direct mapping
57 * @d_bmap: bmap structure
58 * @d_ops: direct mapping operation table
59 */
60struct nilfs_direct {
61 struct nilfs_bmap d_bmap;
62
63 /* direct-mapping-specific members */
64 const struct nilfs_direct_operations *d_ops;
65};
66
67
68#define NILFS_DIRECT_NBLOCKS (NILFS_BMAP_SIZE / sizeof(__le64) - 1)
69#define NILFS_DIRECT_KEY_MIN 0
70#define NILFS_DIRECT_KEY_MAX (NILFS_DIRECT_NBLOCKS - 1)
71
72
73int nilfs_direct_init(struct nilfs_bmap *, __u64, __u64);
74int nilfs_direct_delete_and_convert(struct nilfs_bmap *, __u64, __u64 *,
75 __u64 *, int, __u64, __u64);
76
77
78#endif /* _NILFS_DIRECT_H */
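NILFS_DIRECT_NBLOCKS above spends one __le64 slot of the inline bmap area on the nilfs_direct_node header (dn_flags plus padding) and uses the remainder as block pointers. NILFS_BMAP_SIZE is defined in bmap.h, outside this hunk, so the seven-slot figure in this sketch is an assumption used only to make the arithmetic concrete:

#include <stdio.h>
#include <stdint.h>

/* assumption: seven 64-bit slots in the inode's bmap area (see bmap.h) */
#define BMAP_SIZE (7 * sizeof(uint64_t))
/* mirrors NILFS_DIRECT_NBLOCKS: one slot is the direct-node header */
#define DIRECT_NBLOCKS (BMAP_SIZE / sizeof(uint64_t) - 1)

int main(void)
{
	printf("direct keys: %d..%zu (%zu blocks addressable inline)\n",
	       0, DIRECT_NBLOCKS - 1, DIRECT_NBLOCKS);
	/* past the last direct key the bmap is converted to a btree */
	return 0;
}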
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
new file mode 100644
index 000000000000..6bd84a0d8238
--- /dev/null
+++ b/fs/nilfs2/file.c
@@ -0,0 +1,160 @@
1/*
2 * file.c - NILFS regular file handling primitives including fsync().
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Amagai Yoshiji <amagai@osrg.net>,
21 * Ryusuke Konishi <ryusuke@osrg.net>
22 */
23
24#include <linux/fs.h>
25#include <linux/mm.h>
26#include <linux/writeback.h>
27#include "nilfs.h"
28#include "segment.h"
29
30int nilfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
31{
32 /*
33 * Called from fsync() system call
34	 * This is the only entry point that can catch write and sync
35 * timing for both data blocks and intermediate blocks.
36 *
37 * This function should be implemented when the writeback function
38 * will be implemented.
39 */
40 struct inode *inode = dentry->d_inode;
41 int err;
42
43 if (!nilfs_inode_dirty(inode))
44 return 0;
45
46 if (datasync)
47 err = nilfs_construct_dsync_segment(inode->i_sb, inode, 0,
48 LLONG_MAX);
49 else
50 err = nilfs_construct_segment(inode->i_sb);
51
52 return err;
53}
54
55static int nilfs_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
56{
57 struct page *page = vmf->page;
58 struct inode *inode = vma->vm_file->f_dentry->d_inode;
59 struct nilfs_transaction_info ti;
60 int ret;
61
62 if (unlikely(nilfs_near_disk_full(NILFS_SB(inode->i_sb)->s_nilfs)))
63 return VM_FAULT_SIGBUS; /* -ENOSPC */
64
65 lock_page(page);
66 if (page->mapping != inode->i_mapping ||
67 page_offset(page) >= i_size_read(inode) || !PageUptodate(page)) {
68 unlock_page(page);
69 return VM_FAULT_NOPAGE; /* make the VM retry the fault */
70 }
71
72 /*
73 * check to see if the page is mapped already (no holes)
74 */
75 if (PageMappedToDisk(page)) {
76 unlock_page(page);
77 goto mapped;
78 }
79 if (page_has_buffers(page)) {
80 struct buffer_head *bh, *head;
81 int fully_mapped = 1;
82
83 bh = head = page_buffers(page);
84 do {
85 if (!buffer_mapped(bh)) {
86 fully_mapped = 0;
87 break;
88 }
89 } while (bh = bh->b_this_page, bh != head);
90
91 if (fully_mapped) {
92 SetPageMappedToDisk(page);
93 unlock_page(page);
94 goto mapped;
95 }
96 }
97 unlock_page(page);
98
99 /*
100 * fill hole blocks
101 */
102 ret = nilfs_transaction_begin(inode->i_sb, &ti, 1);
103 /* never returns -ENOMEM, but may return -ENOSPC */
104 if (unlikely(ret))
105 return VM_FAULT_SIGBUS;
106
107 ret = block_page_mkwrite(vma, vmf, nilfs_get_block);
108 if (unlikely(ret)) {
109 nilfs_transaction_abort(inode->i_sb);
110 return ret;
111 }
112 nilfs_transaction_commit(inode->i_sb);
113
114 mapped:
115 SetPageChecked(page);
116 wait_on_page_writeback(page);
117 return 0;
118}
119
120struct vm_operations_struct nilfs_file_vm_ops = {
121 .fault = filemap_fault,
122 .page_mkwrite = nilfs_page_mkwrite,
123};
124
125static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
126{
127 file_accessed(file);
128 vma->vm_ops = &nilfs_file_vm_ops;
129 vma->vm_flags |= VM_CAN_NONLINEAR;
130 return 0;
131}
132
133/*
134 * We have mostly NULL's here: the current defaults are ok for
135 * the nilfs filesystem.
136 */
137struct file_operations nilfs_file_operations = {
138 .llseek = generic_file_llseek,
139 .read = do_sync_read,
140 .write = do_sync_write,
141 .aio_read = generic_file_aio_read,
142 .aio_write = generic_file_aio_write,
143 .unlocked_ioctl = nilfs_ioctl,
144#ifdef CONFIG_COMPAT
145 .compat_ioctl = nilfs_ioctl,
146#endif /* CONFIG_COMPAT */
147 .mmap = nilfs_file_mmap,
148 .open = generic_file_open,
149 /* .release = nilfs_release_file, */
150 .fsync = nilfs_sync_file,
151 .splice_read = generic_file_splice_read,
152};
153
154struct inode_operations nilfs_file_inode_operations = {
155 .truncate = nilfs_truncate,
156 .setattr = nilfs_setattr,
157 .permission = nilfs_permission,
158};
159
160/* end of file */
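nilfs_sync_file() is reached from both fsync(2) and fdatasync(2); the VFS passes datasync != 0 for the latter, which selects the lighter nilfs_construct_dsync_segment() path instead of a full segment construction. A trivial userspace illustration of the two entry points, using only standard POSIX calls:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char msg[] = "hello nilfs\n";
	int fd = open("testfile", O_CREAT | O_WRONLY | O_TRUNC, 0644);

	if (fd < 0 || write(fd, msg, strlen(msg)) < 0)
		return 1;
	fdatasync(fd);	/* datasync != 0 -> nilfs_construct_dsync_segment() */
	fsync(fd);	/* datasync == 0 -> nilfs_construct_segment()        */
	close(fd);
	return 0;
}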
diff --git a/fs/nilfs2/gcdat.c b/fs/nilfs2/gcdat.c
new file mode 100644
index 000000000000..93383c5cee90
--- /dev/null
+++ b/fs/nilfs2/gcdat.c
@@ -0,0 +1,84 @@
1/*
2 * gcdat.c - NILFS shadow DAT inode for GC
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>,
21 * and Ryusuke Konishi <ryusuke@osrg.net>.
22 *
23 */
24
25#include <linux/buffer_head.h>
26#include "nilfs.h"
27#include "page.h"
28#include "mdt.h"
29
30int nilfs_init_gcdat_inode(struct the_nilfs *nilfs)
31{
32 struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat;
33 struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat);
34 int err;
35
36 gcdat->i_state = 0;
37 gcdat->i_blocks = dat->i_blocks;
38 gii->i_flags = dii->i_flags;
39 gii->i_state = dii->i_state | (1 << NILFS_I_GCDAT);
40 gii->i_cno = 0;
41 nilfs_bmap_init_gcdat(gii->i_bmap, dii->i_bmap);
42 err = nilfs_copy_dirty_pages(gcdat->i_mapping, dat->i_mapping);
43 if (unlikely(err))
44 return err;
45
46 return nilfs_copy_dirty_pages(&gii->i_btnode_cache,
47 &dii->i_btnode_cache);
48}
49
50void nilfs_commit_gcdat_inode(struct the_nilfs *nilfs)
51{
52 struct inode *dat = nilfs->ns_dat, *gcdat = nilfs->ns_gc_dat;
53 struct nilfs_inode_info *dii = NILFS_I(dat), *gii = NILFS_I(gcdat);
54 struct address_space *mapping = dat->i_mapping;
55 struct address_space *gmapping = gcdat->i_mapping;
56
57 down_write(&NILFS_MDT(dat)->mi_sem);
58 dat->i_blocks = gcdat->i_blocks;
59 dii->i_flags = gii->i_flags;
60 dii->i_state = gii->i_state & ~(1 << NILFS_I_GCDAT);
61
62 nilfs_bmap_commit_gcdat(gii->i_bmap, dii->i_bmap);
63
64 nilfs_clear_dirty_pages(mapping);
65 nilfs_copy_back_pages(mapping, gmapping);
66 /* note: mdt dirty flags should be cleared by segctor. */
67
68 nilfs_clear_dirty_pages(&dii->i_btnode_cache);
69 nilfs_copy_back_pages(&dii->i_btnode_cache, &gii->i_btnode_cache);
70
71 up_write(&NILFS_MDT(dat)->mi_sem);
72}
73
74void nilfs_clear_gcdat_inode(struct the_nilfs *nilfs)
75{
76 struct inode *gcdat = nilfs->ns_gc_dat;
77 struct nilfs_inode_info *gii = NILFS_I(gcdat);
78
79 gcdat->i_state = I_CLEAR;
80 gii->i_flags = 0;
81
82 truncate_inode_pages(gcdat->i_mapping, 0);
83 truncate_inode_pages(&gii->i_btnode_cache, 0);
84}
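The three routines above implement a copy-on-the-side protocol: init duplicates the live DAT state into the shadow inode, GC mutates only the shadow, commit copies the shadow back over the live state, and clear discards the shadow's pages. A generic, self-contained model of this commit-or-discard pattern, with the GC work faked as a single mutation:

#include <stdio.h>
#include <string.h>

struct state {			/* stand-in for the DAT's mutable state */
	unsigned long blocks;
	unsigned long flags;
};

int main(void)
{
	struct state live = { .blocks = 100, .flags = 0x1 };
	struct state shadow;
	int gc_succeeded = 1;	/* flip to 0 to exercise the discard path */

	memcpy(&shadow, &live, sizeof(live));	/* nilfs_init_gcdat_inode   */
	shadow.blocks -= 10;			/* GC mutates the shadow only */

	if (gc_succeeded)
		memcpy(&live, &shadow, sizeof(live)); /* nilfs_commit_gcdat_inode */
	else
		memset(&shadow, 0, sizeof(shadow));   /* nilfs_clear_gcdat_inode  */

	printf("live blocks now %lu\n", live.blocks);
	return 0;
}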
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
new file mode 100644
index 000000000000..19d2102b6a69
--- /dev/null
+++ b/fs/nilfs2/gcinode.c
@@ -0,0 +1,288 @@
1/*
2 * gcinode.c - dummy inodes to buffer blocks for garbage collection
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Seiji Kihara <kihara@osrg.net>, Amagai Yoshiji <amagai@osrg.net>,
21 * and Ryusuke Konishi <ryusuke@osrg.net>.
22 * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
23 *
24 */
25/*
26 * This file adds the cache of on-disk blocks to be moved in garbage
27 * collection. The disk blocks are held with dummy inodes (called
28 * gcinodes), and this file provides lookup function of the dummy
29 * inodes and their buffer read function.
30 *
31 * Since NILFS2 retains multiple checkpoints/snapshots across GC, it
32 * has to handle blocks that belong to the same file but have different
33 * checkpoint numbers.  To avoid interference among generations, dummy
34 * inodes are managed separately from actual inodes, and their lookup
35 * function (nilfs_gc_iget) is designed to be specified with a
36 * checkpoint number argument as well as an inode number.
37 *
38 * Buffers and pages held by the dummy inodes will be released each
39 * time after they are copied to a new log. Dirty blocks made on the
40 * current generation and the blocks to be moved by GC never overlap
41 * because the dirty blocks make a new generation; they rather must be
42 * written individually.
43 */
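/*
 * Typical use, as in nilfs_ioctl_do_move_blocks() in ioctl.c: the
 * cleaner looks up one dummy inode per (ino, cno) pair, queues reads
 * for the blocks to be moved, and finally waits for and dirties each
 * buffer so that the segment constructor copies it into a new log:
 *
 *	inode = nilfs_gc_iget(nilfs, ino, cno);
 *	err = nilfs_gccache_submit_read_data(inode, blkoff, pbn, vbn, &bh);
 *	...
 *	err = nilfs_gccache_wait_and_mark_dirty(bh);
 */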
44
45#include <linux/buffer_head.h>
46#include <linux/mpage.h>
47#include <linux/hash.h>
48#include <linux/swap.h>
49#include "nilfs.h"
50#include "page.h"
51#include "mdt.h"
52#include "dat.h"
53#include "ifile.h"
54
55static struct address_space_operations def_gcinode_aops = {};
56/* XXX need def_gcinode_iops/fops? */
57
58/*
59 * nilfs_gccache_submit_read_data() - add data buffer and submit read request
60 * @inode - gc inode
61 * @blkoff - dummy offset treated as the key for the page cache
62 * @pbn - physical block number of the block
63 * @vbn - virtual block number of the block, 0 for non-virtual block
64 * @out_bh - indirect pointer to a buffer_head struct to receive the results
65 *
66 * Description: nilfs_gccache_submit_read_data() registers the data buffer
67 * specified by @pbn to the GC pagecache with the key @blkoff.
68 * This function sets @vbn (@pbn if @vbn is zero) in b_blocknr of the buffer.
69 *
70 * Return Value: On success, 0 is returned. On error, one of the following
71 * negative error codes is returned.
72 *
73 * %-EIO - I/O error.
74 *
75 * %-ENOMEM - Insufficient amount of memory available.
76 *
77 * %-ENOENT - The block specified with @pbn does not exist.
78 */
79int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff,
80 sector_t pbn, __u64 vbn,
81 struct buffer_head **out_bh)
82{
83 struct buffer_head *bh;
84 int err;
85
86 bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0);
87 if (unlikely(!bh))
88 return -ENOMEM;
89
90 if (buffer_uptodate(bh))
91 goto out;
92
93 if (pbn == 0) {
94 struct inode *dat_inode = NILFS_I_NILFS(inode)->ns_dat;
95 /* use original dat, not gc dat. */
96 err = nilfs_dat_translate(dat_inode, vbn, &pbn);
97 if (unlikely(err)) { /* -EIO, -ENOMEM, -ENOENT */
98 brelse(bh);
99 goto failed;
100 }
101 }
102
103 lock_buffer(bh);
104 if (buffer_uptodate(bh)) {
105 unlock_buffer(bh);
106 goto out;
107 }
108
109 if (!buffer_mapped(bh)) {
110 bh->b_bdev = NILFS_I_NILFS(inode)->ns_bdev;
111 set_buffer_mapped(bh);
112 }
113 bh->b_blocknr = pbn;
114 bh->b_end_io = end_buffer_read_sync;
115 get_bh(bh);
116 submit_bh(READ, bh);
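	/*
	 * Once the request is issued, b_blocknr is no longer needed for
	 * I/O; it is rewritten below to hold the virtual block number,
	 * as described in the function comment above.
	 */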
117 if (vbn)
118 bh->b_blocknr = vbn;
119 out:
120 err = 0;
121 *out_bh = bh;
122
123 failed:
124 unlock_page(bh->b_page);
125 page_cache_release(bh->b_page);
126 return err;
127}
128
129/*
130 * nilfs_gccache_submit_read_node() - add node buffer and submit read request
131 * @inode - gc inode
132 * @pbn - physical block number for the block
133 * @vbn - virtual block number for the block
134 * @out_bh - indirect pointer to a buffer_head struct to receive the results
135 *
136 * Description: nilfs_gccache_submit_read_node() registers the node buffer
137 * specified by @vbn to the GC pagecache. @pbn can be supplied by the
138 * caller to avoid translation of the disk block address.
139 *
140 * Return Value: On success, 0 is returned. On error, one of the following
141 * negative error codes is returned.
142 *
143 * %-EIO - I/O error.
144 *
145 * %-ENOMEM - Insufficient amount of memory available.
146 */
147int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn,
148 __u64 vbn, struct buffer_head **out_bh)
149{
150 int ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache,
151 vbn ? : pbn, pbn, out_bh, 0);
152 if (ret == -EEXIST) /* internal code (cache hit) */
153 ret = 0;
154 return ret;
155}
156
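/*
 * Wait for read completion on @bh and mark it dirty for the next log
 * write.  Returns -EEXIST when the buffer is already dirty, i.e. the
 * block was also modified in the current generation; the caller (see
 * nilfs_ioctl_do_move_blocks() in ioctl.c) treats this as a conflict.
 */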
157int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *bh)
158{
159 wait_on_buffer(bh);
160 if (!buffer_uptodate(bh))
161 return -EIO;
162 if (buffer_dirty(bh))
163 return -EEXIST;
164
165 if (buffer_nilfs_node(bh))
166 nilfs_btnode_mark_dirty(bh);
167 else
168 nilfs_mdt_mark_buffer_dirty(bh);
169 return 0;
170}
171
172/*
173 * nilfs_init_gccache() - allocate and initialize gc_inode hash table
174 * @nilfs - the_nilfs
175 *
176 * Return Value: On success, 0.
177 * On error, a negative error code is returned.
178 */
179int nilfs_init_gccache(struct the_nilfs *nilfs)
180{
181 int loop;
182
183 BUG_ON(nilfs->ns_gc_inodes_h);
184
185 INIT_LIST_HEAD(&nilfs->ns_gc_inodes);
186
187 nilfs->ns_gc_inodes_h =
188 kmalloc(sizeof(struct hlist_head) * NILFS_GCINODE_HASH_SIZE,
189 GFP_NOFS);
190 if (nilfs->ns_gc_inodes_h == NULL)
191 return -ENOMEM;
192
193 for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++)
194 INIT_HLIST_HEAD(&nilfs->ns_gc_inodes_h[loop]);
195 return 0;
196}
197
198/*
199 * nilfs_destroy_gccache() - free gc_inode hash table
200 * @nilfs - the nilfs
201 */
202void nilfs_destroy_gccache(struct the_nilfs *nilfs)
203{
204 if (nilfs->ns_gc_inodes_h) {
205 nilfs_remove_all_gcinode(nilfs);
206 kfree(nilfs->ns_gc_inodes_h);
207 nilfs->ns_gc_inodes_h = NULL;
208 }
209}
210
211static struct inode *alloc_gcinode(struct the_nilfs *nilfs, ino_t ino,
212 __u64 cno)
213{
214 struct inode *inode = nilfs_mdt_new_common(nilfs, NULL, ino, GFP_NOFS);
215 struct nilfs_inode_info *ii;
216
217 if (!inode)
218 return NULL;
219
220 inode->i_op = NULL;
221 inode->i_fop = NULL;
222 inode->i_mapping->a_ops = &def_gcinode_aops;
223
224 ii = NILFS_I(inode);
225 ii->i_cno = cno;
226 ii->i_flags = 0;
227 ii->i_state = 1 << NILFS_I_GCINODE;
228 ii->i_bh = NULL;
229 nilfs_bmap_init_gc(ii->i_bmap);
230
231 return inode;
232}
233
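/*
 * Hash the (ino, cno) pair into a single bucket index; the inode
 * number is shifted so that the same file at different checkpoints
 * falls into different buckets.
 */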
234static unsigned long ihash(ino_t ino, __u64 cno)
235{
236 return hash_long((unsigned long)((ino << 2) + cno),
237 NILFS_GCINODE_HASH_BITS);
238}
239
240/*
241 * nilfs_gc_iget() - find or create gc inode with specified (ino,cno)
242 */
243struct inode *nilfs_gc_iget(struct the_nilfs *nilfs, ino_t ino, __u64 cno)
244{
245 struct hlist_head *head = nilfs->ns_gc_inodes_h + ihash(ino, cno);
246 struct hlist_node *node;
247 struct inode *inode;
248
249 hlist_for_each_entry(inode, node, head, i_hash) {
250 if (inode->i_ino == ino && NILFS_I(inode)->i_cno == cno)
251 return inode;
252 }
253
254 inode = alloc_gcinode(nilfs, ino, cno);
255 if (likely(inode)) {
256 hlist_add_head(&inode->i_hash, head);
257 list_add(&NILFS_I(inode)->i_dirty, &nilfs->ns_gc_inodes);
258 }
259 return inode;
260}
261
262/*
263 * nilfs_clear_gcinode() - clear and free a gc inode
264 */
265void nilfs_clear_gcinode(struct inode *inode)
266{
267 nilfs_mdt_clear(inode);
268 nilfs_mdt_destroy(inode);
269}
270
271/*
272 * nilfs_remove_all_gcinode() - remove all gc inodes from the_nilfs
273 */
274void nilfs_remove_all_gcinode(struct the_nilfs *nilfs)
275{
276 struct hlist_head *head = nilfs->ns_gc_inodes_h;
277 struct hlist_node *node, *n;
278 struct inode *inode;
279 int loop;
280
281 for (loop = 0; loop < NILFS_GCINODE_HASH_SIZE; loop++, head++) {
282 hlist_for_each_entry_safe(inode, node, n, head, i_hash) {
283 hlist_del_init(&inode->i_hash);
284 list_del_init(&NILFS_I(inode)->i_dirty);
285 nilfs_clear_gcinode(inode); /* might sleep */
286 }
287 }
288}
diff --git a/fs/nilfs2/ifile.c b/fs/nilfs2/ifile.c
new file mode 100644
index 000000000000..de86401f209f
--- /dev/null
+++ b/fs/nilfs2/ifile.c
@@ -0,0 +1,150 @@
1/*
2 * ifile.c - NILFS inode file
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Amagai Yoshiji <amagai@osrg.net>.
21 * Revised by Ryusuke Konishi <ryusuke@osrg.net>.
22 *
23 */
24
25#include <linux/types.h>
26#include <linux/buffer_head.h>
27#include "nilfs.h"
28#include "mdt.h"
29#include "alloc.h"
30#include "ifile.h"
31
32/**
33 * nilfs_ifile_create_inode - create a new disk inode
34 * @ifile: ifile inode
35 * @out_ino: pointer to a variable to store inode number
36 * @out_bh: buffer_head contains newly allocated disk inode
37 *
38 * Return Value: On success, 0 is returned, the newly allocated inode
39 * number is stored in the place pointed to by @out_ino, and the
40 * buffer_head that contains the newly allocated disk inode structure is
41 * stored in the place pointed to by @out_bh.
42 * On error, one of the following negative error codes is returned.
43 *
44 * %-EIO - I/O error.
45 *
46 * %-ENOMEM - Insufficient amount of memory available.
47 *
48 * %-ENOSPC - No inode left.
49 */
50int nilfs_ifile_create_inode(struct inode *ifile, ino_t *out_ino,
51 struct buffer_head **out_bh)
52{
53 struct nilfs_palloc_req req;
54 int ret;
55
56	req.pr_entry_nr = 0;	/* 0 says find a free inode from the
57				   beginning of a group. dull code!! */
58 req.pr_entry_bh = NULL;
59
60 ret = nilfs_palloc_prepare_alloc_entry(ifile, &req);
61 if (!ret) {
62 ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 1,
63 &req.pr_entry_bh);
64 if (ret < 0)
65 nilfs_palloc_abort_alloc_entry(ifile, &req);
66 }
67 if (ret < 0) {
68 brelse(req.pr_entry_bh);
69 return ret;
70 }
71 nilfs_palloc_commit_alloc_entry(ifile, &req);
72 nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh);
73 nilfs_mdt_mark_dirty(ifile);
74 *out_ino = (ino_t)req.pr_entry_nr;
75 *out_bh = req.pr_entry_bh;
76 return 0;
77}
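/*
 * Typical caller, as in nilfs_new_inode() in inode.c:
 *
 *	err = nilfs_ifile_create_inode(sbi->s_ifile, &ino, &ii->i_bh);
 *	if (unlikely(err))
 *		goto failed_ifile_create_inode;
 */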
78
79/**
80 * nilfs_ifile_delete_inode - delete a disk inode
81 * @ifile: ifile inode
82 * @ino: inode number
83 *
84 * Return Value: On success, 0 is returned. On error, one of the following
85 * negative error codes is returned.
86 *
87 * %-EIO - I/O error.
88 *
89 * %-ENOMEM - Insufficient amount of memory available.
90 *
91 * %-ENOENT - The inode number @ino has not been allocated.
92 */
93int nilfs_ifile_delete_inode(struct inode *ifile, ino_t ino)
94{
95 struct nilfs_palloc_req req = {
96 .pr_entry_nr = ino, .pr_entry_bh = NULL
97 };
98 struct nilfs_inode *raw_inode;
99 void *kaddr;
100 int ret;
101
102 ret = nilfs_palloc_prepare_free_entry(ifile, &req);
103 if (!ret) {
104 ret = nilfs_palloc_get_entry_block(ifile, req.pr_entry_nr, 0,
105 &req.pr_entry_bh);
106 if (ret < 0)
107 nilfs_palloc_abort_free_entry(ifile, &req);
108 }
109 if (ret < 0) {
110 brelse(req.pr_entry_bh);
111 return ret;
112 }
113
114 kaddr = kmap_atomic(req.pr_entry_bh->b_page, KM_USER0);
115 raw_inode = nilfs_palloc_block_get_entry(ifile, req.pr_entry_nr,
116 req.pr_entry_bh, kaddr);
117 raw_inode->i_flags = 0;
118 kunmap_atomic(kaddr, KM_USER0);
119
120 nilfs_mdt_mark_buffer_dirty(req.pr_entry_bh);
121 brelse(req.pr_entry_bh);
122
123 nilfs_palloc_commit_free_entry(ifile, &req);
124
125 return 0;
126}
127
128int nilfs_ifile_get_inode_block(struct inode *ifile, ino_t ino,
129 struct buffer_head **out_bh)
130{
131 struct super_block *sb = ifile->i_sb;
132 int err;
133
134 if (unlikely(!NILFS_VALID_INODE(sb, ino))) {
135 nilfs_error(sb, __func__, "bad inode number: %lu",
136 (unsigned long) ino);
137 return -EINVAL;
138 }
139
140 err = nilfs_palloc_get_entry_block(ifile, ino, 0, out_bh);
141 if (unlikely(err)) {
142 if (err == -EINVAL)
143 nilfs_error(sb, __func__, "ifile is broken");
144 else
145 nilfs_warning(sb, __func__,
146 "unable to read inode: %lu",
147 (unsigned long) ino);
148 }
149 return err;
150}
diff --git a/fs/nilfs2/ifile.h b/fs/nilfs2/ifile.h
new file mode 100644
index 000000000000..5d30a35679b5
--- /dev/null
+++ b/fs/nilfs2/ifile.h
@@ -0,0 +1,53 @@
1/*
2 * ifile.h - NILFS inode file
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Amagai Yoshiji <amagai@osrg.net>
21 * Revised by Ryusuke Konishi <ryusuke@osrg.net>
22 *
23 */
24
25#ifndef _NILFS_IFILE_H
26#define _NILFS_IFILE_H
27
28#include <linux/fs.h>
29#include <linux/buffer_head.h>
30#include <linux/nilfs2_fs.h>
31#include "mdt.h"
32#include "alloc.h"
33
34#define NILFS_IFILE_GFP NILFS_MDT_GFP
35
36static inline struct nilfs_inode *
37nilfs_ifile_map_inode(struct inode *ifile, ino_t ino, struct buffer_head *ibh)
38{
39 void *kaddr = kmap(ibh->b_page);
40 return nilfs_palloc_block_get_entry(ifile, ino, ibh, kaddr);
41}
42
43static inline void nilfs_ifile_unmap_inode(struct inode *ifile, ino_t ino,
44 struct buffer_head *ibh)
45{
46 kunmap(ibh->b_page);
47}
48
49int nilfs_ifile_create_inode(struct inode *, ino_t *, struct buffer_head **);
50int nilfs_ifile_delete_inode(struct inode *, ino_t);
51int nilfs_ifile_get_inode_block(struct inode *, ino_t, struct buffer_head **);
52
53#endif /* _NILFS_IFILE_H */
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
new file mode 100644
index 000000000000..49ab4a49bb4f
--- /dev/null
+++ b/fs/nilfs2/inode.c
@@ -0,0 +1,785 @@
1/*
2 * inode.c - NILFS inode operations.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#include <linux/buffer_head.h>
25#include <linux/mpage.h>
26#include <linux/writeback.h>
27#include <linux/uio.h>
28#include "nilfs.h"
29#include "segment.h"
30#include "page.h"
31#include "mdt.h"
32#include "cpfile.h"
33#include "ifile.h"
34
35
36/**
37 * nilfs_get_block() - get a file block on the filesystem (callback function)
38 * @inode - inode struct of the target file
39 * @blkoff - file block number
40 * @bh_result - buffer head to be mapped on
41 * @create - whether to allocate the block when it has not been
42 * allocated yet.
43 *
44 * This function does not issue an actual read request for the specified
45 * data block; that is done by the VFS.
46 * Bulk read for direct-io is not supported yet. (should be supported)
47 */
48int nilfs_get_block(struct inode *inode, sector_t blkoff,
49 struct buffer_head *bh_result, int create)
50{
51 struct nilfs_inode_info *ii = NILFS_I(inode);
52 unsigned long blknum = 0;
53 int err = 0, ret;
54 struct inode *dat = nilfs_dat_inode(NILFS_I_NILFS(inode));
55
56 /* This exclusion control is a workaround; should be revised */
57 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
58 ret = nilfs_bmap_lookup(ii->i_bmap, (unsigned long)blkoff, &blknum);
59 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
60 if (ret == 0) { /* found */
61 map_bh(bh_result, inode->i_sb, blknum);
62 goto out;
63 }
64 /* data block was not found */
65 if (ret == -ENOENT && create) {
66 struct nilfs_transaction_info ti;
67
68 bh_result->b_blocknr = 0;
69 err = nilfs_transaction_begin(inode->i_sb, &ti, 1);
70 if (unlikely(err))
71 goto out;
72 err = nilfs_bmap_insert(ii->i_bmap, (unsigned long)blkoff,
73 (unsigned long)bh_result);
74 if (unlikely(err != 0)) {
75 if (err == -EEXIST) {
76 /*
77 * The get_block() function could be called
78 * from multiple callers for an inode.
79 * However, the page having this block must
80 * be locked in this case.
81 */
82 printk(KERN_WARNING
83 "nilfs_get_block: a race condition "
84 "while inserting a data block. "
85 "(inode number=%lu, file block "
86 "offset=%llu)\n",
87 inode->i_ino,
88 (unsigned long long)blkoff);
89 err = 0;
90 } else if (err == -EINVAL) {
91 nilfs_error(inode->i_sb, __func__,
92 "broken bmap (inode=%lu)\n",
93 inode->i_ino);
94 err = -EIO;
95 }
96 nilfs_transaction_abort(inode->i_sb);
97 goto out;
98 }
99 nilfs_transaction_commit(inode->i_sb); /* never fails */
100 /* Error handling should be detailed */
101 set_buffer_new(bh_result);
102 map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed
103 to proper value */
104 } else if (ret == -ENOENT) {
105		/* not found is not an error (e.g. a hole); must return
106		   without the mapped state flag. */
107 ;
108 } else {
109 err = ret;
110 }
111
112 out:
113 return err;
114}
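/*
 * nilfs_get_block() serves as the get_block_t callback for the
 * readpage/readpages, write_begin, direct-IO, and truncate paths that
 * follow in this file.
 */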
115
116/**
117 * nilfs_readpage() - implement readpage() method of nilfs_aops {}
118 * address_space_operations.
119 * @file - file struct of the file to be read
120 * @page - the page to be read
121 */
122static int nilfs_readpage(struct file *file, struct page *page)
123{
124 return mpage_readpage(page, nilfs_get_block);
125}
126
127/**
128 * nilfs_readpages() - implement readpages() method of nilfs_aops {}
129 * address_space_operations.
130 * @file - file struct of the file to be read
131 * @mapping - address_space struct used for reading multiple pages
132 * @pages - the pages to be read
133 * @nr_pages - number of pages to be read
134 */
135static int nilfs_readpages(struct file *file, struct address_space *mapping,
136 struct list_head *pages, unsigned nr_pages)
137{
138 return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block);
139}
140
141static int nilfs_writepages(struct address_space *mapping,
142 struct writeback_control *wbc)
143{
144 struct inode *inode = mapping->host;
145 int err = 0;
146
147 if (wbc->sync_mode == WB_SYNC_ALL)
148 err = nilfs_construct_dsync_segment(inode->i_sb, inode,
149 wbc->range_start,
150 wbc->range_end);
151 return err;
152}
153
154static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
155{
156 struct inode *inode = page->mapping->host;
157 int err;
158
159 redirty_page_for_writepage(wbc, page);
160 unlock_page(page);
161
162 if (wbc->sync_mode == WB_SYNC_ALL) {
163 err = nilfs_construct_segment(inode->i_sb);
164 if (unlikely(err))
165 return err;
166 } else if (wbc->for_reclaim)
167 nilfs_flush_segment(inode->i_sb, inode->i_ino);
168
169 return 0;
170}
171
172static int nilfs_set_page_dirty(struct page *page)
173{
174 int ret = __set_page_dirty_buffers(page);
175
176 if (ret) {
177 struct inode *inode = page->mapping->host;
178 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
179 unsigned nr_dirty = 1 << (PAGE_SHIFT - inode->i_blkbits);
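		/* every block in the page is accounted as newly dirtied */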
180
181 nilfs_set_file_dirty(sbi, inode, nr_dirty);
182 }
183 return ret;
184}
185
186static int nilfs_write_begin(struct file *file, struct address_space *mapping,
187 loff_t pos, unsigned len, unsigned flags,
188 struct page **pagep, void **fsdata)
189
190{
191 struct inode *inode = mapping->host;
192 int err = nilfs_transaction_begin(inode->i_sb, NULL, 1);
193
194 if (unlikely(err))
195 return err;
196
197 *pagep = NULL;
198 err = block_write_begin(file, mapping, pos, len, flags, pagep,
199 fsdata, nilfs_get_block);
200 if (unlikely(err))
201 nilfs_transaction_abort(inode->i_sb);
202 return err;
203}
204
205static int nilfs_write_end(struct file *file, struct address_space *mapping,
206 loff_t pos, unsigned len, unsigned copied,
207 struct page *page, void *fsdata)
208{
209 struct inode *inode = mapping->host;
210 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
211 unsigned nr_dirty;
212 int err;
213
214 nr_dirty = nilfs_page_count_clean_buffers(page, start,
215 start + copied);
216 copied = generic_write_end(file, mapping, pos, len, copied, page,
217 fsdata);
218 nilfs_set_file_dirty(NILFS_SB(inode->i_sb), inode, nr_dirty);
219 err = nilfs_transaction_commit(inode->i_sb);
220 return err ? : copied;
221}
222
223static ssize_t
224nilfs_direct_IO(int rw, struct kiocb *iocb, const struct iovec *iov,
225 loff_t offset, unsigned long nr_segs)
226{
227 struct file *file = iocb->ki_filp;
228 struct inode *inode = file->f_mapping->host;
229 ssize_t size;
230
231 if (rw == WRITE)
232 return 0;
233
234 /* Needs synchronization with the cleaner */
235 size = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
236 offset, nr_segs, nilfs_get_block, NULL);
237 return size;
238}
239
240struct address_space_operations nilfs_aops = {
241 .writepage = nilfs_writepage,
242 .readpage = nilfs_readpage,
243 /* .sync_page = nilfs_sync_page, */
244 .writepages = nilfs_writepages,
245 .set_page_dirty = nilfs_set_page_dirty,
246 .readpages = nilfs_readpages,
247 .write_begin = nilfs_write_begin,
248 .write_end = nilfs_write_end,
249 /* .releasepage = nilfs_releasepage, */
250 .invalidatepage = block_invalidatepage,
251 .direct_IO = nilfs_direct_IO,
252};
253
254struct inode *nilfs_new_inode(struct inode *dir, int mode)
255{
256 struct super_block *sb = dir->i_sb;
257 struct nilfs_sb_info *sbi = NILFS_SB(sb);
258 struct inode *inode;
259 struct nilfs_inode_info *ii;
260 int err = -ENOMEM;
261 ino_t ino;
262
263 inode = new_inode(sb);
264 if (unlikely(!inode))
265 goto failed;
266
267 mapping_set_gfp_mask(inode->i_mapping,
268 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
269
270 ii = NILFS_I(inode);
271 ii->i_state = 1 << NILFS_I_NEW;
272
273 err = nilfs_ifile_create_inode(sbi->s_ifile, &ino, &ii->i_bh);
274 if (unlikely(err))
275 goto failed_ifile_create_inode;
276 /* reference count of i_bh inherits from nilfs_mdt_read_block() */
277
278 atomic_inc(&sbi->s_inodes_count);
279
280 inode->i_uid = current_fsuid();
281 if (dir->i_mode & S_ISGID) {
282 inode->i_gid = dir->i_gid;
283 if (S_ISDIR(mode))
284 mode |= S_ISGID;
285 } else
286 inode->i_gid = current_fsgid();
287
288 inode->i_mode = mode;
289 inode->i_ino = ino;
290 inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
291
292 if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
293 err = nilfs_bmap_read(ii->i_bmap, NULL);
294 if (err < 0)
295 goto failed_bmap;
296
297 set_bit(NILFS_I_BMAP, &ii->i_state);
298 /* No lock is needed; iget() ensures it. */
299 }
300
301 ii->i_flags = NILFS_I(dir)->i_flags;
302 if (S_ISLNK(mode))
303 ii->i_flags &= ~(NILFS_IMMUTABLE_FL | NILFS_APPEND_FL);
304 if (!S_ISDIR(mode))
305 ii->i_flags &= ~NILFS_DIRSYNC_FL;
306
307 /* ii->i_file_acl = 0; */
308 /* ii->i_dir_acl = 0; */
309 ii->i_dir_start_lookup = 0;
310#ifdef CONFIG_NILFS_FS_POSIX_ACL
311 ii->i_acl = NULL;
312 ii->i_default_acl = NULL;
313#endif
314 ii->i_cno = 0;
315 nilfs_set_inode_flags(inode);
316 spin_lock(&sbi->s_next_gen_lock);
317 inode->i_generation = sbi->s_next_generation++;
318 spin_unlock(&sbi->s_next_gen_lock);
319 insert_inode_hash(inode);
320
321 err = nilfs_init_acl(inode, dir);
322 if (unlikely(err))
323		goto failed_acl; /* never occurs.  When nilfs_init_acl() is
324				    supported, proper cancellation of the
325				    above jobs should be considered */
326
327 mark_inode_dirty(inode);
328 return inode;
329
330 failed_acl:
331 failed_bmap:
332 inode->i_nlink = 0;
333 iput(inode); /* raw_inode will be deleted through
334 generic_delete_inode() */
335 goto failed;
336
337 failed_ifile_create_inode:
338 make_bad_inode(inode);
339 iput(inode); /* if i_nlink == 1, generic_forget_inode() will be
340 called */
341 failed:
342 return ERR_PTR(err);
343}
344
345void nilfs_free_inode(struct inode *inode)
346{
347 struct super_block *sb = inode->i_sb;
348 struct nilfs_sb_info *sbi = NILFS_SB(sb);
349
350 clear_inode(inode);
351	/* XXX: check error code? Is there anything I can do? */
352 (void) nilfs_ifile_delete_inode(sbi->s_ifile, inode->i_ino);
353 atomic_dec(&sbi->s_inodes_count);
354}
355
356void nilfs_set_inode_flags(struct inode *inode)
357{
358 unsigned int flags = NILFS_I(inode)->i_flags;
359
360 inode->i_flags &= ~(S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME |
361 S_DIRSYNC);
362 if (flags & NILFS_SYNC_FL)
363 inode->i_flags |= S_SYNC;
364 if (flags & NILFS_APPEND_FL)
365 inode->i_flags |= S_APPEND;
366 if (flags & NILFS_IMMUTABLE_FL)
367 inode->i_flags |= S_IMMUTABLE;
368#ifndef NILFS_ATIME_DISABLE
369 if (flags & NILFS_NOATIME_FL)
370#endif
371 inode->i_flags |= S_NOATIME;
372 if (flags & NILFS_DIRSYNC_FL)
373 inode->i_flags |= S_DIRSYNC;
374 mapping_set_gfp_mask(inode->i_mapping,
375 mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
376}
377
378int nilfs_read_inode_common(struct inode *inode,
379 struct nilfs_inode *raw_inode)
380{
381 struct nilfs_inode_info *ii = NILFS_I(inode);
382 int err;
383
384 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
385 inode->i_uid = (uid_t)le32_to_cpu(raw_inode->i_uid);
386 inode->i_gid = (gid_t)le32_to_cpu(raw_inode->i_gid);
387 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
388 inode->i_size = le64_to_cpu(raw_inode->i_size);
389 inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
390 inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime);
391 inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
392 inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
393 inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
394 inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
395 if (inode->i_nlink == 0 && inode->i_mode == 0)
396 return -EINVAL; /* this inode is deleted */
397
398 inode->i_blocks = le64_to_cpu(raw_inode->i_blocks);
399 ii->i_flags = le32_to_cpu(raw_inode->i_flags);
400#if 0
401 ii->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
402 ii->i_dir_acl = S_ISREG(inode->i_mode) ?
403 0 : le32_to_cpu(raw_inode->i_dir_acl);
404#endif
405 ii->i_cno = 0;
406 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
407
408 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
409 S_ISLNK(inode->i_mode)) {
410 err = nilfs_bmap_read(ii->i_bmap, raw_inode);
411 if (err < 0)
412 return err;
413 set_bit(NILFS_I_BMAP, &ii->i_state);
414 /* No lock is needed; iget() ensures it. */
415 }
416 return 0;
417}
418
419static int __nilfs_read_inode(struct super_block *sb, unsigned long ino,
420 struct inode *inode)
421{
422 struct nilfs_sb_info *sbi = NILFS_SB(sb);
423 struct inode *dat = nilfs_dat_inode(sbi->s_nilfs);
424 struct buffer_head *bh;
425 struct nilfs_inode *raw_inode;
426 int err;
427
428 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
429 err = nilfs_ifile_get_inode_block(sbi->s_ifile, ino, &bh);
430 if (unlikely(err))
431 goto bad_inode;
432
433 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, bh);
434
435#ifdef CONFIG_NILFS_FS_POSIX_ACL
436	NILFS_I(inode)->i_acl = NILFS_ACL_NOT_CACHED;
437	NILFS_I(inode)->i_default_acl = NILFS_ACL_NOT_CACHED;
438#endif
439 if (nilfs_read_inode_common(inode, raw_inode))
440 goto failed_unmap;
441
442 if (S_ISREG(inode->i_mode)) {
443 inode->i_op = &nilfs_file_inode_operations;
444 inode->i_fop = &nilfs_file_operations;
445 inode->i_mapping->a_ops = &nilfs_aops;
446 } else if (S_ISDIR(inode->i_mode)) {
447 inode->i_op = &nilfs_dir_inode_operations;
448 inode->i_fop = &nilfs_dir_operations;
449 inode->i_mapping->a_ops = &nilfs_aops;
450 } else if (S_ISLNK(inode->i_mode)) {
451 inode->i_op = &nilfs_symlink_inode_operations;
452 inode->i_mapping->a_ops = &nilfs_aops;
453 } else {
454 inode->i_op = &nilfs_special_inode_operations;
455 init_special_inode(
456 inode, inode->i_mode,
457 new_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
458 }
459 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
460 brelse(bh);
461 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
462 nilfs_set_inode_flags(inode);
463 return 0;
464
465 failed_unmap:
466 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, bh);
467 brelse(bh);
468
469 bad_inode:
470 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
471 return err;
472}
473
474struct inode *nilfs_iget(struct super_block *sb, unsigned long ino)
475{
476 struct inode *inode;
477 int err;
478
479 inode = iget_locked(sb, ino);
480 if (unlikely(!inode))
481 return ERR_PTR(-ENOMEM);
482 if (!(inode->i_state & I_NEW))
483 return inode;
484
485 err = __nilfs_read_inode(sb, ino, inode);
486 if (unlikely(err)) {
487 iget_failed(inode);
488 return ERR_PTR(err);
489 }
490 unlock_new_inode(inode);
491 return inode;
492}
493
494void nilfs_write_inode_common(struct inode *inode,
495 struct nilfs_inode *raw_inode, int has_bmap)
496{
497 struct nilfs_inode_info *ii = NILFS_I(inode);
498
499 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
500 raw_inode->i_uid = cpu_to_le32(inode->i_uid);
501 raw_inode->i_gid = cpu_to_le32(inode->i_gid);
502 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
503 raw_inode->i_size = cpu_to_le64(inode->i_size);
504 raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
505 raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
506 raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
507 raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
508 raw_inode->i_blocks = cpu_to_le64(inode->i_blocks);
509
510 raw_inode->i_flags = cpu_to_le32(ii->i_flags);
511 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
512
513 if (has_bmap)
514 nilfs_bmap_write(ii->i_bmap, raw_inode);
515 else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
516 raw_inode->i_device_code =
517 cpu_to_le64(new_encode_dev(inode->i_rdev));
518 /* When extending inode, nilfs->ns_inode_size should be checked
519 for substitutions of appended fields */
520}
521
522void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
523{
524 ino_t ino = inode->i_ino;
525 struct nilfs_inode_info *ii = NILFS_I(inode);
526 struct super_block *sb = inode->i_sb;
527 struct nilfs_sb_info *sbi = NILFS_SB(sb);
528 struct nilfs_inode *raw_inode;
529
530 raw_inode = nilfs_ifile_map_inode(sbi->s_ifile, ino, ibh);
531
532 /* The buffer is guarded with lock_buffer() by the caller */
533 if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
534 memset(raw_inode, 0, NILFS_MDT(sbi->s_ifile)->mi_entry_size);
535 set_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
536
537 nilfs_write_inode_common(inode, raw_inode, 0);
538	/* XXX: calling with has_bmap = 0 is a workaround to avoid a
539	   bmap deadlock.  This delays the update of i_bmap until just
540	   before writing */
541 nilfs_ifile_unmap_inode(sbi->s_ifile, ino, ibh);
542}
543
544#define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */
545
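/*
 * Truncate the bmap from the tail in chunks of at most
 * NILFS_MAX_TRUNCATE_BLOCKS blocks per pass, relaxing memory pressure
 * between passes and retrying once when a pass fails with -ENOMEM.
 */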
546static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
547 unsigned long from)
548{
549 unsigned long b;
550 int ret;
551
552 if (!test_bit(NILFS_I_BMAP, &ii->i_state))
553 return;
554 repeat:
555 ret = nilfs_bmap_last_key(ii->i_bmap, &b);
556 if (ret == -ENOENT)
557 return;
558 else if (ret < 0)
559 goto failed;
560
561 if (b < from)
562 return;
563
564 b -= min_t(unsigned long, NILFS_MAX_TRUNCATE_BLOCKS, b - from);
565 ret = nilfs_bmap_truncate(ii->i_bmap, b);
566 nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb);
567 if (!ret || (ret == -ENOMEM &&
568 nilfs_bmap_truncate(ii->i_bmap, b) == 0))
569 goto repeat;
570
571 failed:
572 if (ret == -EINVAL)
573 nilfs_error(ii->vfs_inode.i_sb, __func__,
574 "bmap is broken (ino=%lu)", ii->vfs_inode.i_ino);
575 else
576 nilfs_warning(ii->vfs_inode.i_sb, __func__,
577 "failed to truncate bmap (ino=%lu, err=%d)",
578 ii->vfs_inode.i_ino, ret);
579}
580
581void nilfs_truncate(struct inode *inode)
582{
583 unsigned long blkoff;
584 unsigned int blocksize;
585 struct nilfs_transaction_info ti;
586 struct super_block *sb = inode->i_sb;
587 struct nilfs_inode_info *ii = NILFS_I(inode);
588
589 if (!test_bit(NILFS_I_BMAP, &ii->i_state))
590 return;
591 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
592 return;
593
594 blocksize = sb->s_blocksize;
595 blkoff = (inode->i_size + blocksize - 1) >> sb->s_blocksize_bits;
596 nilfs_transaction_begin(sb, &ti, 0); /* never fails */
597
598 block_truncate_page(inode->i_mapping, inode->i_size, nilfs_get_block);
599
600 nilfs_truncate_bmap(ii, blkoff);
601
602 inode->i_mtime = inode->i_ctime = CURRENT_TIME;
603 if (IS_SYNC(inode))
604 nilfs_set_transaction_flag(NILFS_TI_SYNC);
605
606 nilfs_set_file_dirty(NILFS_SB(sb), inode, 0);
607 nilfs_transaction_commit(sb);
608 /* May construct a logical segment and may fail in sync mode.
609 But truncate has no return value. */
610}
611
612void nilfs_delete_inode(struct inode *inode)
613{
614 struct nilfs_transaction_info ti;
615 struct super_block *sb = inode->i_sb;
616 struct nilfs_inode_info *ii = NILFS_I(inode);
617
618 if (unlikely(is_bad_inode(inode))) {
619 if (inode->i_data.nrpages)
620 truncate_inode_pages(&inode->i_data, 0);
621 clear_inode(inode);
622 return;
623 }
624 nilfs_transaction_begin(sb, &ti, 0); /* never fails */
625
626 if (inode->i_data.nrpages)
627 truncate_inode_pages(&inode->i_data, 0);
628
629 nilfs_truncate_bmap(ii, 0);
630 nilfs_free_inode(inode);
631 /* nilfs_free_inode() marks inode buffer dirty */
632 if (IS_SYNC(inode))
633 nilfs_set_transaction_flag(NILFS_TI_SYNC);
634 nilfs_transaction_commit(sb);
635 /* May construct a logical segment and may fail in sync mode.
636 But delete_inode has no return value. */
637}
638
639int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
640{
641 struct nilfs_transaction_info ti;
642 struct inode *inode = dentry->d_inode;
643 struct super_block *sb = inode->i_sb;
644 int err;
645
646 err = inode_change_ok(inode, iattr);
647 if (err)
648 return err;
649
650 err = nilfs_transaction_begin(sb, &ti, 0);
651 if (unlikely(err))
652 return err;
653 err = inode_setattr(inode, iattr);
654 if (!err && (iattr->ia_valid & ATTR_MODE))
655 err = nilfs_acl_chmod(inode);
656 if (likely(!err))
657 err = nilfs_transaction_commit(sb);
658 else
659 nilfs_transaction_abort(sb);
660
661 return err;
662}
663
664int nilfs_load_inode_block(struct nilfs_sb_info *sbi, struct inode *inode,
665 struct buffer_head **pbh)
666{
667 struct nilfs_inode_info *ii = NILFS_I(inode);
668 int err;
669
670 spin_lock(&sbi->s_inode_lock);
671	/* i_bh is protected by s_inode_lock; rechecked below after the lock is reacquired */
672 if (ii->i_bh == NULL) {
673 spin_unlock(&sbi->s_inode_lock);
674 err = nilfs_ifile_get_inode_block(sbi->s_ifile, inode->i_ino,
675 pbh);
676 if (unlikely(err))
677 return err;
678 spin_lock(&sbi->s_inode_lock);
679 if (ii->i_bh == NULL)
680 ii->i_bh = *pbh;
681 else {
682 brelse(*pbh);
683 *pbh = ii->i_bh;
684 }
685 } else
686 *pbh = ii->i_bh;
687
688 get_bh(*pbh);
689 spin_unlock(&sbi->s_inode_lock);
690 return 0;
691}
692
693int nilfs_inode_dirty(struct inode *inode)
694{
695 struct nilfs_inode_info *ii = NILFS_I(inode);
696 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
697 int ret = 0;
698
699 if (!list_empty(&ii->i_dirty)) {
700 spin_lock(&sbi->s_inode_lock);
701 ret = test_bit(NILFS_I_DIRTY, &ii->i_state) ||
702 test_bit(NILFS_I_BUSY, &ii->i_state);
703 spin_unlock(&sbi->s_inode_lock);
704 }
705 return ret;
706}
707
708int nilfs_set_file_dirty(struct nilfs_sb_info *sbi, struct inode *inode,
709 unsigned nr_dirty)
710{
711 struct nilfs_inode_info *ii = NILFS_I(inode);
712
713 atomic_add(nr_dirty, &sbi->s_nilfs->ns_ndirtyblks);
714
715 if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state))
716 return 0;
717
718 spin_lock(&sbi->s_inode_lock);
719 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
720 !test_bit(NILFS_I_BUSY, &ii->i_state)) {
721 /* Because this routine may race with nilfs_dispose_list(),
722 we have to check NILFS_I_QUEUED here, too. */
723 if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) {
724 /* This will happen when somebody is freeing
725 this inode. */
726 nilfs_warning(sbi->s_super, __func__,
727 "cannot get inode (ino=%lu)\n",
728 inode->i_ino);
729 spin_unlock(&sbi->s_inode_lock);
730 return -EINVAL; /* NILFS_I_DIRTY may remain for
731 freeing inode */
732 }
733 list_del(&ii->i_dirty);
734 list_add_tail(&ii->i_dirty, &sbi->s_dirty_files);
735 set_bit(NILFS_I_QUEUED, &ii->i_state);
736 }
737 spin_unlock(&sbi->s_inode_lock);
738 return 0;
739}
740
741int nilfs_mark_inode_dirty(struct inode *inode)
742{
743 struct nilfs_sb_info *sbi = NILFS_SB(inode->i_sb);
744 struct buffer_head *ibh;
745 int err;
746
747 err = nilfs_load_inode_block(sbi, inode, &ibh);
748 if (unlikely(err)) {
749 nilfs_warning(inode->i_sb, __func__,
750 "failed to reget inode block.\n");
751 return err;
752 }
753 lock_buffer(ibh);
754 nilfs_update_inode(inode, ibh);
755 unlock_buffer(ibh);
756 nilfs_mdt_mark_buffer_dirty(ibh);
757 nilfs_mdt_mark_dirty(sbi->s_ifile);
758 brelse(ibh);
759 return 0;
760}
761
762/**
763 * nilfs_dirty_inode - reflect changes on given inode to an inode block.
764 * @inode: inode of the file to be registered.
765 *
766 * nilfs_dirty_inode() loads an inode block containing the specified
767 * @inode and copies the in-core inode data to the corresponding inode
768 * entry in the inode block. This operation is excluded from the segment
769 * construction. This function can be called both as a single operation
770 * and as a part of indivisible file operations.
771 */
772void nilfs_dirty_inode(struct inode *inode)
773{
774 struct nilfs_transaction_info ti;
775
776 if (is_bad_inode(inode)) {
777 nilfs_warning(inode->i_sb, __func__,
778 "tried to mark bad_inode dirty. ignored.\n");
779 dump_stack();
780 return;
781 }
782 nilfs_transaction_begin(inode->i_sb, &ti, 0);
783 nilfs_mark_inode_dirty(inode);
784 nilfs_transaction_commit(inode->i_sb); /* never fails */
785}
diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c
new file mode 100644
index 000000000000..108d281ebca5
--- /dev/null
+++ b/fs/nilfs2/ioctl.c
@@ -0,0 +1,654 @@
1/*
2 * ioctl.c - NILFS ioctl operations.
3 *
4 * Copyright (C) 2007, 2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/fs.h>
24#include <linux/wait.h>
25#include <linux/smp_lock.h> /* lock_kernel(), unlock_kernel() */
26#include <linux/capability.h> /* capable() */
27#include <linux/uaccess.h> /* copy_from_user(), copy_to_user() */
28#include <linux/nilfs2_fs.h>
29#include "nilfs.h"
30#include "segment.h"
31#include "bmap.h"
32#include "cpfile.h"
33#include "sufile.h"
34#include "dat.h"
35
36
37static int nilfs_ioctl_wrap_copy(struct the_nilfs *nilfs,
38 struct nilfs_argv *argv, int dir,
39 ssize_t (*dofunc)(struct the_nilfs *,
40 __u64 *, int,
41 void *, size_t, size_t))
42{
43 void *buf;
44 void __user *base = (void __user *)(unsigned long)argv->v_base;
45 size_t maxmembs, total, n;
46 ssize_t nr;
47 int ret, i;
48 __u64 pos, ppos;
49
50 if (argv->v_nmembs == 0)
51 return 0;
52
53 if (argv->v_size > PAGE_SIZE)
54 return -EINVAL;
55
56 buf = (void *)__get_free_pages(GFP_NOFS, 0);
57 if (unlikely(!buf))
58 return -ENOMEM;
59 maxmembs = PAGE_SIZE / argv->v_size;
60
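	/*
	 * Process the vector in page-sized chunks: for _IOC_WRITE
	 * directions the chunk is copied in from user space, @dofunc
	 * consumes it, and for _IOC_READ the results are copied back.
	 * A short count returned by @dofunc ends the loop early.
	 */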
61 ret = 0;
62 total = 0;
63 pos = argv->v_index;
64 for (i = 0; i < argv->v_nmembs; i += n) {
65 n = (argv->v_nmembs - i < maxmembs) ?
66 argv->v_nmembs - i : maxmembs;
67 if ((dir & _IOC_WRITE) &&
68 copy_from_user(buf, base + argv->v_size * i,
69 argv->v_size * n)) {
70 ret = -EFAULT;
71 break;
72 }
73 ppos = pos;
74 nr = dofunc(nilfs, &pos, argv->v_flags, buf, argv->v_size,
75 n);
76 if (nr < 0) {
77 ret = nr;
78 break;
79 }
80 if ((dir & _IOC_READ) &&
81 copy_to_user(base + argv->v_size * i, buf,
82 argv->v_size * nr)) {
83 ret = -EFAULT;
84 break;
85 }
86 total += nr;
87 if ((size_t)nr < n)
88 break;
89 if (pos == ppos)
90 pos += n;
91 }
92 argv->v_nmembs = total;
93
94 free_pages((unsigned long)buf, 0);
95 return ret;
96}
97
98static int nilfs_ioctl_change_cpmode(struct inode *inode, struct file *filp,
99 unsigned int cmd, void __user *argp)
100{
101 struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile;
102 struct nilfs_transaction_info ti;
103 struct nilfs_cpmode cpmode;
104 int ret;
105
106 if (!capable(CAP_SYS_ADMIN))
107 return -EPERM;
108 if (copy_from_user(&cpmode, argp, sizeof(cpmode)))
109 return -EFAULT;
110
111 nilfs_transaction_begin(inode->i_sb, &ti, 0);
112 ret = nilfs_cpfile_change_cpmode(
113 cpfile, cpmode.cm_cno, cpmode.cm_mode);
114 if (unlikely(ret < 0)) {
115 nilfs_transaction_abort(inode->i_sb);
116 return ret;
117 }
118 nilfs_transaction_commit(inode->i_sb); /* never fails */
119 return ret;
120}
121
122static int
123nilfs_ioctl_delete_checkpoint(struct inode *inode, struct file *filp,
124 unsigned int cmd, void __user *argp)
125{
126 struct inode *cpfile = NILFS_SB(inode->i_sb)->s_nilfs->ns_cpfile;
127 struct nilfs_transaction_info ti;
128 __u64 cno;
129 int ret;
130
131 if (!capable(CAP_SYS_ADMIN))
132 return -EPERM;
133 if (copy_from_user(&cno, argp, sizeof(cno)))
134 return -EFAULT;
135
136 nilfs_transaction_begin(inode->i_sb, &ti, 0);
137 ret = nilfs_cpfile_delete_checkpoint(cpfile, cno);
138 if (unlikely(ret < 0)) {
139 nilfs_transaction_abort(inode->i_sb);
140 return ret;
141 }
142 nilfs_transaction_commit(inode->i_sb); /* never fails */
143 return ret;
144}
145
146static ssize_t
147nilfs_ioctl_do_get_cpinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
148 void *buf, size_t size, size_t nmembs)
149{
150 return nilfs_cpfile_get_cpinfo(nilfs->ns_cpfile, posp, flags, buf,
151 nmembs);
152}
153
154static int nilfs_ioctl_get_cpinfo(struct inode *inode, struct file *filp,
155 unsigned int cmd, void __user *argp)
156{
157 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
158 struct nilfs_argv argv;
159 int ret;
160
161 if (copy_from_user(&argv, argp, sizeof(argv)))
162 return -EFAULT;
163
164 down_read(&nilfs->ns_segctor_sem);
165 ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd),
166 nilfs_ioctl_do_get_cpinfo);
167 up_read(&nilfs->ns_segctor_sem);
168 if (ret < 0)
169 return ret;
170
171 if (copy_to_user(argp, &argv, sizeof(argv)))
172 ret = -EFAULT;
173 return ret;
174}
175
176static int nilfs_ioctl_get_cpstat(struct inode *inode, struct file *filp,
177 unsigned int cmd, void __user *argp)
178{
179 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
180 struct nilfs_cpstat cpstat;
181 int ret;
182
183 down_read(&nilfs->ns_segctor_sem);
184 ret = nilfs_cpfile_get_stat(nilfs->ns_cpfile, &cpstat);
185 up_read(&nilfs->ns_segctor_sem);
186 if (ret < 0)
187 return ret;
188
189 if (copy_to_user(argp, &cpstat, sizeof(cpstat)))
190 ret = -EFAULT;
191 return ret;
192}
193
194static ssize_t
195nilfs_ioctl_do_get_suinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
196 void *buf, size_t size, size_t nmembs)
197{
198 return nilfs_sufile_get_suinfo(nilfs->ns_sufile, *posp, buf, nmembs);
199}
200
201static int nilfs_ioctl_get_suinfo(struct inode *inode, struct file *filp,
202 unsigned int cmd, void __user *argp)
203{
204 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
205 struct nilfs_argv argv;
206 int ret;
207
208 if (copy_from_user(&argv, argp, sizeof(argv)))
209 return -EFAULT;
210
211 down_read(&nilfs->ns_segctor_sem);
212 ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd),
213 nilfs_ioctl_do_get_suinfo);
214 up_read(&nilfs->ns_segctor_sem);
215 if (ret < 0)
216 return ret;
217
218 if (copy_to_user(argp, &argv, sizeof(argv)))
219 ret = -EFAULT;
220 return ret;
221}
222
223static int nilfs_ioctl_get_sustat(struct inode *inode, struct file *filp,
224 unsigned int cmd, void __user *argp)
225{
226 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
227 struct nilfs_sustat sustat;
228 int ret;
229
230 down_read(&nilfs->ns_segctor_sem);
231 ret = nilfs_sufile_get_stat(nilfs->ns_sufile, &sustat);
232 up_read(&nilfs->ns_segctor_sem);
233 if (ret < 0)
234 return ret;
235
236 if (copy_to_user(argp, &sustat, sizeof(sustat)))
237 ret = -EFAULT;
238 return ret;
239}
240
241static ssize_t
242nilfs_ioctl_do_get_vinfo(struct the_nilfs *nilfs, __u64 *posp, int flags,
243 void *buf, size_t size, size_t nmembs)
244{
245 return nilfs_dat_get_vinfo(nilfs_dat_inode(nilfs), buf, nmembs);
246}
247
248static int nilfs_ioctl_get_vinfo(struct inode *inode, struct file *filp,
249 unsigned int cmd, void __user *argp)
250{
251 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
252 struct nilfs_argv argv;
253 int ret;
254
255 if (copy_from_user(&argv, argp, sizeof(argv)))
256 return -EFAULT;
257
258 down_read(&nilfs->ns_segctor_sem);
259 ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd),
260 nilfs_ioctl_do_get_vinfo);
261 up_read(&nilfs->ns_segctor_sem);
262 if (ret < 0)
263 return ret;
264
265 if (copy_to_user(argp, &argv, sizeof(argv)))
266 ret = -EFAULT;
267 return ret;
268}
269
270static ssize_t
271nilfs_ioctl_do_get_bdescs(struct the_nilfs *nilfs, __u64 *posp, int flags,
272 void *buf, size_t size, size_t nmembs)
273{
274 struct inode *dat = nilfs_dat_inode(nilfs);
275 struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap;
276 struct nilfs_bdesc *bdescs = buf;
277 int ret, i;
278
279 for (i = 0; i < nmembs; i++) {
280 ret = nilfs_bmap_lookup_at_level(bmap,
281 bdescs[i].bd_offset,
282 bdescs[i].bd_level + 1,
283 &bdescs[i].bd_blocknr);
284 if (ret < 0) {
285 if (ret != -ENOENT)
286 return ret;
287 bdescs[i].bd_blocknr = 0;
288 }
289 }
290 return nmembs;
291}
292
293static int nilfs_ioctl_get_bdescs(struct inode *inode, struct file *filp,
294 unsigned int cmd, void __user *argp)
295{
296 struct the_nilfs *nilfs = NILFS_SB(inode->i_sb)->s_nilfs;
297 struct nilfs_argv argv;
298 int ret;
299
300 if (copy_from_user(&argv, argp, sizeof(argv)))
301 return -EFAULT;
302
303 down_read(&nilfs->ns_segctor_sem);
304 ret = nilfs_ioctl_wrap_copy(nilfs, &argv, _IOC_DIR(cmd),
305 nilfs_ioctl_do_get_bdescs);
306 up_read(&nilfs->ns_segctor_sem);
307 if (ret < 0)
308 return ret;
309
310 if (copy_to_user(argp, &argv, sizeof(argv)))
311 ret = -EFAULT;
312 return ret;
313}
314
315static int nilfs_ioctl_move_inode_block(struct inode *inode,
316 struct nilfs_vdesc *vdesc,
317 struct list_head *buffers)
318{
319 struct buffer_head *bh;
320 int ret;
321
322 if (vdesc->vd_flags == 0)
323 ret = nilfs_gccache_submit_read_data(
324 inode, vdesc->vd_offset, vdesc->vd_blocknr,
325 vdesc->vd_vblocknr, &bh);
326 else
327 ret = nilfs_gccache_submit_read_node(
328 inode, vdesc->vd_blocknr, vdesc->vd_vblocknr, &bh);
329
330 if (unlikely(ret < 0)) {
331 if (ret == -ENOENT)
332 printk(KERN_CRIT
333 "%s: invalid virtual block address (%s): "
334 "ino=%llu, cno=%llu, offset=%llu, "
335 "blocknr=%llu, vblocknr=%llu\n",
336 __func__, vdesc->vd_flags ? "node" : "data",
337 (unsigned long long)vdesc->vd_ino,
338 (unsigned long long)vdesc->vd_cno,
339 (unsigned long long)vdesc->vd_offset,
340 (unsigned long long)vdesc->vd_blocknr,
341 (unsigned long long)vdesc->vd_vblocknr);
342 return ret;
343 }
344 bh->b_private = vdesc;
345 list_add_tail(&bh->b_assoc_buffers, buffers);
346 return 0;
347}
348
349static ssize_t
350nilfs_ioctl_do_move_blocks(struct the_nilfs *nilfs, __u64 *posp, int flags,
351 void *buf, size_t size, size_t nmembs)
352{
353 struct inode *inode;
354 struct nilfs_vdesc *vdesc;
355 struct buffer_head *bh, *n;
356 LIST_HEAD(buffers);
357 ino_t ino;
358 __u64 cno;
359 int i, ret;
360
361 for (i = 0, vdesc = buf; i < nmembs; ) {
362 ino = vdesc->vd_ino;
363 cno = vdesc->vd_cno;
364 inode = nilfs_gc_iget(nilfs, ino, cno);
365 if (unlikely(inode == NULL)) {
366 ret = -ENOMEM;
367 goto failed;
368 }
369 do {
370 ret = nilfs_ioctl_move_inode_block(inode, vdesc,
371 &buffers);
372 if (unlikely(ret < 0))
373 goto failed;
374 vdesc++;
375 } while (++i < nmembs &&
376 vdesc->vd_ino == ino && vdesc->vd_cno == cno);
377 }
378
379 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
380 ret = nilfs_gccache_wait_and_mark_dirty(bh);
381 if (unlikely(ret < 0)) {
382 if (ret == -EEXIST) {
383 vdesc = bh->b_private;
384 printk(KERN_CRIT
385 "%s: conflicting %s buffer: "
386 "ino=%llu, cno=%llu, offset=%llu, "
387 "blocknr=%llu, vblocknr=%llu\n",
388 __func__,
389 vdesc->vd_flags ? "node" : "data",
390 (unsigned long long)vdesc->vd_ino,
391 (unsigned long long)vdesc->vd_cno,
392 (unsigned long long)vdesc->vd_offset,
393 (unsigned long long)vdesc->vd_blocknr,
394 (unsigned long long)vdesc->vd_vblocknr);
395 }
396 goto failed;
397 }
398 list_del_init(&bh->b_assoc_buffers);
399 bh->b_private = NULL;
400 brelse(bh);
401 }
402 return nmembs;
403
404 failed:
405 list_for_each_entry_safe(bh, n, &buffers, b_assoc_buffers) {
406 list_del_init(&bh->b_assoc_buffers);
407 bh->b_private = NULL;
408 brelse(bh);
409 }
410 return ret;
411}
412
413static inline int nilfs_ioctl_move_blocks(struct the_nilfs *nilfs,
414 struct nilfs_argv *argv,
415 int dir)
416{
417 return nilfs_ioctl_wrap_copy(nilfs, argv, dir,
418 nilfs_ioctl_do_move_blocks);
419}
420
421static ssize_t
422nilfs_ioctl_do_delete_checkpoints(struct the_nilfs *nilfs, __u64 *posp,
423 int flags, void *buf, size_t size,
424 size_t nmembs)
425{
426 struct inode *cpfile = nilfs->ns_cpfile;
427 struct nilfs_period *periods = buf;
428 int ret, i;
429
430 for (i = 0; i < nmembs; i++) {
431 ret = nilfs_cpfile_delete_checkpoints(
432 cpfile, periods[i].p_start, periods[i].p_end);
433 if (ret < 0)
434 return ret;
435 }
436 return nmembs;
437}
438
439static inline int nilfs_ioctl_delete_checkpoints(struct the_nilfs *nilfs,
440 struct nilfs_argv *argv,
441 int dir)
442{
443 return nilfs_ioctl_wrap_copy(nilfs, argv, dir,
444 nilfs_ioctl_do_delete_checkpoints);
445}
446
447static ssize_t
448nilfs_ioctl_do_free_vblocknrs(struct the_nilfs *nilfs, __u64 *posp, int flags,
449 void *buf, size_t size, size_t nmembs)
450{
451 int ret = nilfs_dat_freev(nilfs_dat_inode(nilfs), buf, nmembs);
452
453 return (ret < 0) ? ret : nmembs;
454}
455
456static inline int nilfs_ioctl_free_vblocknrs(struct the_nilfs *nilfs,
457 struct nilfs_argv *argv,
458 int dir)
459{
460 return nilfs_ioctl_wrap_copy(nilfs, argv, dir,
461 nilfs_ioctl_do_free_vblocknrs);
462}
463
464static ssize_t
465nilfs_ioctl_do_mark_blocks_dirty(struct the_nilfs *nilfs, __u64 *posp,
466 int flags, void *buf, size_t size,
467 size_t nmembs)
468{
469 struct inode *dat = nilfs_dat_inode(nilfs);
470 struct nilfs_bmap *bmap = NILFS_I(dat)->i_bmap;
471 struct nilfs_bdesc *bdescs = buf;
472 int ret, i;
473
474 for (i = 0; i < nmembs; i++) {
475 /* XXX: use macro or inline func to check liveness */
476 ret = nilfs_bmap_lookup_at_level(bmap,
477 bdescs[i].bd_offset,
478 bdescs[i].bd_level + 1,
479 &bdescs[i].bd_blocknr);
480 if (ret < 0) {
481 if (ret != -ENOENT)
482 return ret;
483 bdescs[i].bd_blocknr = 0;
484 }
485 if (bdescs[i].bd_blocknr != bdescs[i].bd_oblocknr)
486 /* skip dead block */
487 continue;
488 if (bdescs[i].bd_level == 0) {
489 ret = nilfs_mdt_mark_block_dirty(dat,
490 bdescs[i].bd_offset);
491 if (ret < 0) {
492 WARN_ON(ret == -ENOENT);
493 return ret;
494 }
495 } else {
496 ret = nilfs_bmap_mark(bmap, bdescs[i].bd_offset,
497 bdescs[i].bd_level);
498 if (ret < 0) {
499 WARN_ON(ret == -ENOENT);
500 return ret;
501 }
502 }
503 }
504 return nmembs;
505}
506
507static inline int nilfs_ioctl_mark_blocks_dirty(struct the_nilfs *nilfs,
508 struct nilfs_argv *argv,
509 int dir)
510{
511 return nilfs_ioctl_wrap_copy(nilfs, argv, dir,
512 nilfs_ioctl_do_mark_blocks_dirty);
513}
514
515static ssize_t
516nilfs_ioctl_do_free_segments(struct the_nilfs *nilfs, __u64 *posp, int flags,
517 void *buf, size_t size, size_t nmembs)
518{
519 struct nilfs_sb_info *sbi = nilfs_get_writer(nilfs);
520 int ret;
521
522 if (unlikely(!sbi))
523 return -EROFS;
524 ret = nilfs_segctor_add_segments_to_be_freed(
525 NILFS_SC(sbi), buf, nmembs);
526 nilfs_put_writer(nilfs);
527
528 return (ret < 0) ? ret : nmembs;
529}
530
531static inline int nilfs_ioctl_free_segments(struct the_nilfs *nilfs,
532 struct nilfs_argv *argv,
533 int dir)
534{
535 return nilfs_ioctl_wrap_copy(nilfs, argv, dir,
536 nilfs_ioctl_do_free_segments);
537}
538
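/*
 * The five argv vectors passed to NILFS_IOCTL_CLEAN_SEGMENTS
 * correspond, in order, to: blocks to move, checkpoints to delete,
 * virtual block numbers to free, blocks to mark dirty, and segments to
 * free -- matching the error messages in the function below.
 */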
539int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *nilfs,
540 void __user *argp)
541{
542 struct nilfs_argv argv[5];
543 const char *msg;
544 int dir, ret;
545
546 if (copy_from_user(argv, argp, sizeof(argv)))
547 return -EFAULT;
548
549 dir = _IOC_WRITE;
550 ret = nilfs_ioctl_move_blocks(nilfs, &argv[0], dir);
551 if (ret < 0) {
552 msg = "cannot read source blocks";
553 goto failed;
554 }
555 ret = nilfs_ioctl_delete_checkpoints(nilfs, &argv[1], dir);
556 if (ret < 0) {
557 /*
558 * can safely abort because checkpoints can be removed
559 * independently.
560 */
561 msg = "cannot delete checkpoints";
562 goto failed;
563 }
564 ret = nilfs_ioctl_free_vblocknrs(nilfs, &argv[2], dir);
565 if (ret < 0) {
566 /*
567 * can safely abort because DAT file is updated atomically
568 * using a copy-on-write technique.
569 */
570 msg = "cannot delete virtual blocks from DAT file";
571 goto failed;
572 }
573 ret = nilfs_ioctl_mark_blocks_dirty(nilfs, &argv[3], dir);
574 if (ret < 0) {
575 /*
576 * can safely abort because the operation is nondestructive.
577 */
578 msg = "cannot mark copying blocks dirty";
579 goto failed;
580 }
581 ret = nilfs_ioctl_free_segments(nilfs, &argv[4], dir);
582 if (ret < 0) {
583 /*
584 * can safely abort because this operation is atomic.
585 */
586 msg = "cannot set segments to be freed";
587 goto failed;
588 }
589 return 0;
590
591 failed:
592 nilfs_remove_all_gcinode(nilfs);
593 printk(KERN_ERR "NILFS: GC failed during preparation: %s: err=%d\n",
594 msg, ret);
595 return ret;
596}
597
598static int nilfs_ioctl_clean_segments(struct inode *inode, struct file *filp,
599 unsigned int cmd, void __user *argp)
600{
601 if (!capable(CAP_SYS_ADMIN))
602 return -EPERM;
603 return nilfs_clean_segments(inode->i_sb, argp);
604}
605
606static int nilfs_ioctl_sync(struct inode *inode, struct file *filp,
607 unsigned int cmd, void __user *argp)
608{
609 __u64 cno;
610 int ret;
611
612 ret = nilfs_construct_segment(inode->i_sb);
613 if (ret < 0)
614 return ret;
615
616 if (argp != NULL) {
617 cno = NILFS_SB(inode->i_sb)->s_nilfs->ns_cno - 1;
618 if (copy_to_user(argp, &cno, sizeof(cno)))
619 return -EFAULT;
620 }
621 return 0;
622}
623
624long nilfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
625{
626 struct inode *inode = filp->f_dentry->d_inode;
627	void __user *argp = (void __user *)arg;
628
629 switch (cmd) {
630 case NILFS_IOCTL_CHANGE_CPMODE:
631 return nilfs_ioctl_change_cpmode(inode, filp, cmd, argp);
632 case NILFS_IOCTL_DELETE_CHECKPOINT:
633 return nilfs_ioctl_delete_checkpoint(inode, filp, cmd, argp);
634 case NILFS_IOCTL_GET_CPINFO:
635 return nilfs_ioctl_get_cpinfo(inode, filp, cmd, argp);
636 case NILFS_IOCTL_GET_CPSTAT:
637 return nilfs_ioctl_get_cpstat(inode, filp, cmd, argp);
638 case NILFS_IOCTL_GET_SUINFO:
639 return nilfs_ioctl_get_suinfo(inode, filp, cmd, argp);
640 case NILFS_IOCTL_GET_SUSTAT:
641 return nilfs_ioctl_get_sustat(inode, filp, cmd, argp);
642 case NILFS_IOCTL_GET_VINFO:
643 /* XXX: rename to ??? */
644 return nilfs_ioctl_get_vinfo(inode, filp, cmd, argp);
645 case NILFS_IOCTL_GET_BDESCS:
646 return nilfs_ioctl_get_bdescs(inode, filp, cmd, argp);
647 case NILFS_IOCTL_CLEAN_SEGMENTS:
648 return nilfs_ioctl_clean_segments(inode, filp, cmd, argp);
649 case NILFS_IOCTL_SYNC:
650 return nilfs_ioctl_sync(inode, filp, cmd, argp);
651 default:
652 return -ENOTTY;
653 }
654}
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
new file mode 100644
index 000000000000..47dd815433fd
--- /dev/null
+++ b/fs/nilfs2/mdt.c
@@ -0,0 +1,563 @@
1/*
2 * mdt.c - meta data file for NILFS
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 */
22
23#include <linux/buffer_head.h>
24#include <linux/mpage.h>
25#include <linux/mm.h>
26#include <linux/writeback.h>
27#include <linux/backing-dev.h>
28#include <linux/swap.h>
29#include "nilfs.h"
30#include "segment.h"
31#include "page.h"
32#include "mdt.h"
33
34
35#define NILFS_MDT_MAX_RA_BLOCKS (16 - 1)
36
37#define INIT_UNUSED_INODE_FIELDS
38
39static int
40nilfs_mdt_insert_new_block(struct inode *inode, unsigned long block,
41 struct buffer_head *bh,
42 void (*init_block)(struct inode *,
43 struct buffer_head *, void *))
44{
45 struct nilfs_inode_info *ii = NILFS_I(inode);
46 void *kaddr;
47 int ret;
48
49	/* The caller excludes read accesses using the page lock */
50
51 /* set_buffer_new(bh); */
52 bh->b_blocknr = 0;
53
54 ret = nilfs_bmap_insert(ii->i_bmap, block, (unsigned long)bh);
55 if (unlikely(ret))
56 return ret;
57
58 set_buffer_mapped(bh);
59
60 kaddr = kmap_atomic(bh->b_page, KM_USER0);
61 memset(kaddr + bh_offset(bh), 0, 1 << inode->i_blkbits);
62 if (init_block)
63 init_block(inode, bh, kaddr);
64 flush_dcache_page(bh->b_page);
65 kunmap_atomic(kaddr, KM_USER0);
66
67 set_buffer_uptodate(bh);
68 nilfs_mark_buffer_dirty(bh);
69 nilfs_mdt_mark_dirty(inode);
70 return 0;
71}
72
73static int nilfs_mdt_create_block(struct inode *inode, unsigned long block,
74 struct buffer_head **out_bh,
75 void (*init_block)(struct inode *,
76 struct buffer_head *,
77 void *))
78{
79 struct the_nilfs *nilfs = NILFS_MDT(inode)->mi_nilfs;
80 struct nilfs_sb_info *writer = NULL;
81 struct super_block *sb = inode->i_sb;
82 struct nilfs_transaction_info ti;
83 struct buffer_head *bh;
84 int err;
85
86 if (!sb) {
87 writer = nilfs_get_writer(nilfs);
88 if (!writer) {
89 err = -EROFS;
90 goto out;
91 }
92 sb = writer->s_super;
93 }
94
95 nilfs_transaction_begin(sb, &ti, 0);
96
97 err = -ENOMEM;
98 bh = nilfs_grab_buffer(inode, inode->i_mapping, block, 0);
99 if (unlikely(!bh))
100 goto failed_unlock;
101
102 err = -EEXIST;
103 if (buffer_uptodate(bh) || buffer_mapped(bh))
104 goto failed_bh;
105#if 0
106 /* The uptodate flag is not protected by the page lock, but
107	 the mapped flag is. Thus, we don't have to wait for the buffer. */
108 wait_on_buffer(bh);
109 if (buffer_uptodate(bh))
110 goto failed_bh;
111#endif
112
113 bh->b_bdev = nilfs->ns_bdev;
114 err = nilfs_mdt_insert_new_block(inode, block, bh, init_block);
115 if (likely(!err)) {
116 get_bh(bh);
117 *out_bh = bh;
118 }
119
120 failed_bh:
121 unlock_page(bh->b_page);
122 page_cache_release(bh->b_page);
123 brelse(bh);
124
125 failed_unlock:
126 if (likely(!err))
127 err = nilfs_transaction_commit(sb);
128 else
129 nilfs_transaction_abort(sb);
130 if (writer)
131 nilfs_put_writer(nilfs);
132 out:
133 return err;
134}
135
136static int
137nilfs_mdt_submit_block(struct inode *inode, unsigned long blkoff,
138 int mode, struct buffer_head **out_bh)
139{
140 struct buffer_head *bh;
141 unsigned long blknum = 0;
142 int ret = -ENOMEM;
143
144 bh = nilfs_grab_buffer(inode, inode->i_mapping, blkoff, 0);
145 if (unlikely(!bh))
146 goto failed;
147
148 ret = -EEXIST; /* internal code */
149 if (buffer_uptodate(bh))
150 goto out;
151
152 if (mode == READA) {
153 if (!trylock_buffer(bh)) {
154 ret = -EBUSY;
155 goto failed_bh;
156 }
157 } else /* mode == READ */
158 lock_buffer(bh);
159
160 if (buffer_uptodate(bh)) {
161 unlock_buffer(bh);
162 goto out;
163 }
164 if (!buffer_mapped(bh)) { /* unused buffer */
165 ret = nilfs_bmap_lookup(NILFS_I(inode)->i_bmap, blkoff,
166 &blknum);
167 if (unlikely(ret)) {
168 unlock_buffer(bh);
169 goto failed_bh;
170 }
171 bh->b_bdev = NILFS_MDT(inode)->mi_nilfs->ns_bdev;
172 bh->b_blocknr = blknum;
173 set_buffer_mapped(bh);
174 }
175
176 bh->b_end_io = end_buffer_read_sync;
177 get_bh(bh);
178 submit_bh(mode, bh);
179 ret = 0;
180 out:
181 get_bh(bh);
182 *out_bh = bh;
183
184 failed_bh:
185 unlock_page(bh->b_page);
186 page_cache_release(bh->b_page);
187 brelse(bh);
188 failed:
189 return ret;
190}
191
192static int nilfs_mdt_read_block(struct inode *inode, unsigned long block,
193 struct buffer_head **out_bh)
194{
195 struct buffer_head *first_bh, *bh;
196 unsigned long blkoff;
197 int i, nr_ra_blocks = NILFS_MDT_MAX_RA_BLOCKS;
198 int err;
199
200 err = nilfs_mdt_submit_block(inode, block, READ, &first_bh);
201 if (err == -EEXIST) /* internal code */
202 goto out;
203
204 if (unlikely(err))
205 goto failed;
206
207 blkoff = block + 1;
208 for (i = 0; i < nr_ra_blocks; i++, blkoff++) {
209 err = nilfs_mdt_submit_block(inode, blkoff, READA, &bh);
210 if (likely(!err || err == -EEXIST))
211 brelse(bh);
212 else if (err != -EBUSY)
213 break; /* abort readahead if bmap lookup failed */
214
215 if (!buffer_locked(first_bh))
216 goto out_no_wait;
217 }
218
219 wait_on_buffer(first_bh);
220
221 out_no_wait:
222 err = -EIO;
223 if (!buffer_uptodate(first_bh))
224 goto failed_bh;
225 out:
226 *out_bh = first_bh;
227 return 0;
228
229 failed_bh:
230 brelse(first_bh);
231 failed:
232 return err;
233}
234
235/**
236 * nilfs_mdt_get_block - read or create a buffer on a meta data file.
237 * @inode: inode of the meta data file
238 * @blkoff: block offset
239 * @create: create flag
240 * @init_block: initializer used for newly allocated block
241 * @out_bh: output of a pointer to the buffer_head
242 *
243 * nilfs_mdt_get_block() looks up the specified buffer and tries to create
244 * a new buffer if @create is nonzero. On success, the returned buffer is
245 * guaranteed, under the buffer lock, to be either an existing one or a
246 * newly formatted one. @out_bh is set only when zero is returned.
247 *
248 * Return Value: On success, it returns 0. On error, one of the following
249 * negative error codes is returned.
250 *
251 * %-ENOMEM - Insufficient memory available.
252 *
253 * %-EIO - I/O error
254 *
255 * %-ENOENT - the specified block does not exist (hole block)
256 *
257 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
258 *
259 * %-EROFS - Read only filesystem (for create mode)
260 */
261int nilfs_mdt_get_block(struct inode *inode, unsigned long blkoff, int create,
262 void (*init_block)(struct inode *,
263 struct buffer_head *, void *),
264 struct buffer_head **out_bh)
265{
266 int ret;
267
268	/* Should be rewritten by merging with nilfs_mdt_read_block() */
269 retry:
270 ret = nilfs_mdt_read_block(inode, blkoff, out_bh);
271 if (!create || ret != -ENOENT)
272 return ret;
273
274 ret = nilfs_mdt_create_block(inode, blkoff, out_bh, init_block);
275 if (unlikely(ret == -EEXIST)) {
276 /* create = 0; */ /* limit read-create loop retries */
277 goto retry;
278 }
279 return ret;
280}
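A minimal sketch of a typical caller of this API; the helper and the
formatting callback are illustrative assumptions, not part of this patch.
On success the caller owns a reference to *out_bh and must drop it with
brelse(), as nilfs_mdt_mark_block_dirty() below does.

	/* illustrative initializer: format the freshly zeroed block */
	static void my_init_block(struct inode *inode,
				  struct buffer_head *bh, void *kaddr)
	{
		/* write entries/headers at kaddr + bh_offset(bh) */
	}

	static int my_get_entry_block(struct inode *mdt_inode,
				      unsigned long blkoff,
				      struct buffer_head **out_bh)
	{
		/* create = 1: allocate and format the block if it is a hole */
		return nilfs_mdt_get_block(mdt_inode, blkoff, 1,
					   my_init_block, out_bh);
	}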
281
282/**
283 * nilfs_mdt_delete_block - make a hole in the meta data file.
284 * @inode: inode of the meta data file
285 * @block: block offset
286 *
287 * Return Value: On success, zero is returned.
288 * On error, one of the following negative error codes is returned.
289 *
290 * %-ENOMEM - Insufficient memory available.
291 *
292 * %-EIO - I/O error
293 *
294 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
295 */
296int nilfs_mdt_delete_block(struct inode *inode, unsigned long block)
297{
298 struct nilfs_inode_info *ii = NILFS_I(inode);
299 int err;
300
301 err = nilfs_bmap_delete(ii->i_bmap, block);
302 if (likely(!err)) {
303 nilfs_mdt_mark_dirty(inode);
304 nilfs_mdt_forget_block(inode, block);
305 }
306 return err;
307}
308
309/**
310 * nilfs_mdt_forget_block - discard dirty state and try to remove the page
311 * @inode: inode of the meta data file
312 * @block: block offset
313 *
314 * nilfs_mdt_forget_block() clears the dirty flag of the specified buffer and
315 * tries to release the page containing the buffer from the page cache.
316 *
317 * Return Value: On success, 0 is returned. On error, one of the following
318 * negative error codes is returned.
319 *
320 * %-EBUSY - page has an active buffer.
321 *
322 * %-ENOENT - page cache has no page addressed by the offset.
323 */
324int nilfs_mdt_forget_block(struct inode *inode, unsigned long block)
325{
326 pgoff_t index = (pgoff_t)block >>
327 (PAGE_CACHE_SHIFT - inode->i_blkbits);
328 struct page *page;
329 unsigned long first_block;
330 int ret = 0;
331 int still_dirty;
332
333 page = find_lock_page(inode->i_mapping, index);
334 if (!page)
335 return -ENOENT;
336
337 wait_on_page_writeback(page);
338
339 first_block = (unsigned long)index <<
340 (PAGE_CACHE_SHIFT - inode->i_blkbits);
341 if (page_has_buffers(page)) {
342 struct buffer_head *bh;
343
344 bh = nilfs_page_get_nth_block(page, block - first_block);
345 nilfs_forget_buffer(bh);
346 }
347 still_dirty = PageDirty(page);
348 unlock_page(page);
349 page_cache_release(page);
350
351 if (still_dirty ||
352 invalidate_inode_pages2_range(inode->i_mapping, index, index) != 0)
353 ret = -EBUSY;
354 return ret;
355}
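For example, with 4 KiB pages and 1 KiB blocks (PAGE_CACHE_SHIFT = 12,
i_blkbits = 10), block 13 maps to page index 13 >> 2 = 3; the first block
of that page is 3 << 2 = 12, so the buffer looked up above is the second
one on the page (13 - 12 = 1).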
356
357/**
358 * nilfs_mdt_mark_block_dirty - mark a block on the meta data file dirty.
359 * @inode: inode of the meta data file
360 * @block: block offset
361 *
362 * Return Value: On success, it returns 0. On error, one of the following
363 * negative error codes is returned.
364 *
365 * %-ENOMEM - Insufficient memory available.
366 *
367 * %-EIO - I/O error
368 *
369 * %-ENOENT - the specified block does not exist (hole block)
370 *
371 * %-EINVAL - bmap is broken. (the caller should call nilfs_error())
372 */
373int nilfs_mdt_mark_block_dirty(struct inode *inode, unsigned long block)
374{
375 struct buffer_head *bh;
376 int err;
377
378 err = nilfs_mdt_read_block(inode, block, &bh);
379 if (unlikely(err))
380 return err;
381 nilfs_mark_buffer_dirty(bh);
382 nilfs_mdt_mark_dirty(inode);
383 brelse(bh);
384 return 0;
385}
386
387int nilfs_mdt_fetch_dirty(struct inode *inode)
388{
389 struct nilfs_inode_info *ii = NILFS_I(inode);
390
391 if (nilfs_bmap_test_and_clear_dirty(ii->i_bmap)) {
392 set_bit(NILFS_I_DIRTY, &ii->i_state);
393 return 1;
394 }
395 return test_bit(NILFS_I_DIRTY, &ii->i_state);
396}
397
398static int
399nilfs_mdt_write_page(struct page *page, struct writeback_control *wbc)
400{
401 struct inode *inode = container_of(page->mapping,
402 struct inode, i_data);
403 struct super_block *sb = inode->i_sb;
404 struct nilfs_sb_info *writer = NULL;
405 int err = 0;
406
407 redirty_page_for_writepage(wbc, page);
408 unlock_page(page);
409
410 if (page->mapping->assoc_mapping)
411 return 0; /* Do not request flush for shadow page cache */
412 if (!sb) {
413 writer = nilfs_get_writer(NILFS_MDT(inode)->mi_nilfs);
414 if (!writer)
415 return -EROFS;
416 sb = writer->s_super;
417 }
418
419 if (wbc->sync_mode == WB_SYNC_ALL)
420 err = nilfs_construct_segment(sb);
421 else if (wbc->for_reclaim)
422 nilfs_flush_segment(sb, inode->i_ino);
423
424 if (writer)
425 nilfs_put_writer(NILFS_MDT(inode)->mi_nilfs);
426 return err;
427}
428
429
430static struct address_space_operations def_mdt_aops = {
431 .writepage = nilfs_mdt_write_page,
432};
433
434static struct inode_operations def_mdt_iops;
435static struct file_operations def_mdt_fops;
436
437/*
438 * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile,
439 * ifile, or gcinodes. This allows the B-tree code and segment constructor
440 * to treat them like regular files, and this helps to simplify the
441 * implementation.
442 * On the other hand, some of the pseudo inodes have an irregularity:
443 * they don't have a valid inode->i_sb pointer because their lifetimes are
444 * longer than those of the super block structs; they may persist across
445 * several consecutive mounts/umounts. This needs further discussion.
446 */
447struct inode *
448nilfs_mdt_new_common(struct the_nilfs *nilfs, struct super_block *sb,
449 ino_t ino, gfp_t gfp_mask)
450{
451 struct inode *inode = nilfs_alloc_inode(sb);
452
453 if (!inode)
454 return NULL;
455 else {
456 struct address_space * const mapping = &inode->i_data;
457 struct nilfs_mdt_info *mi = kzalloc(sizeof(*mi), GFP_NOFS);
458
459 if (!mi) {
460 nilfs_destroy_inode(inode);
461 return NULL;
462 }
463 mi->mi_nilfs = nilfs;
464 init_rwsem(&mi->mi_sem);
465
466 inode->i_sb = sb; /* sb may be NULL for some meta data files */
467 inode->i_blkbits = nilfs->ns_blocksize_bits;
468 inode->i_flags = 0;
469 atomic_set(&inode->i_count, 1);
470 inode->i_nlink = 1;
471 inode->i_ino = ino;
472 inode->i_mode = S_IFREG;
473 inode->i_private = mi;
474
475#ifdef INIT_UNUSED_INODE_FIELDS
476 atomic_set(&inode->i_writecount, 0);
477 inode->i_size = 0;
478 inode->i_blocks = 0;
479 inode->i_bytes = 0;
480 inode->i_generation = 0;
481#ifdef CONFIG_QUOTA
482 memset(&inode->i_dquot, 0, sizeof(inode->i_dquot));
483#endif
484 inode->i_pipe = NULL;
485 inode->i_bdev = NULL;
486 inode->i_cdev = NULL;
487 inode->i_rdev = 0;
488#ifdef CONFIG_SECURITY
489 inode->i_security = NULL;
490#endif
491 inode->dirtied_when = 0;
492
493 INIT_LIST_HEAD(&inode->i_list);
494 INIT_LIST_HEAD(&inode->i_sb_list);
495 inode->i_state = 0;
496#endif
497
498 spin_lock_init(&inode->i_lock);
499 mutex_init(&inode->i_mutex);
500 init_rwsem(&inode->i_alloc_sem);
501
502 mapping->host = NULL; /* instead of inode */
503 mapping->flags = 0;
504 mapping_set_gfp_mask(mapping, gfp_mask);
505 mapping->assoc_mapping = NULL;
506 mapping->backing_dev_info = nilfs->ns_bdi;
507
508 inode->i_mapping = mapping;
509 }
510
511 return inode;
512}
513
514struct inode *nilfs_mdt_new(struct the_nilfs *nilfs, struct super_block *sb,
515 ino_t ino, gfp_t gfp_mask)
516{
517 struct inode *inode = nilfs_mdt_new_common(nilfs, sb, ino, gfp_mask);
518
519 if (!inode)
520 return NULL;
521
522 inode->i_op = &def_mdt_iops;
523 inode->i_fop = &def_mdt_fops;
524 inode->i_mapping->a_ops = &def_mdt_aops;
525 return inode;
526}
527
528void nilfs_mdt_set_entry_size(struct inode *inode, unsigned entry_size,
529 unsigned header_size)
530{
531 struct nilfs_mdt_info *mi = NILFS_MDT(inode);
532
533 mi->mi_entry_size = entry_size;
534 mi->mi_entries_per_block = (1 << inode->i_blkbits) / entry_size;
535 mi->mi_first_entry_offset = DIV_ROUND_UP(header_size, entry_size);
536}
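Taken together, a construction sketch for a meta data file; the inode
number, entry struct, and zero header size are illustrative only (the
real callers are added elsewhere in this series).

	struct inode *mdt;

	mdt = nilfs_mdt_new(nilfs, sb, NILFS_DAT_INO, NILFS_MDT_GFP);
	if (!mdt)
		return -ENOMEM;
	/* e.g. 4096-byte blocks and 32-byte entries give
	   mi_entries_per_block = 128 */
	nilfs_mdt_set_entry_size(mdt, sizeof(struct nilfs_dat_entry), 0);
	/* ... use the file via nilfs_mdt_get_block() and friends ... */
	nilfs_mdt_destroy(mdt);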
537
538void nilfs_mdt_set_shadow(struct inode *orig, struct inode *shadow)
539{
540 shadow->i_mapping->assoc_mapping = orig->i_mapping;
541 NILFS_I(shadow)->i_btnode_cache.assoc_mapping =
542 &NILFS_I(orig)->i_btnode_cache;
543}
544
545void nilfs_mdt_clear(struct inode *inode)
546{
547 struct nilfs_inode_info *ii = NILFS_I(inode);
548
549 invalidate_mapping_pages(inode->i_mapping, 0, -1);
550 truncate_inode_pages(inode->i_mapping, 0);
551
552 nilfs_bmap_clear(ii->i_bmap);
553 nilfs_btnode_cache_clear(&ii->i_btnode_cache);
554}
555
556void nilfs_mdt_destroy(struct inode *inode)
557{
558 struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
559
560 kfree(mdi->mi_bgl); /* kfree(NULL) is safe */
561 kfree(mdi);
562 nilfs_destroy_inode(inode);
563}
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
new file mode 100644
index 000000000000..df683e0bca6a
--- /dev/null
+++ b/fs/nilfs2/mdt.h
@@ -0,0 +1,125 @@
1/*
2 * mdt.h - NILFS meta data file prototype and definitions
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 */
22
23#ifndef _NILFS_MDT_H
24#define _NILFS_MDT_H
25
26#include <linux/buffer_head.h>
27#include <linux/blockgroup_lock.h>
28#include "nilfs.h"
29#include "page.h"
30
31/**
32 * struct nilfs_mdt_info - on-memory private data of meta data files
33 * @mi_nilfs: back pointer to the_nilfs struct
34 * @mi_sem: reader/writer semaphore for meta data operations
35 * @mi_bgl: per-blockgroup locking
36 * @mi_entry_size: size of an entry
37 * @mi_first_entry_offset: offset to the first entry
38 * @mi_entries_per_block: number of entries in a block
39 * @mi_blocks_per_group: number of blocks in a group
40 * @mi_blocks_per_desc_block: number of blocks per descriptor block
41 */
42struct nilfs_mdt_info {
43 struct the_nilfs *mi_nilfs;
44 struct rw_semaphore mi_sem;
45 struct blockgroup_lock *mi_bgl;
46 unsigned mi_entry_size;
47 unsigned mi_first_entry_offset;
48 unsigned long mi_entries_per_block;
49 unsigned long mi_blocks_per_group;
50 unsigned long mi_blocks_per_desc_block;
51};
52
53static inline struct nilfs_mdt_info *NILFS_MDT(const struct inode *inode)
54{
55 return inode->i_private;
56}
57
58static inline struct the_nilfs *NILFS_I_NILFS(struct inode *inode)
59{
60 struct super_block *sb = inode->i_sb;
61
62 return sb ? NILFS_SB(sb)->s_nilfs : NILFS_MDT(inode)->mi_nilfs;
63}
64
65/* Default GFP flags using highmem */
66#define NILFS_MDT_GFP (__GFP_WAIT | __GFP_IO | __GFP_HIGHMEM)
67
68int nilfs_mdt_get_block(struct inode *, unsigned long, int,
69 void (*init_block)(struct inode *,
70 struct buffer_head *, void *),
71 struct buffer_head **);
72int nilfs_mdt_delete_block(struct inode *, unsigned long);
73int nilfs_mdt_forget_block(struct inode *, unsigned long);
74int nilfs_mdt_mark_block_dirty(struct inode *, unsigned long);
75int nilfs_mdt_fetch_dirty(struct inode *);
76
77struct inode *nilfs_mdt_new(struct the_nilfs *, struct super_block *, ino_t,
78 gfp_t);
79struct inode *nilfs_mdt_new_common(struct the_nilfs *, struct super_block *,
80 ino_t, gfp_t);
81void nilfs_mdt_destroy(struct inode *);
82void nilfs_mdt_clear(struct inode *);
83void nilfs_mdt_set_entry_size(struct inode *, unsigned, unsigned);
84void nilfs_mdt_set_shadow(struct inode *, struct inode *);
85
86
87#define nilfs_mdt_mark_buffer_dirty(bh) nilfs_mark_buffer_dirty(bh)
88
89static inline void nilfs_mdt_mark_dirty(struct inode *inode)
90{
91 if (!test_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state))
92 set_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state);
93}
94
95static inline void nilfs_mdt_clear_dirty(struct inode *inode)
96{
97 clear_bit(NILFS_I_DIRTY, &NILFS_I(inode)->i_state);
98}
99
100static inline __u64 nilfs_mdt_cno(struct inode *inode)
101{
102 return NILFS_MDT(inode)->mi_nilfs->ns_cno;
103}
104
105#define nilfs_mdt_bgl_lock(inode, bg) \
106 (&NILFS_MDT(inode)->mi_bgl->locks[(bg) & (NR_BG_LOCKS-1)].lock)
107
108
109static inline int
110nilfs_mdt_read_inode_direct(struct inode *inode, struct buffer_head *bh,
111 unsigned n)
112{
113 return nilfs_read_inode_common(
114 inode, (struct nilfs_inode *)(bh->b_data + n));
115}
116
117static inline void
118nilfs_mdt_write_inode_direct(struct inode *inode, struct buffer_head *bh,
119 unsigned n)
120{
121 nilfs_write_inode_common(
122 inode, (struct nilfs_inode *)(bh->b_data + n), 1);
123}
124
125#endif /* _NILFS_MDT_H */
diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c
new file mode 100644
index 000000000000..df70dadb336f
--- /dev/null
+++ b/fs/nilfs2/namei.c
@@ -0,0 +1,474 @@
1/*
2 * namei.c - NILFS pathname lookup operations.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Modified for NILFS by Amagai Yoshiji <amagai@osrg.net>,
21 * Ryusuke Konishi <ryusuke@osrg.net>
22 */
23/*
24 * linux/fs/ext2/namei.c
25 *
26 * Copyright (C) 1992, 1993, 1994, 1995
27 * Remy Card (card@masi.ibp.fr)
28 * Laboratoire MASI - Institut Blaise Pascal
29 * Universite Pierre et Marie Curie (Paris VI)
30 *
31 * from
32 *
33 * linux/fs/minix/namei.c
34 *
35 * Copyright (C) 1991, 1992 Linus Torvalds
36 *
37 * Big-endian to little-endian byte-swapping/bitmaps by
38 * David S. Miller (davem@caip.rutgers.edu), 1995
39 */
40
41#include <linux/pagemap.h>
42#include "nilfs.h"
43
44
45static inline int nilfs_add_nondir(struct dentry *dentry, struct inode *inode)
46{
47 int err = nilfs_add_link(dentry, inode);
48 if (!err) {
49 d_instantiate(dentry, inode);
50 return 0;
51 }
52 inode_dec_link_count(inode);
53 iput(inode);
54 return err;
55}
56
57/*
58 * Methods themselves.
59 */
60
61static struct dentry *
62nilfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
63{
64 struct inode *inode;
65 ino_t ino;
66
67 if (dentry->d_name.len > NILFS_NAME_LEN)
68 return ERR_PTR(-ENAMETOOLONG);
69
70 ino = nilfs_inode_by_name(dir, dentry);
71 inode = NULL;
72 if (ino) {
73 inode = nilfs_iget(dir->i_sb, ino);
74 if (IS_ERR(inode))
75 return ERR_CAST(inode);
76 }
77 return d_splice_alias(inode, dentry);
78}
79
80struct dentry *nilfs_get_parent(struct dentry *child)
81{
82 unsigned long ino;
83 struct inode *inode;
84 struct dentry dotdot;
85
86 dotdot.d_name.name = "..";
87 dotdot.d_name.len = 2;
88
89 ino = nilfs_inode_by_name(child->d_inode, &dotdot);
90 if (!ino)
91 return ERR_PTR(-ENOENT);
92
93 inode = nilfs_iget(child->d_inode->i_sb, ino);
94 if (IS_ERR(inode))
95 return ERR_CAST(inode);
96 return d_obtain_alias(inode);
97}
98
99/*
100 * By the time this is called, we already have created
101 * the directory cache entry for the new file, but it
102 * is so far negative - it has no inode.
103 *
104 * If the create succeeds, we fill in the inode information
105 * with d_instantiate().
106 */
107static int nilfs_create(struct inode *dir, struct dentry *dentry, int mode,
108 struct nameidata *nd)
109{
110 struct inode *inode;
111 struct nilfs_transaction_info ti;
112 int err;
113
114 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
115 if (err)
116 return err;
117 inode = nilfs_new_inode(dir, mode);
118 err = PTR_ERR(inode);
119 if (!IS_ERR(inode)) {
120 inode->i_op = &nilfs_file_inode_operations;
121 inode->i_fop = &nilfs_file_operations;
122 inode->i_mapping->a_ops = &nilfs_aops;
123 mark_inode_dirty(inode);
124 err = nilfs_add_nondir(dentry, inode);
125 }
126 if (!err)
127 err = nilfs_transaction_commit(dir->i_sb);
128 else
129 nilfs_transaction_abort(dir->i_sb);
130
131 return err;
132}
133
134static int
135nilfs_mknod(struct inode *dir, struct dentry *dentry, int mode, dev_t rdev)
136{
137 struct inode *inode;
138 struct nilfs_transaction_info ti;
139 int err;
140
141 if (!new_valid_dev(rdev))
142 return -EINVAL;
143
144 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
145 if (err)
146 return err;
147 inode = nilfs_new_inode(dir, mode);
148 err = PTR_ERR(inode);
149 if (!IS_ERR(inode)) {
150 init_special_inode(inode, inode->i_mode, rdev);
151 mark_inode_dirty(inode);
152 err = nilfs_add_nondir(dentry, inode);
153 }
154 if (!err)
155 err = nilfs_transaction_commit(dir->i_sb);
156 else
157 nilfs_transaction_abort(dir->i_sb);
158
159 return err;
160}
161
162static int nilfs_symlink(struct inode *dir, struct dentry *dentry,
163 const char *symname)
164{
165 struct nilfs_transaction_info ti;
166 struct super_block *sb = dir->i_sb;
167 unsigned l = strlen(symname)+1;
168 struct inode *inode;
169 int err;
170
171 if (l > sb->s_blocksize)
172 return -ENAMETOOLONG;
173
174 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
175 if (err)
176 return err;
177
178 inode = nilfs_new_inode(dir, S_IFLNK | S_IRWXUGO);
179 err = PTR_ERR(inode);
180 if (IS_ERR(inode))
181 goto out;
182
183 /* slow symlink */
184 inode->i_op = &nilfs_symlink_inode_operations;
185 inode->i_mapping->a_ops = &nilfs_aops;
186 err = page_symlink(inode, symname, l);
187 if (err)
188 goto out_fail;
189
190 /* mark_inode_dirty(inode); */
191 /* nilfs_new_inode() and page_symlink() do this */
192
193 err = nilfs_add_nondir(dentry, inode);
194out:
195 if (!err)
196 err = nilfs_transaction_commit(dir->i_sb);
197 else
198 nilfs_transaction_abort(dir->i_sb);
199
200 return err;
201
202out_fail:
203 inode_dec_link_count(inode);
204 iput(inode);
205 goto out;
206}
207
208static int nilfs_link(struct dentry *old_dentry, struct inode *dir,
209 struct dentry *dentry)
210{
211 struct inode *inode = old_dentry->d_inode;
212 struct nilfs_transaction_info ti;
213 int err;
214
215 if (inode->i_nlink >= NILFS_LINK_MAX)
216 return -EMLINK;
217
218 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
219 if (err)
220 return err;
221
222 inode->i_ctime = CURRENT_TIME;
223 inode_inc_link_count(inode);
224 atomic_inc(&inode->i_count);
225
226 err = nilfs_add_nondir(dentry, inode);
227 if (!err)
228 err = nilfs_transaction_commit(dir->i_sb);
229 else
230 nilfs_transaction_abort(dir->i_sb);
231
232 return err;
233}
234
235static int nilfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
236{
237 struct inode *inode;
238 struct nilfs_transaction_info ti;
239 int err;
240
241 if (dir->i_nlink >= NILFS_LINK_MAX)
242 return -EMLINK;
243
244 err = nilfs_transaction_begin(dir->i_sb, &ti, 1);
245 if (err)
246 return err;
247
248 inode_inc_link_count(dir);
249
250 inode = nilfs_new_inode(dir, S_IFDIR | mode);
251 err = PTR_ERR(inode);
252 if (IS_ERR(inode))
253 goto out_dir;
254
255 inode->i_op = &nilfs_dir_inode_operations;
256 inode->i_fop = &nilfs_dir_operations;
257 inode->i_mapping->a_ops = &nilfs_aops;
258
259 inode_inc_link_count(inode);
260
261 err = nilfs_make_empty(inode, dir);
262 if (err)
263 goto out_fail;
264
265 err = nilfs_add_link(dentry, inode);
266 if (err)
267 goto out_fail;
268
269 d_instantiate(dentry, inode);
270out:
271 if (!err)
272 err = nilfs_transaction_commit(dir->i_sb);
273 else
274 nilfs_transaction_abort(dir->i_sb);
275
276 return err;
277
278out_fail:
279 inode_dec_link_count(inode);
280 inode_dec_link_count(inode);
281 iput(inode);
282out_dir:
283 inode_dec_link_count(dir);
284 goto out;
285}
286
287static int nilfs_unlink(struct inode *dir, struct dentry *dentry)
288{
289 struct inode *inode;
290 struct nilfs_dir_entry *de;
291 struct page *page;
292 struct nilfs_transaction_info ti;
293 int err;
294
295 err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
296 if (err)
297 return err;
298
299 err = -ENOENT;
300 de = nilfs_find_entry(dir, dentry, &page);
301 if (!de)
302 goto out;
303
304 inode = dentry->d_inode;
305 err = -EIO;
306 if (le64_to_cpu(de->inode) != inode->i_ino)
307 goto out;
308
309 if (!inode->i_nlink) {
310 nilfs_warning(inode->i_sb, __func__,
311 "deleting nonexistent file (%lu), %d\n",
312 inode->i_ino, inode->i_nlink);
313 inode->i_nlink = 1;
314 }
315 err = nilfs_delete_entry(de, page);
316 if (err)
317 goto out;
318
319 inode->i_ctime = dir->i_ctime;
320 inode_dec_link_count(inode);
321 err = 0;
322out:
323 if (!err)
324 err = nilfs_transaction_commit(dir->i_sb);
325 else
326 nilfs_transaction_abort(dir->i_sb);
327
328 return err;
329}
330
331static int nilfs_rmdir(struct inode *dir, struct dentry *dentry)
332{
333 struct inode *inode = dentry->d_inode;
334 struct nilfs_transaction_info ti;
335 int err;
336
337 err = nilfs_transaction_begin(dir->i_sb, &ti, 0);
338 if (err)
339 return err;
340
341 err = -ENOTEMPTY;
342 if (nilfs_empty_dir(inode)) {
343 err = nilfs_unlink(dir, dentry);
344 if (!err) {
345 inode->i_size = 0;
346 inode_dec_link_count(inode);
347 inode_dec_link_count(dir);
348 }
349 }
350 if (!err)
351 err = nilfs_transaction_commit(dir->i_sb);
352 else
353 nilfs_transaction_abort(dir->i_sb);
354
355 return err;
356}
357
358static int nilfs_rename(struct inode *old_dir, struct dentry *old_dentry,
359 struct inode *new_dir, struct dentry *new_dentry)
360{
361 struct inode *old_inode = old_dentry->d_inode;
362 struct inode *new_inode = new_dentry->d_inode;
363 struct page *dir_page = NULL;
364 struct nilfs_dir_entry *dir_de = NULL;
365 struct page *old_page;
366 struct nilfs_dir_entry *old_de;
367 struct nilfs_transaction_info ti;
368 int err;
369
370 err = nilfs_transaction_begin(old_dir->i_sb, &ti, 1);
371 if (unlikely(err))
372 return err;
373
374 err = -ENOENT;
375 old_de = nilfs_find_entry(old_dir, old_dentry, &old_page);
376 if (!old_de)
377 goto out;
378
379 if (S_ISDIR(old_inode->i_mode)) {
380 err = -EIO;
381 dir_de = nilfs_dotdot(old_inode, &dir_page);
382 if (!dir_de)
383 goto out_old;
384 }
385
386 if (new_inode) {
387 struct page *new_page;
388 struct nilfs_dir_entry *new_de;
389
390 err = -ENOTEMPTY;
391 if (dir_de && !nilfs_empty_dir(new_inode))
392 goto out_dir;
393
394 err = -ENOENT;
395 new_de = nilfs_find_entry(new_dir, new_dentry, &new_page);
396 if (!new_de)
397 goto out_dir;
398 inode_inc_link_count(old_inode);
399 nilfs_set_link(new_dir, new_de, new_page, old_inode);
400 new_inode->i_ctime = CURRENT_TIME;
401 if (dir_de)
402 drop_nlink(new_inode);
403 inode_dec_link_count(new_inode);
404 } else {
405 if (dir_de) {
406 err = -EMLINK;
407 if (new_dir->i_nlink >= NILFS_LINK_MAX)
408 goto out_dir;
409 }
410 inode_inc_link_count(old_inode);
411 err = nilfs_add_link(new_dentry, old_inode);
412 if (err) {
413 inode_dec_link_count(old_inode);
414 goto out_dir;
415 }
416 if (dir_de)
417 inode_inc_link_count(new_dir);
418 }
419
420 /*
421 * Like most other Unix systems, set the ctime for inodes on a
422 * rename.
423 * inode_dec_link_count() will mark the inode dirty.
424 */
425 old_inode->i_ctime = CURRENT_TIME;
426
427 nilfs_delete_entry(old_de, old_page);
428 inode_dec_link_count(old_inode);
429
430 if (dir_de) {
431 nilfs_set_link(old_inode, dir_de, dir_page, new_dir);
432 inode_dec_link_count(old_dir);
433 }
434
435 err = nilfs_transaction_commit(old_dir->i_sb);
436 return err;
437
438out_dir:
439 if (dir_de) {
440 kunmap(dir_page);
441 page_cache_release(dir_page);
442 }
443out_old:
444 kunmap(old_page);
445 page_cache_release(old_page);
446out:
447 nilfs_transaction_abort(old_dir->i_sb);
448 return err;
449}
450
451struct inode_operations nilfs_dir_inode_operations = {
452 .create = nilfs_create,
453 .lookup = nilfs_lookup,
454 .link = nilfs_link,
455 .unlink = nilfs_unlink,
456 .symlink = nilfs_symlink,
457 .mkdir = nilfs_mkdir,
458 .rmdir = nilfs_rmdir,
459 .mknod = nilfs_mknod,
460 .rename = nilfs_rename,
461 .setattr = nilfs_setattr,
462 .permission = nilfs_permission,
463};
464
465struct inode_operations nilfs_special_inode_operations = {
466 .setattr = nilfs_setattr,
467 .permission = nilfs_permission,
468};
469
470struct inode_operations nilfs_symlink_inode_operations = {
471 .readlink = generic_readlink,
472 .follow_link = page_follow_link_light,
473 .put_link = page_put_link,
474};
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
new file mode 100644
index 000000000000..7558c977db02
--- /dev/null
+++ b/fs/nilfs2/nilfs.h
@@ -0,0 +1,318 @@
1/*
2 * nilfs.h - NILFS local header file.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>
21 * Ryusuke Konishi <ryusuke@osrg.net>
22 */
23
24#ifndef _NILFS_H
25#define _NILFS_H
26
27#include <linux/kernel.h>
28#include <linux/buffer_head.h>
29#include <linux/spinlock.h>
30#include <linux/blkdev.h>
31#include <linux/nilfs2_fs.h>
32#include "the_nilfs.h"
33#include "sb.h"
34#include "bmap.h"
35#include "bmap_union.h"
36
37/*
38 * NILFS filesystem version
39 */
40#define NILFS_VERSION "2.0.5"
41
42/*
43 * nilfs inode data in memory
44 */
45struct nilfs_inode_info {
46 __u32 i_flags;
47 unsigned long i_state; /* Dynamic state flags */
48 struct nilfs_bmap *i_bmap;
49 union nilfs_bmap_union i_bmap_union;
50 __u64 i_xattr; /* sector_t ??? */
51 __u32 i_dir_start_lookup;
52 __u64 i_cno; /* check point number for GC inode */
53 struct address_space i_btnode_cache;
54 struct list_head i_dirty; /* List for connecting dirty files */
55
56#ifdef CONFIG_NILFS_XATTR
57 /*
58 * Extended attributes can be read independently of the main file
59 * data. Taking i_sem even when reading would cause contention
60 * between readers of EAs and writers of regular file data, so
61 * instead we synchronize on xattr_sem when reading or changing
62 * EAs.
63 */
64 struct rw_semaphore xattr_sem;
65#endif
66#ifdef CONFIG_NILFS_POSIX_ACL
67 struct posix_acl *i_acl;
68 struct posix_acl *i_default_acl;
69#endif
70 struct buffer_head *i_bh; /* i_bh contains a new or dirty
71 disk inode */
72 struct inode vfs_inode;
73};
74
75static inline struct nilfs_inode_info *NILFS_I(const struct inode *inode)
76{
77 return container_of(inode, struct nilfs_inode_info, vfs_inode);
78}
79
80static inline struct nilfs_inode_info *
81NILFS_BMAP_I(const struct nilfs_bmap *bmap)
82{
83 return container_of((union nilfs_bmap_union *)bmap,
84 struct nilfs_inode_info,
85 i_bmap_union);
86}
87
88static inline struct inode *NILFS_BTNC_I(struct address_space *btnc)
89{
90 struct nilfs_inode_info *ii =
91 container_of(btnc, struct nilfs_inode_info, i_btnode_cache);
92 return &ii->vfs_inode;
93}
94
95static inline struct inode *NILFS_AS_I(struct address_space *mapping)
96{
97 return (mapping->host) ? :
98 container_of(mapping, struct inode, i_data);
99}
100
101/*
102 * Dynamic state flags of NILFS on-memory inode (i_state)
103 */
104enum {
105 NILFS_I_NEW = 0, /* Inode is newly created */
106 NILFS_I_DIRTY, /* The file is dirty */
107 NILFS_I_QUEUED, /* inode is in dirty_files list */
108 NILFS_I_BUSY, /* inode is grabbed by a segment
109 constructor */
110 NILFS_I_COLLECTED, /* All dirty blocks are collected */
111 NILFS_I_UPDATED, /* The file has been written back */
112 NILFS_I_INODE_DIRTY, /* write_inode is requested */
113 NILFS_I_BMAP, /* has bmap and btnode_cache */
114 NILFS_I_GCINODE, /* inode for GC, on memory only */
115 NILFS_I_GCDAT, /* shadow DAT, on memory only */
116};
117
118/*
119 * Macros to check inode numbers
120 */
121#define NILFS_MDT_INO_BITS \
122 ((unsigned int)(1 << NILFS_DAT_INO | 1 << NILFS_CPFILE_INO | \
123 1 << NILFS_SUFILE_INO | 1 << NILFS_IFILE_INO | \
124 1 << NILFS_ATIME_INO | 1 << NILFS_SKETCH_INO))
125
126#define NILFS_SYS_INO_BITS \
127 ((unsigned int)(1 << NILFS_ROOT_INO) | NILFS_MDT_INO_BITS)
128
129#define NILFS_FIRST_INO(sb) (NILFS_SB(sb)->s_nilfs->ns_first_ino)
130
131#define NILFS_MDT_INODE(sb, ino) \
132 ((ino) < NILFS_FIRST_INO(sb) && (NILFS_MDT_INO_BITS & (1 << (ino))))
133#define NILFS_VALID_INODE(sb, ino) \
134 ((ino) >= NILFS_FIRST_INO(sb) || (NILFS_SYS_INO_BITS & (1 << (ino))))
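As a worked example (using the reserved inode constants from
<linux/nilfs2_fs.h>, all of which lie below ns_first_ino): for the DAT's
inode number, NILFS_MDT_INODE() is true because the number is below
NILFS_FIRST_INO(sb) and its bit is set in NILFS_MDT_INO_BITS;
NILFS_VALID_INODE() is true for it as well via NILFS_SYS_INO_BITS. An
ordinary user inode at or above ns_first_ino is valid through the first
clause alone.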
135
136/**
137 * struct nilfs_transaction_info: context information for synchronization
138 * @ti_magic: Magic number
139 * @ti_save: Backup of journal_info field of task_struct
140 * @ti_flags: Flags
141 * @ti_count: Nest level
142 * @ti_garbage: List of inodes to be put when releasing the semaphore
143 */
144struct nilfs_transaction_info {
145 u32 ti_magic;
146 void *ti_save;
147				/* This should never be used. If it is,
148				   one of the other filesystems has a bug. */
149 unsigned short ti_flags;
150 unsigned short ti_count;
151 struct list_head ti_garbage;
152};
153
154/* ti_magic */
155#define NILFS_TI_MAGIC 0xd9e392fb
156
157/* ti_flags */
158#define NILFS_TI_DYNAMIC_ALLOC 0x0001 /* Allocated from slab */
159#define NILFS_TI_SYNC		0x0002	/* Force segment construction at the
160					   end of the transaction. */
161#define NILFS_TI_GC 0x0004 /* GC context */
162#define NILFS_TI_COMMIT		0x0008	/* Set when a change has happened */
163#define NILFS_TI_WRITER 0x0010 /* Constructor context */
164
165
166int nilfs_transaction_begin(struct super_block *,
167 struct nilfs_transaction_info *, int);
168int nilfs_transaction_commit(struct super_block *);
169void nilfs_transaction_abort(struct super_block *);
170
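The canonical calling pattern, as used throughout namei.c above;
do_the_update() stands in for whatever modification the caller makes:

	struct nilfs_transaction_info ti;
	int err;

	err = nilfs_transaction_begin(sb, &ti, 1);
	if (err)
		return err;

	err = do_the_update();	/* placeholder for the actual change */

	if (!err)
		err = nilfs_transaction_commit(sb);
	else
		nilfs_transaction_abort(sb);
	return err;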
171static inline void nilfs_set_transaction_flag(unsigned int flag)
172{
173 struct nilfs_transaction_info *ti = current->journal_info;
174
175 ti->ti_flags |= flag;
176}
177
178static inline int nilfs_test_transaction_flag(unsigned int flag)
179{
180 struct nilfs_transaction_info *ti = current->journal_info;
181
182 if (ti == NULL || ti->ti_magic != NILFS_TI_MAGIC)
183 return 0;
184 return !!(ti->ti_flags & flag);
185}
186
187static inline int nilfs_doing_gc(void)
188{
189 return nilfs_test_transaction_flag(NILFS_TI_GC);
190}
191
192static inline int nilfs_doing_construction(void)
193{
194 return nilfs_test_transaction_flag(NILFS_TI_WRITER);
195}
196
197static inline struct inode *nilfs_dat_inode(const struct the_nilfs *nilfs)
198{
199 return nilfs_doing_gc() ? nilfs->ns_gc_dat : nilfs->ns_dat;
200}
201
202/*
203 * function prototype
204 */
205#ifdef CONFIG_NILFS_POSIX_ACL
206#error "NILFS: POSIX ACL is not yet supported"
207extern int nilfs_permission(struct inode *, int, struct nameidata *);
208extern int nilfs_acl_chmod(struct inode *);
209extern int nilfs_init_acl(struct inode *, struct inode *);
210#else
211#define nilfs_permission NULL
212
213static inline int nilfs_acl_chmod(struct inode *inode)
214{
215 return 0;
216}
217
218static inline int nilfs_init_acl(struct inode *inode, struct inode *dir)
219{
220 inode->i_mode &= ~current_umask();
221 return 0;
222}
223#endif
224
225#define NILFS_ATIME_DISABLE
226
227/* dir.c */
228extern int nilfs_add_link(struct dentry *, struct inode *);
229extern ino_t nilfs_inode_by_name(struct inode *, struct dentry *);
230extern int nilfs_make_empty(struct inode *, struct inode *);
231extern struct nilfs_dir_entry *
232nilfs_find_entry(struct inode *, struct dentry *, struct page **);
233extern int nilfs_delete_entry(struct nilfs_dir_entry *, struct page *);
234extern int nilfs_empty_dir(struct inode *);
235extern struct nilfs_dir_entry *nilfs_dotdot(struct inode *, struct page **);
236extern void nilfs_set_link(struct inode *, struct nilfs_dir_entry *,
237 struct page *, struct inode *);
238
239/* file.c */
240extern int nilfs_sync_file(struct file *, struct dentry *, int);
241
242/* ioctl.c */
243long nilfs_ioctl(struct file *, unsigned int, unsigned long);
244int nilfs_ioctl_prepare_clean_segments(struct the_nilfs *, void __user *);
245
246/* inode.c */
247extern struct inode *nilfs_new_inode(struct inode *, int);
248extern void nilfs_free_inode(struct inode *);
249extern int nilfs_get_block(struct inode *, sector_t, struct buffer_head *, int);
250extern void nilfs_set_inode_flags(struct inode *);
251extern int nilfs_read_inode_common(struct inode *, struct nilfs_inode *);
252extern void nilfs_write_inode_common(struct inode *, struct nilfs_inode *, int);
253extern struct inode *nilfs_iget(struct super_block *, unsigned long);
254extern void nilfs_update_inode(struct inode *, struct buffer_head *);
255extern void nilfs_truncate(struct inode *);
256extern void nilfs_delete_inode(struct inode *);
257extern int nilfs_setattr(struct dentry *, struct iattr *);
258extern int nilfs_load_inode_block(struct nilfs_sb_info *, struct inode *,
259 struct buffer_head **);
260extern int nilfs_inode_dirty(struct inode *);
261extern int nilfs_set_file_dirty(struct nilfs_sb_info *, struct inode *,
262 unsigned);
263extern int nilfs_mark_inode_dirty(struct inode *);
264extern void nilfs_dirty_inode(struct inode *);
265
266/* namei.c */
267extern struct dentry *nilfs_get_parent(struct dentry *);
268
269/* super.c */
270extern struct inode *nilfs_alloc_inode(struct super_block *);
271extern void nilfs_destroy_inode(struct inode *);
272extern void nilfs_error(struct super_block *, const char *, const char *, ...)
273 __attribute__ ((format (printf, 3, 4)));
274extern void nilfs_warning(struct super_block *, const char *, const char *, ...)
275 __attribute__ ((format (printf, 3, 4)));
276extern struct nilfs_super_block *
277nilfs_read_super_block(struct super_block *, u64, int, struct buffer_head **);
278extern int nilfs_store_magic_and_option(struct super_block *,
279 struct nilfs_super_block *, char *);
280extern int nilfs_commit_super(struct nilfs_sb_info *, int);
281extern int nilfs_attach_checkpoint(struct nilfs_sb_info *, __u64);
282extern void nilfs_detach_checkpoint(struct nilfs_sb_info *);
283
284/* gcinode.c */
285int nilfs_gccache_submit_read_data(struct inode *, sector_t, sector_t, __u64,
286 struct buffer_head **);
287int nilfs_gccache_submit_read_node(struct inode *, sector_t, __u64,
288 struct buffer_head **);
289int nilfs_gccache_wait_and_mark_dirty(struct buffer_head *);
290int nilfs_init_gccache(struct the_nilfs *);
291void nilfs_destroy_gccache(struct the_nilfs *);
292void nilfs_clear_gcinode(struct inode *);
293struct inode *nilfs_gc_iget(struct the_nilfs *, ino_t, __u64);
294void nilfs_remove_all_gcinode(struct the_nilfs *);
295
296/* gcdat.c */
297int nilfs_init_gcdat_inode(struct the_nilfs *);
298void nilfs_commit_gcdat_inode(struct the_nilfs *);
299void nilfs_clear_gcdat_inode(struct the_nilfs *);
300
301/*
302 * Inodes and files operations
303 */
304extern struct file_operations nilfs_dir_operations;
305extern struct inode_operations nilfs_file_inode_operations;
306extern struct file_operations nilfs_file_operations;
307extern struct address_space_operations nilfs_aops;
308extern struct inode_operations nilfs_dir_inode_operations;
309extern struct inode_operations nilfs_special_inode_operations;
310extern struct inode_operations nilfs_symlink_inode_operations;
311
312/*
313 * filesystem type
314 */
315extern struct file_system_type nilfs_fs_type;
316
317
318#endif /* _NILFS_H */
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
new file mode 100644
index 000000000000..1bfbba9c0e9a
--- /dev/null
+++ b/fs/nilfs2/page.c
@@ -0,0 +1,540 @@
1/*
2 * page.c - buffer/page management specific to NILFS
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>,
21 * Seiji Kihara <kihara@osrg.net>.
22 */
23
24#include <linux/pagemap.h>
25#include <linux/writeback.h>
26#include <linux/swap.h>
27#include <linux/bitops.h>
28#include <linux/page-flags.h>
29#include <linux/list.h>
30#include <linux/highmem.h>
31#include <linux/pagevec.h>
32#include "nilfs.h"
33#include "page.h"
34#include "mdt.h"
35
36
37#define NILFS_BUFFER_INHERENT_BITS \
38 ((1UL << BH_Uptodate) | (1UL << BH_Mapped) | (1UL << BH_NILFS_Node) | \
39 (1UL << BH_NILFS_Volatile) | (1UL << BH_NILFS_Allocated))
40
41static struct buffer_head *
42__nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index,
43 int blkbits, unsigned long b_state)
44
45{
46 unsigned long first_block;
47 struct buffer_head *bh;
48
49 if (!page_has_buffers(page))
50 create_empty_buffers(page, 1 << blkbits, b_state);
51
52 first_block = (unsigned long)index << (PAGE_CACHE_SHIFT - blkbits);
53 bh = nilfs_page_get_nth_block(page, block - first_block);
54
55 touch_buffer(bh);
56 wait_on_buffer(bh);
57 return bh;
58}
59
60/*
61 * Since neither the page cache of B-tree node pages nor the data page cache
62 * of pseudo inodes has a valid mapping->host pointer, calling
63 * mark_buffer_dirty() for their buffers causes a NULL pointer dereference;
64 * it calls __mark_inode_dirty(NULL) through __set_page_dirty().
65 * To avoid this problem, this old-style dirtying helper is used instead.
66 */
67void nilfs_mark_buffer_dirty(struct buffer_head *bh)
68{
69 if (!buffer_dirty(bh) && !test_set_buffer_dirty(bh))
70 __set_page_dirty_nobuffers(bh->b_page);
71}
72
73struct buffer_head *nilfs_grab_buffer(struct inode *inode,
74 struct address_space *mapping,
75 unsigned long blkoff,
76 unsigned long b_state)
77{
78 int blkbits = inode->i_blkbits;
79 pgoff_t index = blkoff >> (PAGE_CACHE_SHIFT - blkbits);
80 struct page *page, *opage;
81 struct buffer_head *bh, *obh;
82
83 page = grab_cache_page(mapping, index);
84 if (unlikely(!page))
85 return NULL;
86
87 bh = __nilfs_get_page_block(page, blkoff, index, blkbits, b_state);
88 if (unlikely(!bh)) {
89 unlock_page(page);
90 page_cache_release(page);
91 return NULL;
92 }
93 if (!buffer_uptodate(bh) && mapping->assoc_mapping != NULL) {
94 /*
95		 * A shadow page cache uses assoc_mapping to point to its
96		 * original page cache. The following code falls back to the
97		 * original cache when the given cache is a shadow and the
		 * lookup missed.
98 */
99 opage = find_lock_page(mapping->assoc_mapping, index);
100 if (!opage)
101 return bh;
102
103 obh = __nilfs_get_page_block(opage, blkoff, index, blkbits,
104 b_state);
105 if (buffer_uptodate(obh)) {
106 nilfs_copy_buffer(bh, obh);
107 if (buffer_dirty(obh)) {
108 nilfs_mark_buffer_dirty(bh);
109 if (!buffer_nilfs_node(bh) && NILFS_MDT(inode))
110 nilfs_mdt_mark_dirty(inode);
111 }
112 }
113 brelse(obh);
114 unlock_page(opage);
115 page_cache_release(opage);
116 }
117 return bh;
118}
119
120/**
121 * nilfs_forget_buffer - discard dirty state
122 * @bh: buffer head of the buffer to be discarded
123 *
124 */
125void nilfs_forget_buffer(struct buffer_head *bh)
126{
127 struct page *page = bh->b_page;
128
129 lock_buffer(bh);
130 clear_buffer_nilfs_volatile(bh);
131 if (test_clear_buffer_dirty(bh) && nilfs_page_buffers_clean(page))
132 __nilfs_clear_page_dirty(page);
133
134 clear_buffer_uptodate(bh);
135 clear_buffer_mapped(bh);
136 bh->b_blocknr = -1;
137 ClearPageUptodate(page);
138 ClearPageMappedToDisk(page);
139 unlock_buffer(bh);
140 brelse(bh);
141}
142
143/**
144 * nilfs_copy_buffer -- copy buffer data and flags
145 * @dbh: destination buffer
146 * @sbh: source buffer
147 */
148void nilfs_copy_buffer(struct buffer_head *dbh, struct buffer_head *sbh)
149{
150 void *kaddr0, *kaddr1;
151 unsigned long bits;
152 struct page *spage = sbh->b_page, *dpage = dbh->b_page;
153 struct buffer_head *bh;
154
155 kaddr0 = kmap_atomic(spage, KM_USER0);
156 kaddr1 = kmap_atomic(dpage, KM_USER1);
157 memcpy(kaddr1 + bh_offset(dbh), kaddr0 + bh_offset(sbh), sbh->b_size);
158 kunmap_atomic(kaddr1, KM_USER1);
159 kunmap_atomic(kaddr0, KM_USER0);
160
161 dbh->b_state = sbh->b_state & NILFS_BUFFER_INHERENT_BITS;
162 dbh->b_blocknr = sbh->b_blocknr;
163 dbh->b_bdev = sbh->b_bdev;
164
165 bh = dbh;
166 bits = sbh->b_state & ((1UL << BH_Uptodate) | (1UL << BH_Mapped));
167 while ((bh = bh->b_this_page) != dbh) {
168 lock_buffer(bh);
169 bits &= bh->b_state;
170 unlock_buffer(bh);
171 }
172 if (bits & (1UL << BH_Uptodate))
173 SetPageUptodate(dpage);
174 else
175 ClearPageUptodate(dpage);
176 if (bits & (1UL << BH_Mapped))
177 SetPageMappedToDisk(dpage);
178 else
179 ClearPageMappedToDisk(dpage);
180}
181
182/**
183 * nilfs_page_buffers_clean - check if a page has dirty buffers or not.
184 * @page: page to be checked
185 *
186 * nilfs_page_buffers_clean() returns zero if the page has dirty buffers.
187 * Otherwise, it returns a non-zero value.
188 */
189int nilfs_page_buffers_clean(struct page *page)
190{
191 struct buffer_head *bh, *head;
192
193 bh = head = page_buffers(page);
194 do {
195 if (buffer_dirty(bh))
196 return 0;
197 bh = bh->b_this_page;
198 } while (bh != head);
199 return 1;
200}
201
202void nilfs_page_bug(struct page *page)
203{
204 struct address_space *m;
205 unsigned long ino = 0;
206
207 if (unlikely(!page)) {
208 printk(KERN_CRIT "NILFS_PAGE_BUG(NULL)\n");
209 return;
210 }
211
212 m = page->mapping;
213 if (m) {
214 struct inode *inode = NILFS_AS_I(m);
215 if (inode != NULL)
216 ino = inode->i_ino;
217 }
218 printk(KERN_CRIT "NILFS_PAGE_BUG(%p): cnt=%d index#=%llu flags=0x%lx "
219 "mapping=%p ino=%lu\n",
220 page, atomic_read(&page->_count),
221 (unsigned long long)page->index, page->flags, m, ino);
222
223 if (page_has_buffers(page)) {
224 struct buffer_head *bh, *head;
225 int i = 0;
226
227 bh = head = page_buffers(page);
228 do {
229 printk(KERN_CRIT
230 " BH[%d] %p: cnt=%d block#=%llu state=0x%lx\n",
231 i++, bh, atomic_read(&bh->b_count),
232 (unsigned long long)bh->b_blocknr, bh->b_state);
233 bh = bh->b_this_page;
234 } while (bh != head);
235 }
236}
237
238/**
239 * nilfs_alloc_private_page - allocate a private page with buffer heads
240 *
241 * Return Value: On success, a pointer to the allocated page is returned.
242 * On error, NULL is returned.
243 */
244struct page *nilfs_alloc_private_page(struct block_device *bdev, int size,
245 unsigned long state)
246{
247 struct buffer_head *bh, *head, *tail;
248 struct page *page;
249
250 page = alloc_page(GFP_NOFS); /* page_count of the returned page is 1 */
251 if (unlikely(!page))
252 return NULL;
253
254 lock_page(page);
255 head = alloc_page_buffers(page, size, 0);
256 if (unlikely(!head)) {
257 unlock_page(page);
258 __free_page(page);
259 return NULL;
260 }
261
262 bh = head;
263 do {
264 bh->b_state = (1UL << BH_NILFS_Allocated) | state;
265 tail = bh;
266 bh->b_bdev = bdev;
267 bh = bh->b_this_page;
268 } while (bh);
269
270 tail->b_this_page = head;
271 attach_page_buffers(page, head);
272
273 return page;
274}
275
276void nilfs_free_private_page(struct page *page)
277{
278 BUG_ON(!PageLocked(page));
279 BUG_ON(page->mapping);
280
281 if (page_has_buffers(page) && !try_to_free_buffers(page))
282 NILFS_PAGE_BUG(page, "failed to free page");
283
284 unlock_page(page);
285 __free_page(page);
286}
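A round-trip sketch of the private-page helpers; bdev and blocksize stand
in for caller-supplied values. Note that the page comes back locked, and
nilfs_free_private_page() both unlocks and frees it.

	struct page *page;

	page = nilfs_alloc_private_page(bdev, blocksize, 0);
	if (unlikely(!page))
		return -ENOMEM;

	/* the page is locked; fill the buffers reachable through
	   page_buffers(page) and submit I/O on them as needed */

	nilfs_free_private_page(page);	/* unlocks and frees the page */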
287
288/**
289 * nilfs_copy_page -- copy the page with buffers
290 * @dst: destination page
291 * @src: source page
292 * @copy_dirty: flag indicating whether to copy the buffers' dirty states
293 *
294 * This function is for both data pages and btnode pages. The dirty flag
295 * must be handled by the caller. The page must not be under I/O.
296 * Both the src and dst pages must be locked.
297 */
298static void nilfs_copy_page(struct page *dst, struct page *src, int copy_dirty)
299{
300 struct buffer_head *dbh, *dbufs, *sbh, *sbufs;
301 unsigned long mask = NILFS_BUFFER_INHERENT_BITS;
302
303 BUG_ON(PageWriteback(dst));
304
305 sbh = sbufs = page_buffers(src);
306 if (!page_has_buffers(dst))
307 create_empty_buffers(dst, sbh->b_size, 0);
308
309 if (copy_dirty)
310 mask |= (1UL << BH_Dirty);
311
312 dbh = dbufs = page_buffers(dst);
313 do {
314 lock_buffer(sbh);
315 lock_buffer(dbh);
316 dbh->b_state = sbh->b_state & mask;
317 dbh->b_blocknr = sbh->b_blocknr;
318 dbh->b_bdev = sbh->b_bdev;
319 sbh = sbh->b_this_page;
320 dbh = dbh->b_this_page;
321 } while (dbh != dbufs);
322
323 copy_highpage(dst, src);
324
325 if (PageUptodate(src) && !PageUptodate(dst))
326 SetPageUptodate(dst);
327 else if (!PageUptodate(src) && PageUptodate(dst))
328 ClearPageUptodate(dst);
329 if (PageMappedToDisk(src) && !PageMappedToDisk(dst))
330 SetPageMappedToDisk(dst);
331 else if (!PageMappedToDisk(src) && PageMappedToDisk(dst))
332 ClearPageMappedToDisk(dst);
333
334 do {
335 unlock_buffer(sbh);
336 unlock_buffer(dbh);
337 sbh = sbh->b_this_page;
338 dbh = dbh->b_this_page;
339 } while (dbh != dbufs);
340}
341
342int nilfs_copy_dirty_pages(struct address_space *dmap,
343 struct address_space *smap)
344{
345 struct pagevec pvec;
346 unsigned int i;
347 pgoff_t index = 0;
348 int err = 0;
349
350 pagevec_init(&pvec, 0);
351repeat:
352 if (!pagevec_lookup_tag(&pvec, smap, &index, PAGECACHE_TAG_DIRTY,
353 PAGEVEC_SIZE))
354 return 0;
355
356 for (i = 0; i < pagevec_count(&pvec); i++) {
357 struct page *page = pvec.pages[i], *dpage;
358
359 lock_page(page);
360 if (unlikely(!PageDirty(page)))
361 NILFS_PAGE_BUG(page, "inconsistent dirty state");
362
363 dpage = grab_cache_page(dmap, page->index);
364 if (unlikely(!dpage)) {
365 /* No empty page is added to the page cache */
366 err = -ENOMEM;
367 unlock_page(page);
368 break;
369 }
370 if (unlikely(!page_has_buffers(page)))
371 NILFS_PAGE_BUG(page,
372 "found empty page in dat page cache");
373
374 nilfs_copy_page(dpage, page, 1);
375 __set_page_dirty_nobuffers(dpage);
376
377 unlock_page(dpage);
378 page_cache_release(dpage);
379 unlock_page(page);
380 }
381 pagevec_release(&pvec);
382 cond_resched();
383
384 if (likely(!err))
385 goto repeat;
386 return err;
387}
388
389/**
390 * nilfs_copy_back_pages -- copy back pages to original cache from shadow cache
391 * @dmap: destination page cache
392 * @smap: source page cache
393 *
394 * No pages must be added to the cache during this process.
395 * This must be ensured by the caller.
396 */
397void nilfs_copy_back_pages(struct address_space *dmap,
398 struct address_space *smap)
399{
400 struct pagevec pvec;
401 unsigned int i, n;
402 pgoff_t index = 0;
403 int err;
404
405 pagevec_init(&pvec, 0);
406repeat:
407 n = pagevec_lookup(&pvec, smap, index, PAGEVEC_SIZE);
408 if (!n)
409 return;
410 index = pvec.pages[n - 1]->index + 1;
411
412 for (i = 0; i < pagevec_count(&pvec); i++) {
413 struct page *page = pvec.pages[i], *dpage;
414 pgoff_t offset = page->index;
415
416 lock_page(page);
417 dpage = find_lock_page(dmap, offset);
418 if (dpage) {
419 /* override existing page on the destination cache */
420 WARN_ON(PageDirty(dpage));
421 nilfs_copy_page(dpage, page, 0);
422 unlock_page(dpage);
423 page_cache_release(dpage);
424 } else {
425 struct page *page2;
426
427 /* move the page to the destination cache */
428 spin_lock_irq(&smap->tree_lock);
429 page2 = radix_tree_delete(&smap->page_tree, offset);
430 WARN_ON(page2 != page);
431
432 smap->nrpages--;
433 spin_unlock_irq(&smap->tree_lock);
434
435 spin_lock_irq(&dmap->tree_lock);
436 err = radix_tree_insert(&dmap->page_tree, offset, page);
437 if (unlikely(err < 0)) {
438 WARN_ON(err == -EEXIST);
439 page->mapping = NULL;
440 page_cache_release(page); /* for cache */
441 } else {
442 page->mapping = dmap;
443 dmap->nrpages++;
444 if (PageDirty(page))
445 radix_tree_tag_set(&dmap->page_tree,
446 offset,
447 PAGECACHE_TAG_DIRTY);
448 }
449 spin_unlock_irq(&dmap->tree_lock);
450 }
451 unlock_page(page);
452 }
453 pagevec_release(&pvec);
454 cond_resched();
455
456 goto repeat;
457}
458
459void nilfs_clear_dirty_pages(struct address_space *mapping)
460{
461 struct pagevec pvec;
462 unsigned int i;
463 pgoff_t index = 0;
464
465 pagevec_init(&pvec, 0);
466
467 while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
468 PAGEVEC_SIZE)) {
469 for (i = 0; i < pagevec_count(&pvec); i++) {
470 struct page *page = pvec.pages[i];
471 struct buffer_head *bh, *head;
472
473 lock_page(page);
474 ClearPageUptodate(page);
475 ClearPageMappedToDisk(page);
476 bh = head = page_buffers(page);
477 do {
478 lock_buffer(bh);
479 clear_buffer_dirty(bh);
480 clear_buffer_nilfs_volatile(bh);
481 clear_buffer_uptodate(bh);
482 clear_buffer_mapped(bh);
483 unlock_buffer(bh);
484 bh = bh->b_this_page;
485 } while (bh != head);
486
487 __nilfs_clear_page_dirty(page);
488 unlock_page(page);
489 }
490 pagevec_release(&pvec);
491 cond_resched();
492 }
493}
494
495unsigned nilfs_page_count_clean_buffers(struct page *page,
496 unsigned from, unsigned to)
497{
498 unsigned block_start, block_end;
499 struct buffer_head *bh, *head;
500 unsigned nc = 0;
501
502 for (bh = head = page_buffers(page), block_start = 0;
503 bh != head || !block_start;
504 block_start = block_end, bh = bh->b_this_page) {
505 block_end = block_start + bh->b_size;
506 if (block_end > from && block_start < to && !buffer_dirty(bh))
507 nc++;
508 }
509 return nc;
510}
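For instance, with 1 KiB buffers on a 4 KiB page and a write covering
bytes 512 to 3071, the first three buffers overlap the range and are
counted when clean, while the fourth (bytes 3072-4095) is skipped.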
511
512/*
513 * NILFS2 needs clear_page_dirty() in the following two cases:
514 *
515 * 1) For B-tree node pages and data pages of the dat/gcdat, NILFS2 clears
516 * page dirty flags when it copies back pages from the shadow cache
517 * (gcdat->{i_mapping,i_btnode_cache}) to its original cache
518 * (dat->{i_mapping,i_btnode_cache}).
519 *
 520 * 2) Some B-tree operations such as insertion or deletion may dispose of
 521 * dirty buffers, which requires cancelling the dirty state of their pages.
522 */
523int __nilfs_clear_page_dirty(struct page *page)
524{
525 struct address_space *mapping = page->mapping;
526
527 if (mapping) {
528 spin_lock_irq(&mapping->tree_lock);
529 if (test_bit(PG_dirty, &page->flags)) {
530 radix_tree_tag_clear(&mapping->page_tree,
531 page_index(page),
532 PAGECACHE_TAG_DIRTY);
533 spin_unlock_irq(&mapping->tree_lock);
534 return clear_page_dirty_for_io(page);
535 }
536 spin_unlock_irq(&mapping->tree_lock);
537 return 0;
538 }
539 return TestClearPageDirty(page);
540}
diff --git a/fs/nilfs2/page.h b/fs/nilfs2/page.h
new file mode 100644
index 000000000000..8abca4d1c1f8
--- /dev/null
+++ b/fs/nilfs2/page.h
@@ -0,0 +1,76 @@
1/*
2 * page.h - buffer/page management specific to NILFS
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>,
21 * Seiji Kihara <kihara@osrg.net>.
22 */
23
24#ifndef _NILFS_PAGE_H
25#define _NILFS_PAGE_H
26
27#include <linux/buffer_head.h>
28#include "nilfs.h"
29
30/*
31 * Extended buffer state bits
32 */
33enum {
34 BH_NILFS_Allocated = BH_PrivateStart,
35 BH_NILFS_Node,
36 BH_NILFS_Volatile,
37};
38
39BUFFER_FNS(NILFS_Allocated, nilfs_allocated) /* nilfs private buffers */
40BUFFER_FNS(NILFS_Node, nilfs_node) /* nilfs node buffers */
41BUFFER_FNS(NILFS_Volatile, nilfs_volatile)
42
43
44void nilfs_mark_buffer_dirty(struct buffer_head *bh);
45int __nilfs_clear_page_dirty(struct page *);
46
47struct buffer_head *nilfs_grab_buffer(struct inode *, struct address_space *,
48 unsigned long, unsigned long);
49void nilfs_forget_buffer(struct buffer_head *);
50void nilfs_copy_buffer(struct buffer_head *, struct buffer_head *);
51int nilfs_page_buffers_clean(struct page *);
52void nilfs_page_bug(struct page *);
53struct page *nilfs_alloc_private_page(struct block_device *, int,
54 unsigned long);
55void nilfs_free_private_page(struct page *);
56
57int nilfs_copy_dirty_pages(struct address_space *, struct address_space *);
58void nilfs_copy_back_pages(struct address_space *, struct address_space *);
59void nilfs_clear_dirty_pages(struct address_space *);
60unsigned nilfs_page_count_clean_buffers(struct page *, unsigned, unsigned);
61
62#define NILFS_PAGE_BUG(page, m, a...) \
63 do { nilfs_page_bug(page); BUG(); } while (0)
64
65static inline struct buffer_head *
66nilfs_page_get_nth_block(struct page *page, unsigned int count)
67{
68 struct buffer_head *bh = page_buffers(page);
69
70 while (count-- > 0)
71 bh = bh->b_this_page;
72 get_bh(bh);
73 return bh;
74}
75
76#endif /* _NILFS_PAGE_H */
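Usage note: nilfs_page_get_nth_block() above takes a reference on the returned buffer head via get_bh(), so every call must be paired with a brelse(). A minimal hypothetical sketch (the caller and its context are illustrative only, not part of the patch):

/* Illustrative sketch: mark the second block of a locked page with
 * attached buffers up to date, then drop the reference taken above. */
static void example_uptodate_second_block(struct page *page)
{
	struct buffer_head *bh = nilfs_page_get_nth_block(page, 1);

	set_buffer_uptodate(bh);
	brelse(bh);
}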
diff --git a/fs/nilfs2/recovery.c b/fs/nilfs2/recovery.c
new file mode 100644
index 000000000000..6ade0963fc1d
--- /dev/null
+++ b/fs/nilfs2/recovery.c
@@ -0,0 +1,929 @@
1/*
2 * recovery.c - NILFS recovery logic
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 */
22
23#include <linux/buffer_head.h>
24#include <linux/blkdev.h>
25#include <linux/swap.h>
26#include <linux/crc32.h>
27#include "nilfs.h"
28#include "segment.h"
29#include "sufile.h"
30#include "page.h"
31#include "seglist.h"
32#include "segbuf.h"
33
34/*
35 * Segment check result
36 */
37enum {
38 NILFS_SEG_VALID,
39 NILFS_SEG_NO_SUPER_ROOT,
40 NILFS_SEG_FAIL_IO,
41 NILFS_SEG_FAIL_MAGIC,
42 NILFS_SEG_FAIL_SEQ,
43 NILFS_SEG_FAIL_CHECKSUM_SEGSUM,
44 NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT,
45 NILFS_SEG_FAIL_CHECKSUM_FULL,
46 NILFS_SEG_FAIL_CONSISTENCY,
47};
48
49/* work structure for recovery */
50struct nilfs_recovery_block {
51 ino_t ino; /* Inode number of the file that this block
52 belongs to */
53 sector_t blocknr; /* block number */
54 __u64 vblocknr; /* virtual block number */
55 unsigned long blkoff; /* File offset of the data block (per block) */
56 struct list_head list;
57};
58
59
60static int nilfs_warn_segment_error(int err)
61{
62 switch (err) {
63 case NILFS_SEG_FAIL_IO:
64 printk(KERN_WARNING
65 "NILFS warning: I/O error on loading last segment\n");
66 return -EIO;
67 case NILFS_SEG_FAIL_MAGIC:
68 printk(KERN_WARNING
69 "NILFS warning: Segment magic number invalid\n");
70 break;
71 case NILFS_SEG_FAIL_SEQ:
72 printk(KERN_WARNING
73 "NILFS warning: Sequence number mismatch\n");
74 break;
75 case NILFS_SEG_FAIL_CHECKSUM_SEGSUM:
76 printk(KERN_WARNING
77 "NILFS warning: Checksum error in segment summary\n");
78 break;
79 case NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT:
80 printk(KERN_WARNING
81 "NILFS warning: Checksum error in super root\n");
82 break;
83 case NILFS_SEG_FAIL_CHECKSUM_FULL:
84 printk(KERN_WARNING
85 "NILFS warning: Checksum error in segment payload\n");
86 break;
87 case NILFS_SEG_FAIL_CONSISTENCY:
88 printk(KERN_WARNING
89 "NILFS warning: Inconsistent segment\n");
90 break;
91 case NILFS_SEG_NO_SUPER_ROOT:
92 printk(KERN_WARNING
93 "NILFS warning: No super root in the last segment\n");
94 break;
95 }
96 return -EINVAL;
97}
98
99static void store_segsum_info(struct nilfs_segsum_info *ssi,
100 struct nilfs_segment_summary *sum,
101 unsigned int blocksize)
102{
103 ssi->flags = le16_to_cpu(sum->ss_flags);
104 ssi->seg_seq = le64_to_cpu(sum->ss_seq);
105 ssi->ctime = le64_to_cpu(sum->ss_create);
106 ssi->next = le64_to_cpu(sum->ss_next);
107 ssi->nblocks = le32_to_cpu(sum->ss_nblocks);
108 ssi->nfinfo = le32_to_cpu(sum->ss_nfinfo);
109 ssi->sumbytes = le32_to_cpu(sum->ss_sumbytes);
110
111 ssi->nsumblk = DIV_ROUND_UP(ssi->sumbytes, blocksize);
112 ssi->nfileblk = ssi->nblocks - ssi->nsumblk - !!NILFS_SEG_HAS_SR(ssi);
113}
114
115/**
116 * calc_crc_cont - check CRC of blocks continuously
117 * @sbi: nilfs_sb_info
118 * @bhs: buffer head of start block
119 * @sum: place to store result
120 * @offset: offset bytes in the first block
121 * @check_bytes: number of bytes to be checked
122 * @start: DBN of start block
123 * @nblock: number of blocks to be checked
124 */
125static int calc_crc_cont(struct nilfs_sb_info *sbi, struct buffer_head *bhs,
126 u32 *sum, unsigned long offset, u64 check_bytes,
127 sector_t start, unsigned long nblock)
128{
129 unsigned long blocksize = sbi->s_super->s_blocksize;
130 unsigned long size;
131 u32 crc;
132
133 BUG_ON(offset >= blocksize);
134 check_bytes -= offset;
135 size = min_t(u64, check_bytes, blocksize - offset);
136 crc = crc32_le(sbi->s_nilfs->ns_crc_seed,
137 (unsigned char *)bhs->b_data + offset, size);
138 if (--nblock > 0) {
139 do {
140 struct buffer_head *bh
141 = sb_bread(sbi->s_super, ++start);
142 if (!bh)
143 return -EIO;
144 check_bytes -= size;
145 size = min_t(u64, check_bytes, blocksize);
146 crc = crc32_le(crc, bh->b_data, size);
147 brelse(bh);
148 } while (--nblock > 0);
149 }
150 *sum = crc;
151 return 0;
152}
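calc_crc_cont() folds the checksum across consecutive disk blocks by feeding the running value back as the seed of the next crc32_le() call, which is equivalent to a single CRC over the concatenated byte range. A hypothetical stand-alone sketch of that folding pattern (names are illustrative):

/* Illustrative sketch: the CRC-32 of several buffers, computed by
 * chaining the running value as the seed, equals the CRC-32 of their
 * concatenation. */
static u32 example_crc_fold(u32 seed, const u8 **bufs,
			    const size_t *lens, unsigned int n)
{
	u32 crc = seed;
	unsigned int i;

	for (i = 0; i < n; i++)
		crc = crc32_le(crc, bufs[i], lens[i]);
	return crc;
}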
153
154/**
155 * nilfs_read_super_root_block - read super root block
156 * @sb: super_block
157 * @sr_block: disk block number of the super root block
158 * @pbh: address of a buffer_head pointer to return super root buffer
159 * @check: CRC check flag
160 */
161int nilfs_read_super_root_block(struct super_block *sb, sector_t sr_block,
162 struct buffer_head **pbh, int check)
163{
164 struct buffer_head *bh_sr;
165 struct nilfs_super_root *sr;
166 u32 crc;
167 int ret;
168
169 *pbh = NULL;
170 bh_sr = sb_bread(sb, sr_block);
171 if (unlikely(!bh_sr)) {
172 ret = NILFS_SEG_FAIL_IO;
173 goto failed;
174 }
175
176 sr = (struct nilfs_super_root *)bh_sr->b_data;
177 if (check) {
178 unsigned bytes = le16_to_cpu(sr->sr_bytes);
179
180 if (bytes == 0 || bytes > sb->s_blocksize) {
181 ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT;
182 goto failed_bh;
183 }
184 if (calc_crc_cont(NILFS_SB(sb), bh_sr, &crc,
185 sizeof(sr->sr_sum), bytes, sr_block, 1)) {
186 ret = NILFS_SEG_FAIL_IO;
187 goto failed_bh;
188 }
189 if (crc != le32_to_cpu(sr->sr_sum)) {
190 ret = NILFS_SEG_FAIL_CHECKSUM_SUPER_ROOT;
191 goto failed_bh;
192 }
193 }
194 *pbh = bh_sr;
195 return 0;
196
197 failed_bh:
198 brelse(bh_sr);
199
200 failed:
201 return nilfs_warn_segment_error(ret);
202}
203
204/**
205 * load_segment_summary - read segment summary of the specified partial segment
206 * @sbi: nilfs_sb_info
207 * @pseg_start: start disk block number of partial segment
208 * @seg_seq: sequence number requested
209 * @ssi: pointer to nilfs_segsum_info struct to store information
210 * @full_check: full check flag
211 * (0: only checks segment summary CRC, 1: data CRC)
212 */
213static int
214load_segment_summary(struct nilfs_sb_info *sbi, sector_t pseg_start,
215 u64 seg_seq, struct nilfs_segsum_info *ssi,
216 int full_check)
217{
218 struct buffer_head *bh_sum;
219 struct nilfs_segment_summary *sum;
220 unsigned long offset, nblock;
221 u64 check_bytes;
222 u32 crc, crc_sum;
223 int ret = NILFS_SEG_FAIL_IO;
224
225 bh_sum = sb_bread(sbi->s_super, pseg_start);
226 if (!bh_sum)
227 goto out;
228
229 sum = (struct nilfs_segment_summary *)bh_sum->b_data;
230
231 /* Check consistency of segment summary */
232 if (le32_to_cpu(sum->ss_magic) != NILFS_SEGSUM_MAGIC) {
233 ret = NILFS_SEG_FAIL_MAGIC;
234 goto failed;
235 }
236 store_segsum_info(ssi, sum, sbi->s_super->s_blocksize);
237 if (seg_seq != ssi->seg_seq) {
238 ret = NILFS_SEG_FAIL_SEQ;
239 goto failed;
240 }
241 if (full_check) {
242 offset = sizeof(sum->ss_datasum);
243 check_bytes =
244 ((u64)ssi->nblocks << sbi->s_super->s_blocksize_bits);
245 nblock = ssi->nblocks;
246 crc_sum = le32_to_cpu(sum->ss_datasum);
247 ret = NILFS_SEG_FAIL_CHECKSUM_FULL;
248 } else { /* only checks segment summary */
249 offset = sizeof(sum->ss_datasum) + sizeof(sum->ss_sumsum);
250 check_bytes = ssi->sumbytes;
251 nblock = ssi->nsumblk;
252 crc_sum = le32_to_cpu(sum->ss_sumsum);
253 ret = NILFS_SEG_FAIL_CHECKSUM_SEGSUM;
254 }
255
256 if (unlikely(nblock == 0 ||
257 nblock > sbi->s_nilfs->ns_blocks_per_segment)) {
258 /* This limits the number of blocks read in the CRC check */
259 ret = NILFS_SEG_FAIL_CONSISTENCY;
260 goto failed;
261 }
262 if (calc_crc_cont(sbi, bh_sum, &crc, offset, check_bytes,
263 pseg_start, nblock)) {
264 ret = NILFS_SEG_FAIL_IO;
265 goto failed;
266 }
267 if (crc == crc_sum)
268 ret = 0;
269 failed:
270 brelse(bh_sum);
271 out:
272 return ret;
273}
274
275static void *segsum_get(struct super_block *sb, struct buffer_head **pbh,
276 unsigned int *offset, unsigned int bytes)
277{
278 void *ptr;
279 sector_t blocknr;
280
281 BUG_ON((*pbh)->b_size < *offset);
282 if (bytes > (*pbh)->b_size - *offset) {
283 blocknr = (*pbh)->b_blocknr;
284 brelse(*pbh);
285 *pbh = sb_bread(sb, blocknr + 1);
286 if (unlikely(!*pbh))
287 return NULL;
288 *offset = 0;
289 }
290 ptr = (*pbh)->b_data + *offset;
291 *offset += bytes;
292 return ptr;
293}
294
295static void segsum_skip(struct super_block *sb, struct buffer_head **pbh,
296 unsigned int *offset, unsigned int bytes,
297 unsigned long count)
298{
299 unsigned int rest_item_in_current_block
300 = ((*pbh)->b_size - *offset) / bytes;
301
302 if (count <= rest_item_in_current_block) {
303 *offset += bytes * count;
304 } else {
305 sector_t blocknr = (*pbh)->b_blocknr;
306 unsigned int nitem_per_block = (*pbh)->b_size / bytes;
307 unsigned int bcnt;
308
309 count -= rest_item_in_current_block;
310 bcnt = DIV_ROUND_UP(count, nitem_per_block);
311 *offset = bytes * (count - (bcnt - 1) * nitem_per_block);
312
313 brelse(*pbh);
314 *pbh = sb_bread(sb, blocknr + bcnt);
315 }
316}
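segsum_skip() converts a linear count of fixed-size summary items into a whole-block advance plus a byte offset in the final block. A hypothetical sketch isolating just that arithmetic (function and parameter names are illustrative):

/* Illustrative sketch: given `count` items remaining beyond the current
 * block, compute how many blocks to advance (bcnt) and the byte offset
 * within the block landed on. */
static void example_skip_items(unsigned int block_size, unsigned int item_size,
			       unsigned long count,
			       unsigned int *bcnt, unsigned int *offset)
{
	unsigned int per_block = block_size / item_size;

	*bcnt = DIV_ROUND_UP(count, per_block);
	*offset = item_size * (count - (*bcnt - 1) * per_block);
}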
317
318static int
319collect_blocks_from_segsum(struct nilfs_sb_info *sbi, sector_t sum_blocknr,
320 struct nilfs_segsum_info *ssi,
321 struct list_head *head)
322{
323 struct buffer_head *bh;
324 unsigned int offset;
325 unsigned long nfinfo = ssi->nfinfo;
326 sector_t blocknr = sum_blocknr + ssi->nsumblk;
327 ino_t ino;
328 int err = -EIO;
329
330 if (!nfinfo)
331 return 0;
332
333 bh = sb_bread(sbi->s_super, sum_blocknr);
334 if (unlikely(!bh))
335 goto out;
336
337 offset = le16_to_cpu(
338 ((struct nilfs_segment_summary *)bh->b_data)->ss_bytes);
339 for (;;) {
340 unsigned long nblocks, ndatablk, nnodeblk;
341 struct nilfs_finfo *finfo;
342
343 finfo = segsum_get(sbi->s_super, &bh, &offset, sizeof(*finfo));
344 if (unlikely(!finfo))
345 goto out;
346
347 ino = le64_to_cpu(finfo->fi_ino);
348 nblocks = le32_to_cpu(finfo->fi_nblocks);
349 ndatablk = le32_to_cpu(finfo->fi_ndatablk);
350 nnodeblk = nblocks - ndatablk;
351
352 while (ndatablk-- > 0) {
353 struct nilfs_recovery_block *rb;
354 struct nilfs_binfo_v *binfo;
355
356 binfo = segsum_get(sbi->s_super, &bh, &offset,
357 sizeof(*binfo));
358 if (unlikely(!binfo))
359 goto out;
360
361 rb = kmalloc(sizeof(*rb), GFP_NOFS);
362 if (unlikely(!rb)) {
363 err = -ENOMEM;
364 goto out;
365 }
366 rb->ino = ino;
367 rb->blocknr = blocknr++;
368 rb->vblocknr = le64_to_cpu(binfo->bi_vblocknr);
369 rb->blkoff = le64_to_cpu(binfo->bi_blkoff);
370 /* INIT_LIST_HEAD(&rb->list); */
371 list_add_tail(&rb->list, head);
372 }
373 if (--nfinfo == 0)
374 break;
375 blocknr += nnodeblk; /* always 0 for the data sync segments */
376 segsum_skip(sbi->s_super, &bh, &offset, sizeof(__le64),
377 nnodeblk);
378 if (unlikely(!bh))
379 goto out;
380 }
381 err = 0;
382 out:
383 brelse(bh); /* brelse(NULL) is just ignored */
384 return err;
385}
386
387static void dispose_recovery_list(struct list_head *head)
388{
389 while (!list_empty(head)) {
390 struct nilfs_recovery_block *rb
391 = list_entry(head->next,
392 struct nilfs_recovery_block, list);
393 list_del(&rb->list);
394 kfree(rb);
395 }
396}
397
398void nilfs_dispose_segment_list(struct list_head *head)
399{
400 while (!list_empty(head)) {
401 struct nilfs_segment_entry *ent
402 = list_entry(head->next,
403 struct nilfs_segment_entry, list);
404 list_del(&ent->list);
405 nilfs_free_segment_entry(ent);
406 }
407}
408
409static int nilfs_prepare_segment_for_recovery(struct the_nilfs *nilfs,
410 struct nilfs_recovery_info *ri)
411{
412 struct list_head *head = &ri->ri_used_segments;
413 struct nilfs_segment_entry *ent, *n;
414 struct inode *sufile = nilfs->ns_sufile;
415 __u64 segnum[4];
416 time_t mtime;
417 int err;
418 int i;
419
420 segnum[0] = nilfs->ns_segnum;
421 segnum[1] = nilfs->ns_nextnum;
422 segnum[2] = ri->ri_segnum;
423 segnum[3] = ri->ri_nextnum;
424
425 /*
426 * Releasing the next segment of the latest super root.
427 * The next segment is invalidated by this recovery.
428 */
429 err = nilfs_sufile_free(sufile, segnum[1]);
430 if (unlikely(err))
431 goto failed;
432
433 err = -ENOMEM;
434 for (i = 1; i < 4; i++) {
435 ent = nilfs_alloc_segment_entry(segnum[i]);
436 if (unlikely(!ent))
437 goto failed;
438 list_add_tail(&ent->list, head);
439 }
440
441 /*
442 * Collecting segments written after the latest super root.
443 * These are marked dirty to avoid being reallocated in the next write.
444 */
445 mtime = get_seconds();
446 list_for_each_entry_safe(ent, n, head, list) {
447 if (ent->segnum == segnum[0]) {
448 list_del(&ent->list);
449 nilfs_free_segment_entry(ent);
450 continue;
451 }
452 err = nilfs_open_segment_entry(ent, sufile);
453 if (unlikely(err))
454 goto failed;
455 if (!nilfs_segment_usage_dirty(ent->raw_su)) {
456 /* make the segment garbage */
457 ent->raw_su->su_nblocks = cpu_to_le32(0);
458 ent->raw_su->su_lastmod = cpu_to_le32(mtime);
459 nilfs_segment_usage_set_dirty(ent->raw_su);
460 }
461 list_del(&ent->list);
462 nilfs_close_segment_entry(ent, sufile);
463 nilfs_free_segment_entry(ent);
464 }
465
466 /* Allocate new segments for recovery */
467 err = nilfs_sufile_alloc(sufile, &segnum[0]);
468 if (unlikely(err))
469 goto failed;
470
471 nilfs->ns_pseg_offset = 0;
472 nilfs->ns_seg_seq = ri->ri_seq + 2;
473 nilfs->ns_nextnum = nilfs->ns_segnum = segnum[0];
474 return 0;
475
476 failed:
477 /* No need to recover sufile because it will be destroyed on error */
478 return err;
479}
480
481static int nilfs_recovery_copy_block(struct nilfs_sb_info *sbi,
482 struct nilfs_recovery_block *rb,
483 struct page *page)
484{
485 struct buffer_head *bh_org;
486 void *kaddr;
487
488 bh_org = sb_bread(sbi->s_super, rb->blocknr);
489 if (unlikely(!bh_org))
490 return -EIO;
491
492 kaddr = kmap_atomic(page, KM_USER0);
493 memcpy(kaddr + bh_offset(bh_org), bh_org->b_data, bh_org->b_size);
494 kunmap_atomic(kaddr, KM_USER0);
495 brelse(bh_org);
496 return 0;
497}
498
499static int recover_dsync_blocks(struct nilfs_sb_info *sbi,
500 struct list_head *head,
501 unsigned long *nr_salvaged_blocks)
502{
503 struct inode *inode;
504 struct nilfs_recovery_block *rb, *n;
505 unsigned blocksize = sbi->s_super->s_blocksize;
506 struct page *page;
507 loff_t pos;
508 int err = 0, err2 = 0;
509
510 list_for_each_entry_safe(rb, n, head, list) {
511 inode = nilfs_iget(sbi->s_super, rb->ino);
512 if (IS_ERR(inode)) {
513 err = PTR_ERR(inode);
514 inode = NULL;
515 goto failed_inode;
516 }
517
518 pos = rb->blkoff << inode->i_blkbits;
519 page = NULL;
520 err = block_write_begin(NULL, inode->i_mapping, pos, blocksize,
521 0, &page, NULL, nilfs_get_block);
522 if (unlikely(err))
523 goto failed_inode;
524
525 err = nilfs_recovery_copy_block(sbi, rb, page);
526 if (unlikely(err))
527 goto failed_page;
528
529 err = nilfs_set_file_dirty(sbi, inode, 1);
530 if (unlikely(err))
531 goto failed_page;
532
533 block_write_end(NULL, inode->i_mapping, pos, blocksize,
534 blocksize, page, NULL);
535
536 unlock_page(page);
537 page_cache_release(page);
538
539 (*nr_salvaged_blocks)++;
540 goto next;
541
542 failed_page:
543 unlock_page(page);
544 page_cache_release(page);
545
546 failed_inode:
547 printk(KERN_WARNING
548 "NILFS warning: error recovering data block "
549 "(err=%d, ino=%lu, block-offset=%llu)\n",
550 err, rb->ino, (unsigned long long)rb->blkoff);
551 if (!err2)
552 err2 = err;
553 next:
554 iput(inode); /* iput(NULL) is just ignored */
555 list_del_init(&rb->list);
556 kfree(rb);
557 }
558 return err2;
559}
560
561/**
562 * nilfs_do_roll_forward - salvage logical segments newer than the latest
563 * checkpoint
564 * @sbi: nilfs_sb_info
565 * @nilfs: the_nilfs
566 * @ri: pointer to a nilfs_recovery_info
567 */
568static int nilfs_do_roll_forward(struct the_nilfs *nilfs,
569 struct nilfs_sb_info *sbi,
570 struct nilfs_recovery_info *ri)
571{
572 struct nilfs_segsum_info ssi;
573 sector_t pseg_start;
574 sector_t seg_start, seg_end; /* Starting/ending DBN of full segment */
575 unsigned long nsalvaged_blocks = 0;
576 u64 seg_seq;
577 __u64 segnum, nextnum = 0;
578 int empty_seg = 0;
579 int err = 0, ret;
580 LIST_HEAD(dsync_blocks); /* list of data blocks to be recovered */
581 enum {
582 RF_INIT_ST,
583 RF_DSYNC_ST, /* scanning data-sync segments */
584 };
585 int state = RF_INIT_ST;
586
587 nilfs_attach_writer(nilfs, sbi);
588 pseg_start = ri->ri_lsegs_start;
589 seg_seq = ri->ri_lsegs_start_seq;
590 segnum = nilfs_get_segnum_of_block(nilfs, pseg_start);
591 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
592
593 while (segnum != ri->ri_segnum || pseg_start <= ri->ri_pseg_start) {
594
595 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1);
596 if (ret) {
597 if (ret == NILFS_SEG_FAIL_IO) {
598 err = -EIO;
599 goto failed;
600 }
601 goto strayed;
602 }
603 if (unlikely(NILFS_SEG_HAS_SR(&ssi)))
604 goto confused;
605
606 /* Found a valid partial segment; do recovery actions */
607 nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next);
608 empty_seg = 0;
609 nilfs->ns_ctime = ssi.ctime;
610 if (!(ssi.flags & NILFS_SS_GC))
611 nilfs->ns_nongc_ctime = ssi.ctime;
612
613 switch (state) {
614 case RF_INIT_ST:
615 if (!NILFS_SEG_LOGBGN(&ssi) || !NILFS_SEG_DSYNC(&ssi))
616 goto try_next_pseg;
617 state = RF_DSYNC_ST;
618 /* Fall through */
619 case RF_DSYNC_ST:
620 if (!NILFS_SEG_DSYNC(&ssi))
621 goto confused;
622
623 err = collect_blocks_from_segsum(
624 sbi, pseg_start, &ssi, &dsync_blocks);
625 if (unlikely(err))
626 goto failed;
627 if (NILFS_SEG_LOGEND(&ssi)) {
628 err = recover_dsync_blocks(
629 sbi, &dsync_blocks, &nsalvaged_blocks);
630 if (unlikely(err))
631 goto failed;
632 state = RF_INIT_ST;
633 }
634 break; /* Fall through to try_next_pseg */
635 }
636
637 try_next_pseg:
638 if (pseg_start == ri->ri_lsegs_end)
639 break;
640 pseg_start += ssi.nblocks;
641 if (pseg_start < seg_end)
642 continue;
643 goto feed_segment;
644
645 strayed:
646 if (pseg_start == ri->ri_lsegs_end)
647 break;
648
649 feed_segment:
 650 /* Looking for the next full segment */
651 if (empty_seg++)
652 break;
653 seg_seq++;
654 segnum = nextnum;
655 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
656 pseg_start = seg_start;
657 }
658
659 if (nsalvaged_blocks) {
660 printk(KERN_INFO "NILFS (device %s): salvaged %lu blocks\n",
661 sbi->s_super->s_id, nsalvaged_blocks);
662 ri->ri_need_recovery = NILFS_RECOVERY_ROLLFORWARD_DONE;
663 }
664 out:
665 dispose_recovery_list(&dsync_blocks);
666 nilfs_detach_writer(sbi->s_nilfs, sbi);
667 return err;
668
669 confused:
670 err = -EINVAL;
671 failed:
672 printk(KERN_ERR
673 "NILFS (device %s): Error roll-forwarding "
674 "(err=%d, pseg block=%llu). ",
675 sbi->s_super->s_id, err, (unsigned long long)pseg_start);
676 goto out;
677}
678
679static void nilfs_finish_roll_forward(struct the_nilfs *nilfs,
680 struct nilfs_sb_info *sbi,
681 struct nilfs_recovery_info *ri)
682{
683 struct buffer_head *bh;
684 int err;
685
686 if (nilfs_get_segnum_of_block(nilfs, ri->ri_lsegs_start) !=
687 nilfs_get_segnum_of_block(nilfs, ri->ri_super_root))
688 return;
689
690 bh = sb_getblk(sbi->s_super, ri->ri_lsegs_start);
691 BUG_ON(!bh);
692 memset(bh->b_data, 0, bh->b_size);
693 set_buffer_dirty(bh);
694 err = sync_dirty_buffer(bh);
695 if (unlikely(err))
696 printk(KERN_WARNING
697 "NILFS warning: buffer sync write failed during "
698 "post-cleaning of recovery.\n");
699 brelse(bh);
700}
701
702/**
703 * nilfs_recover_logical_segments - salvage logical segments written after
704 * the latest super root
705 * @nilfs: the_nilfs
706 * @sbi: nilfs_sb_info
707 * @ri: pointer to a nilfs_recovery_info struct to store search results.
708 *
709 * Return Value: On success, 0 is returned. On error, one of the following
 710 * negative error codes is returned.
711 *
712 * %-EINVAL - Inconsistent filesystem state.
713 *
714 * %-EIO - I/O error
715 *
716 * %-ENOSPC - No space left on device (only in a panic state).
717 *
718 * %-ERESTARTSYS - Interrupted.
719 *
720 * %-ENOMEM - Insufficient memory available.
721 */
722int nilfs_recover_logical_segments(struct the_nilfs *nilfs,
723 struct nilfs_sb_info *sbi,
724 struct nilfs_recovery_info *ri)
725{
726 int err;
727
728 if (ri->ri_lsegs_start == 0 || ri->ri_lsegs_end == 0)
729 return 0;
730
731 err = nilfs_attach_checkpoint(sbi, ri->ri_cno);
732 if (unlikely(err)) {
733 printk(KERN_ERR
734 "NILFS: error loading the latest checkpoint.\n");
735 return err;
736 }
737
738 err = nilfs_do_roll_forward(nilfs, sbi, ri);
739 if (unlikely(err))
740 goto failed;
741
742 if (ri->ri_need_recovery == NILFS_RECOVERY_ROLLFORWARD_DONE) {
743 err = nilfs_prepare_segment_for_recovery(nilfs, ri);
744 if (unlikely(err)) {
745 printk(KERN_ERR "NILFS: Error preparing segments for "
746 "recovery.\n");
747 goto failed;
748 }
749
750 err = nilfs_attach_segment_constructor(sbi);
751 if (unlikely(err))
752 goto failed;
753
754 set_nilfs_discontinued(nilfs);
755 err = nilfs_construct_segment(sbi->s_super);
756 nilfs_detach_segment_constructor(sbi);
757
758 if (unlikely(err)) {
759 printk(KERN_ERR "NILFS: Oops! recovery failed. "
760 "(err=%d)\n", err);
761 goto failed;
762 }
763
764 nilfs_finish_roll_forward(nilfs, sbi, ri);
765 }
766
767 nilfs_detach_checkpoint(sbi);
768 return 0;
769
770 failed:
771 nilfs_detach_checkpoint(sbi);
772 nilfs_mdt_clear(nilfs->ns_cpfile);
773 nilfs_mdt_clear(nilfs->ns_sufile);
774 nilfs_mdt_clear(nilfs->ns_dat);
775 return err;
776}
777
778/**
779 * nilfs_search_super_root - search the latest valid super root
780 * @nilfs: the_nilfs
781 * @sbi: nilfs_sb_info
782 * @ri: pointer to a nilfs_recovery_info struct to store search results.
783 *
 784 * nilfs_search_super_root() looks for the latest super root, starting from
 785 * the partial segment pointed to by the superblock. It sets up struct the_nilfs
 786 * through this search and fills the nilfs_recovery_info (ri) needed for recovery.
787 *
788 * Return Value: On success, 0 is returned. On error, one of the following
 789 * negative error codes is returned.
790 *
791 * %-EINVAL - No valid segment found
792 *
793 * %-EIO - I/O error
794 */
795int nilfs_search_super_root(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi,
796 struct nilfs_recovery_info *ri)
797{
798 struct nilfs_segsum_info ssi;
799 sector_t pseg_start, pseg_end, sr_pseg_start = 0;
800 sector_t seg_start, seg_end; /* range of full segment (block number) */
801 u64 seg_seq;
802 __u64 segnum, nextnum = 0;
803 __u64 cno;
804 struct nilfs_segment_entry *ent;
805 LIST_HEAD(segments);
806 int empty_seg = 0, scan_newer = 0;
807 int ret;
808
809 pseg_start = nilfs->ns_last_pseg;
810 seg_seq = nilfs->ns_last_seq;
811 cno = nilfs->ns_last_cno;
812 segnum = nilfs_get_segnum_of_block(nilfs, pseg_start);
813
814 /* Calculate range of segment */
815 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
816
817 for (;;) {
818 /* Load segment summary */
819 ret = load_segment_summary(sbi, pseg_start, seg_seq, &ssi, 1);
820 if (ret) {
821 if (ret == NILFS_SEG_FAIL_IO)
822 goto failed;
823 goto strayed;
824 }
825 pseg_end = pseg_start + ssi.nblocks - 1;
826 if (unlikely(pseg_end > seg_end)) {
827 ret = NILFS_SEG_FAIL_CONSISTENCY;
828 goto strayed;
829 }
830
831 /* A valid partial segment */
832 ri->ri_pseg_start = pseg_start;
833 ri->ri_seq = seg_seq;
834 ri->ri_segnum = segnum;
835 nextnum = nilfs_get_segnum_of_block(nilfs, ssi.next);
836 ri->ri_nextnum = nextnum;
837 empty_seg = 0;
838
839 if (!NILFS_SEG_HAS_SR(&ssi)) {
840 if (!scan_newer) {
841 /* This will never happen because a superblock
842 (last_segment) always points to a pseg
843 having a super root. */
844 ret = NILFS_SEG_FAIL_CONSISTENCY;
845 goto failed;
846 }
847 if (!ri->ri_lsegs_start && NILFS_SEG_LOGBGN(&ssi)) {
848 ri->ri_lsegs_start = pseg_start;
849 ri->ri_lsegs_start_seq = seg_seq;
850 }
851 if (NILFS_SEG_LOGEND(&ssi))
852 ri->ri_lsegs_end = pseg_start;
853 goto try_next_pseg;
854 }
855
856 /* A valid super root was found. */
857 ri->ri_cno = cno++;
858 ri->ri_super_root = pseg_end;
859 ri->ri_lsegs_start = ri->ri_lsegs_end = 0;
860
861 nilfs_dispose_segment_list(&segments);
862 nilfs->ns_pseg_offset = (sr_pseg_start = pseg_start)
863 + ssi.nblocks - seg_start;
864 nilfs->ns_seg_seq = seg_seq;
865 nilfs->ns_segnum = segnum;
866 nilfs->ns_cno = cno; /* nilfs->ns_cno = ri->ri_cno + 1 */
867 nilfs->ns_ctime = ssi.ctime;
868 nilfs->ns_nextnum = nextnum;
869
870 if (scan_newer)
871 ri->ri_need_recovery = NILFS_RECOVERY_SR_UPDATED;
872 else {
873 if (nilfs->ns_mount_state & NILFS_VALID_FS)
874 goto super_root_found;
875 scan_newer = 1;
876 }
877
878 /* reset region for roll-forward */
879 pseg_start += ssi.nblocks;
880 if (pseg_start < seg_end)
881 continue;
882 goto feed_segment;
883
884 try_next_pseg:
 885 /* Still on course, or an inconsistent state was encountered */
886 pseg_start += ssi.nblocks;
887 if (pseg_start < seg_end)
888 continue;
889 goto feed_segment;
890
891 strayed:
892 /* Off the trail */
893 if (!scan_newer)
894 /*
895 * This can happen if a checkpoint was written without
896 * barriers, or as a result of an I/O failure.
897 */
898 goto failed;
899
900 feed_segment:
 901 /* Looking for the next full segment */
902 if (empty_seg++)
903 goto super_root_found; /* found a valid super root */
904
905 ent = nilfs_alloc_segment_entry(segnum);
906 if (unlikely(!ent)) {
907 ret = -ENOMEM;
908 goto failed;
909 }
910 list_add_tail(&ent->list, &segments);
911
912 seg_seq++;
913 segnum = nextnum;
914 nilfs_get_segment_range(nilfs, segnum, &seg_start, &seg_end);
915 pseg_start = seg_start;
916 }
917
918 super_root_found:
919 /* Updating pointers relating to the latest checkpoint */
920 list_splice(&segments, ri->ri_used_segments.prev);
921 nilfs->ns_last_pseg = sr_pseg_start;
922 nilfs->ns_last_seq = nilfs->ns_seg_seq;
923 nilfs->ns_last_cno = ri->ri_cno;
924 return 0;
925
926 failed:
927 nilfs_dispose_segment_list(&segments);
928 return (ret < 0) ? ret : nilfs_warn_segment_error(ret);
929}
diff --git a/fs/nilfs2/sb.h b/fs/nilfs2/sb.h
new file mode 100644
index 000000000000..adccd4fc654e
--- /dev/null
+++ b/fs/nilfs2/sb.h
@@ -0,0 +1,102 @@
1/*
2 * sb.h - NILFS on-memory super block structure.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#ifndef _NILFS_SB
25#define _NILFS_SB
26
27#include <linux/types.h>
28#include <linux/fs.h>
29
30/*
31 * Mount options
32 */
33struct nilfs_mount_options {
34 unsigned long mount_opt;
35 __u64 snapshot_cno;
36};
37
38struct the_nilfs;
39struct nilfs_sc_info;
40
41/*
42 * NILFS super-block data in memory
43 */
44struct nilfs_sb_info {
45 /* Snapshot status */
46 __u64 s_snapshot_cno; /* Checkpoint number */
47 atomic_t s_inodes_count;
48 atomic_t s_blocks_count; /* Reserved (might be deleted) */
49
50 /* Mount options */
51 unsigned long s_mount_opt;
52 uid_t s_resuid;
53 gid_t s_resgid;
54
55 unsigned long s_interval; /* construction interval */
56 unsigned long s_watermark; /* threshold of data amount
57 for the segment construction */
58
59 /* Fundamental members */
60 struct super_block *s_super; /* reverse pointer to super_block */
61 struct the_nilfs *s_nilfs;
62 struct list_head s_list; /* list head for nilfs->ns_supers */
63
64 /* Segment constructor */
65 struct list_head s_dirty_files; /* dirty files list */
66 struct nilfs_sc_info *s_sc_info; /* segment constructor info */
67 spinlock_t s_inode_lock; /* Lock for the nilfs inode.
68 It covers s_dirty_files list */
69
70 /* Metadata files */
71 struct inode *s_ifile; /* index file inode */
72
73 /* Inode allocator */
74 spinlock_t s_next_gen_lock;
75 u32 s_next_generation;
76};
77
78static inline struct nilfs_sb_info *NILFS_SB(struct super_block *sb)
79{
80 return sb->s_fs_info;
81}
82
83static inline struct nilfs_sc_info *NILFS_SC(struct nilfs_sb_info *sbi)
84{
85 return sbi->s_sc_info;
86}
87
88/*
89 * Bit operations for the mount option
90 */
91#define nilfs_clear_opt(sbi, opt) \
92 do { (sbi)->s_mount_opt &= ~NILFS_MOUNT_##opt; } while (0)
93#define nilfs_set_opt(sbi, opt) \
94 do { (sbi)->s_mount_opt |= NILFS_MOUNT_##opt; } while (0)
95#define nilfs_test_opt(sbi, opt) ((sbi)->s_mount_opt & NILFS_MOUNT_##opt)
96#define nilfs_write_opt(sbi, mask, opt) \
97 do { (sbi)->s_mount_opt = \
98 (((sbi)->s_mount_opt & ~NILFS_MOUNT_##mask) | \
99 NILFS_MOUNT_##opt); \
100 } while (0)
101
102#endif /* _NILFS_SB */
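For illustration, the helpers above expand their `opt` argument against the NILFS_MOUNT_* constants. A hypothetical call site follows; the ERRORS_PANIC, ERROR_MODE and ERRORS_RO flag names are assumed to be provided by <linux/nilfs2_fs.h>:

/* Illustrative sketch: degrade the error behaviour of a mount to
 * read-only unless panic-on-error was requested.  The flag names are
 * assumptions, not taken from this patch. */
static void example_degrade_to_readonly(struct nilfs_sb_info *sbi)
{
	if (nilfs_test_opt(sbi, ERRORS_PANIC))
		return;
	nilfs_write_opt(sbi, ERROR_MODE, ERRORS_RO);
}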
diff --git a/fs/nilfs2/segbuf.c b/fs/nilfs2/segbuf.c
new file mode 100644
index 000000000000..1e68821b4a9b
--- /dev/null
+++ b/fs/nilfs2/segbuf.c
@@ -0,0 +1,439 @@
1/*
2 * segbuf.c - NILFS segment buffer
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#include <linux/buffer_head.h>
25#include <linux/writeback.h>
26#include <linux/crc32.h>
27#include "page.h"
28#include "segbuf.h"
29#include "seglist.h"
30
31
32static struct kmem_cache *nilfs_segbuf_cachep;
33
34static void nilfs_segbuf_init_once(void *obj)
35{
36 memset(obj, 0, sizeof(struct nilfs_segment_buffer));
37}
38
39int __init nilfs_init_segbuf_cache(void)
40{
41 nilfs_segbuf_cachep =
42 kmem_cache_create("nilfs2_segbuf_cache",
43 sizeof(struct nilfs_segment_buffer),
44 0, SLAB_RECLAIM_ACCOUNT,
45 nilfs_segbuf_init_once);
46
47 return (nilfs_segbuf_cachep == NULL) ? -ENOMEM : 0;
48}
49
50void nilfs_destroy_segbuf_cache(void)
51{
52 kmem_cache_destroy(nilfs_segbuf_cachep);
53}
54
55struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *sb)
56{
57 struct nilfs_segment_buffer *segbuf;
58
59 segbuf = kmem_cache_alloc(nilfs_segbuf_cachep, GFP_NOFS);
60 if (unlikely(!segbuf))
61 return NULL;
62
63 segbuf->sb_super = sb;
64 INIT_LIST_HEAD(&segbuf->sb_list);
65 INIT_LIST_HEAD(&segbuf->sb_segsum_buffers);
66 INIT_LIST_HEAD(&segbuf->sb_payload_buffers);
67 return segbuf;
68}
69
70void nilfs_segbuf_free(struct nilfs_segment_buffer *segbuf)
71{
72 kmem_cache_free(nilfs_segbuf_cachep, segbuf);
73}
74
75void nilfs_segbuf_map(struct nilfs_segment_buffer *segbuf, __u64 segnum,
76 unsigned long offset, struct the_nilfs *nilfs)
77{
78 segbuf->sb_segnum = segnum;
79 nilfs_get_segment_range(nilfs, segnum, &segbuf->sb_fseg_start,
80 &segbuf->sb_fseg_end);
81
82 segbuf->sb_pseg_start = segbuf->sb_fseg_start + offset;
83 segbuf->sb_rest_blocks =
84 segbuf->sb_fseg_end - segbuf->sb_pseg_start + 1;
85}
86
87void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *segbuf,
88 __u64 nextnum, struct the_nilfs *nilfs)
89{
90 segbuf->sb_nextnum = nextnum;
91 segbuf->sb_sum.next = nilfs_get_segment_start_blocknr(nilfs, nextnum);
92}
93
94int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *segbuf)
95{
96 struct buffer_head *bh;
97
98 bh = sb_getblk(segbuf->sb_super,
99 segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk);
100 if (unlikely(!bh))
101 return -ENOMEM;
102
103 nilfs_segbuf_add_segsum_buffer(segbuf, bh);
104 return 0;
105}
106
107int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *segbuf,
108 struct buffer_head **bhp)
109{
110 struct buffer_head *bh;
111
112 bh = sb_getblk(segbuf->sb_super,
113 segbuf->sb_pseg_start + segbuf->sb_sum.nblocks);
114 if (unlikely(!bh))
115 return -ENOMEM;
116
117 nilfs_segbuf_add_payload_buffer(segbuf, bh);
118 *bhp = bh;
119 return 0;
120}
121
122int nilfs_segbuf_reset(struct nilfs_segment_buffer *segbuf, unsigned flags,
123 time_t ctime)
124{
125 int err;
126
127 segbuf->sb_sum.nblocks = segbuf->sb_sum.nsumblk = 0;
128 err = nilfs_segbuf_extend_segsum(segbuf);
129 if (unlikely(err))
130 return err;
131
132 segbuf->sb_sum.flags = flags;
133 segbuf->sb_sum.sumbytes = sizeof(struct nilfs_segment_summary);
134 segbuf->sb_sum.nfinfo = segbuf->sb_sum.nfileblk = 0;
135 segbuf->sb_sum.ctime = ctime;
136
137 segbuf->sb_io_error = 0;
138 return 0;
139}
140
141/*
 142 * Set up the segment summary
143 */
144void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *segbuf)
145{
146 struct nilfs_segment_summary *raw_sum;
147 struct buffer_head *bh_sum;
148
149 bh_sum = list_entry(segbuf->sb_segsum_buffers.next,
150 struct buffer_head, b_assoc_buffers);
151 raw_sum = (struct nilfs_segment_summary *)bh_sum->b_data;
152
153 raw_sum->ss_magic = cpu_to_le32(NILFS_SEGSUM_MAGIC);
154 raw_sum->ss_bytes = cpu_to_le16(sizeof(*raw_sum));
155 raw_sum->ss_flags = cpu_to_le16(segbuf->sb_sum.flags);
156 raw_sum->ss_seq = cpu_to_le64(segbuf->sb_sum.seg_seq);
157 raw_sum->ss_create = cpu_to_le64(segbuf->sb_sum.ctime);
158 raw_sum->ss_next = cpu_to_le64(segbuf->sb_sum.next);
159 raw_sum->ss_nblocks = cpu_to_le32(segbuf->sb_sum.nblocks);
160 raw_sum->ss_nfinfo = cpu_to_le32(segbuf->sb_sum.nfinfo);
161 raw_sum->ss_sumbytes = cpu_to_le32(segbuf->sb_sum.sumbytes);
162 raw_sum->ss_pad = 0;
163}
164
165/*
166 * CRC calculation routines
167 */
168void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *segbuf,
169 u32 seed)
170{
171 struct buffer_head *bh;
172 struct nilfs_segment_summary *raw_sum;
173 unsigned long size, bytes = segbuf->sb_sum.sumbytes;
174 u32 crc;
175
176 bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head,
177 b_assoc_buffers);
178
179 raw_sum = (struct nilfs_segment_summary *)bh->b_data;
180 size = min_t(unsigned long, bytes, bh->b_size);
181 crc = crc32_le(seed,
182 (unsigned char *)raw_sum +
183 sizeof(raw_sum->ss_datasum) + sizeof(raw_sum->ss_sumsum),
184 size - (sizeof(raw_sum->ss_datasum) +
185 sizeof(raw_sum->ss_sumsum)));
186
187 list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers,
188 b_assoc_buffers) {
189 bytes -= size;
190 size = min_t(unsigned long, bytes, bh->b_size);
191 crc = crc32_le(crc, bh->b_data, size);
192 }
193 raw_sum->ss_sumsum = cpu_to_le32(crc);
194}
195
196void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *segbuf,
197 u32 seed)
198{
199 struct buffer_head *bh;
200 struct nilfs_segment_summary *raw_sum;
201 void *kaddr;
202 u32 crc;
203
204 bh = list_entry(segbuf->sb_segsum_buffers.next, struct buffer_head,
205 b_assoc_buffers);
206 raw_sum = (struct nilfs_segment_summary *)bh->b_data;
207 crc = crc32_le(seed,
208 (unsigned char *)raw_sum + sizeof(raw_sum->ss_datasum),
209 bh->b_size - sizeof(raw_sum->ss_datasum));
210
211 list_for_each_entry_continue(bh, &segbuf->sb_segsum_buffers,
212 b_assoc_buffers) {
213 crc = crc32_le(crc, bh->b_data, bh->b_size);
214 }
215 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
216 kaddr = kmap_atomic(bh->b_page, KM_USER0);
217 crc = crc32_le(crc, kaddr + bh_offset(bh), bh->b_size);
218 kunmap_atomic(kaddr, KM_USER0);
219 }
220 raw_sum->ss_datasum = cpu_to_le32(crc);
221}
222
223void nilfs_release_buffers(struct list_head *list)
224{
225 struct buffer_head *bh, *n;
226
227 list_for_each_entry_safe(bh, n, list, b_assoc_buffers) {
228 list_del_init(&bh->b_assoc_buffers);
229 if (buffer_nilfs_allocated(bh)) {
230 struct page *clone_page = bh->b_page;
231
232 /* remove clone page */
233 brelse(bh);
234 page_cache_release(clone_page); /* for each bh */
235 if (page_count(clone_page) <= 2) {
236 lock_page(clone_page);
237 nilfs_free_private_page(clone_page);
238 }
239 continue;
240 }
241 brelse(bh);
242 }
243}
244
245/*
246 * BIO operations
247 */
248static void nilfs_end_bio_write(struct bio *bio, int err)
249{
250 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
251 struct nilfs_write_info *wi = bio->bi_private;
252
253 if (err == -EOPNOTSUPP) {
254 set_bit(BIO_EOPNOTSUPP, &bio->bi_flags);
255 bio_put(bio);
 256 /* to be detected by nilfs_submit_seg_bio() */
257 }
258
259 if (!uptodate)
260 atomic_inc(&wi->err);
261
262 bio_put(bio);
263 complete(&wi->bio_event);
264}
265
266static int nilfs_submit_seg_bio(struct nilfs_write_info *wi, int mode)
267{
268 struct bio *bio = wi->bio;
269 int err;
270
271 if (wi->nbio > 0 && bdi_write_congested(wi->bdi)) {
272 wait_for_completion(&wi->bio_event);
273 wi->nbio--;
274 if (unlikely(atomic_read(&wi->err))) {
275 bio_put(bio);
276 err = -EIO;
277 goto failed;
278 }
279 }
280
281 bio->bi_end_io = nilfs_end_bio_write;
282 bio->bi_private = wi;
283 bio_get(bio);
284 submit_bio(mode, bio);
285 if (bio_flagged(bio, BIO_EOPNOTSUPP)) {
286 bio_put(bio);
287 err = -EOPNOTSUPP;
288 goto failed;
289 }
290 wi->nbio++;
291 bio_put(bio);
292
293 wi->bio = NULL;
294 wi->rest_blocks -= wi->end - wi->start;
295 wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
296 wi->start = wi->end;
297 return 0;
298
299 failed:
300 wi->bio = NULL;
301 return err;
302}
303
304/**
 305 * nilfs_alloc_seg_bio - allocate a bio for writing a segment.
306 * @sb: super block
307 * @start: beginning disk block number of this BIO.
308 * @nr_vecs: request size of page vector.
309 *
 310 * nilfs_alloc_seg_bio() allocates a new BIO structure and initializes it.
311 *
 312 * Return Value: On success, a pointer to the struct bio is returned.
313 * On error, NULL is returned.
314 */
315static struct bio *nilfs_alloc_seg_bio(struct super_block *sb, sector_t start,
316 int nr_vecs)
317{
318 struct bio *bio;
319
320 bio = bio_alloc(GFP_NOWAIT, nr_vecs);
321 if (bio == NULL) {
322 while (!bio && (nr_vecs >>= 1))
323 bio = bio_alloc(GFP_NOWAIT, nr_vecs);
324 }
325 if (likely(bio)) {
326 bio->bi_bdev = sb->s_bdev;
327 bio->bi_sector = (sector_t)start << (sb->s_blocksize_bits - 9);
328 }
329 return bio;
330}
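nilfs_alloc_seg_bio() retries bio_alloc() with a halved vector count whenever the GFP_NOWAIT allocation fails, trading request size for success. A hypothetical generic sketch of the same shrinking-retry pattern:

/* Illustrative sketch: retry a non-blocking allocation with
 * progressively smaller sizes until it succeeds or reaches zero. */
static void *example_alloc_shrinking(size_t nbytes)
{
	void *p = NULL;

	while (!p && nbytes) {
		p = kmalloc(nbytes, GFP_NOWAIT);
		if (!p)
			nbytes >>= 1;
	}
	return p;
}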
331
332void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *segbuf,
333 struct nilfs_write_info *wi)
334{
335 wi->bio = NULL;
336 wi->rest_blocks = segbuf->sb_sum.nblocks;
337 wi->max_pages = bio_get_nr_vecs(wi->sb->s_bdev);
338 wi->nr_vecs = min(wi->max_pages, wi->rest_blocks);
339 wi->start = wi->end = 0;
340 wi->nbio = 0;
341 wi->blocknr = segbuf->sb_pseg_start;
342
343 atomic_set(&wi->err, 0);
344 init_completion(&wi->bio_event);
345}
346
347static int nilfs_submit_bh(struct nilfs_write_info *wi, struct buffer_head *bh,
348 int mode)
349{
350 int len, err;
351
352 BUG_ON(wi->nr_vecs <= 0);
353 repeat:
354 if (!wi->bio) {
355 wi->bio = nilfs_alloc_seg_bio(wi->sb, wi->blocknr + wi->end,
356 wi->nr_vecs);
357 if (unlikely(!wi->bio))
358 return -ENOMEM;
359 }
360
361 len = bio_add_page(wi->bio, bh->b_page, bh->b_size, bh_offset(bh));
362 if (len == bh->b_size) {
363 wi->end++;
364 return 0;
365 }
366 /* bio is FULL */
367 err = nilfs_submit_seg_bio(wi, mode);
368 /* never submit current bh */
369 if (likely(!err))
370 goto repeat;
371 return err;
372}
373
374int nilfs_segbuf_write(struct nilfs_segment_buffer *segbuf,
375 struct nilfs_write_info *wi)
376{
377 struct buffer_head *bh;
378 int res, rw = WRITE;
379
380 list_for_each_entry(bh, &segbuf->sb_segsum_buffers, b_assoc_buffers) {
381 res = nilfs_submit_bh(wi, bh, rw);
382 if (unlikely(res))
383 goto failed_bio;
384 }
385
386 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
387 res = nilfs_submit_bh(wi, bh, rw);
388 if (unlikely(res))
389 goto failed_bio;
390 }
391
392 if (wi->bio) {
393 /*
394 * Last BIO is always sent through the following
395 * submission.
396 */
397 rw |= (1 << BIO_RW_SYNCIO);
398 res = nilfs_submit_seg_bio(wi, rw);
399 if (unlikely(res))
400 goto failed_bio;
401 }
402
403 res = 0;
404 out:
405 return res;
406
407 failed_bio:
408 atomic_inc(&wi->err);
409 goto out;
410}
411
412/**
413 * nilfs_segbuf_wait - wait for completion of requested BIOs
414 * @wi: nilfs_write_info
415 *
 416 * Return Value: On success, 0 is returned. On error, one of the following
 417 * negative error codes is returned.
418 *
419 * %-EIO - I/O error
420 */
421int nilfs_segbuf_wait(struct nilfs_segment_buffer *segbuf,
422 struct nilfs_write_info *wi)
423{
424 int err = 0;
425
426 if (!wi->nbio)
427 return 0;
428
429 do {
430 wait_for_completion(&wi->bio_event);
431 } while (--wi->nbio > 0);
432
433 if (unlikely(atomic_read(&wi->err) > 0)) {
434 printk(KERN_ERR "NILFS: IO error writing segment\n");
435 err = -EIO;
436 segbuf->sb_io_error = 1;
437 }
438 return err;
439}
diff --git a/fs/nilfs2/segbuf.h b/fs/nilfs2/segbuf.h
new file mode 100644
index 000000000000..0c3076f4e592
--- /dev/null
+++ b/fs/nilfs2/segbuf.h
@@ -0,0 +1,201 @@
1/*
2 * segbuf.h - NILFS Segment buffer prototypes and definitions
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23#ifndef _NILFS_SEGBUF_H
24#define _NILFS_SEGBUF_H
25
26#include <linux/fs.h>
27#include <linux/buffer_head.h>
28#include <linux/bio.h>
29#include <linux/completion.h>
30#include <linux/backing-dev.h>
31
32/**
33 * struct nilfs_segsum_info - On-memory segment summary
34 * @flags: Flags
35 * @nfinfo: Number of file information structures
36 * @nblocks: Number of blocks included in the partial segment
37 * @nsumblk: Number of summary blocks
38 * @sumbytes: Byte count of segment summary
39 * @nfileblk: Total number of file blocks
40 * @seg_seq: Segment sequence number
41 * @ctime: Creation time
42 * @next: Block number of the next full segment
43 */
44struct nilfs_segsum_info {
45 unsigned int flags;
46 unsigned long nfinfo;
47 unsigned long nblocks;
48 unsigned long nsumblk;
49 unsigned long sumbytes;
50 unsigned long nfileblk;
51 u64 seg_seq;
52 time_t ctime;
53 sector_t next;
54};
55
56/* macro for the flags */
57#define NILFS_SEG_HAS_SR(sum) ((sum)->flags & NILFS_SS_SR)
58#define NILFS_SEG_LOGBGN(sum) ((sum)->flags & NILFS_SS_LOGBGN)
59#define NILFS_SEG_LOGEND(sum) ((sum)->flags & NILFS_SS_LOGEND)
60#define NILFS_SEG_DSYNC(sum) ((sum)->flags & NILFS_SS_SYNDT)
61#define NILFS_SEG_SIMPLEX(sum) \
62 (((sum)->flags & (NILFS_SS_LOGBGN | NILFS_SS_LOGEND)) == \
63 (NILFS_SS_LOGBGN | NILFS_SS_LOGEND))
64
65#define NILFS_SEG_EMPTY(sum) ((sum)->nblocks == (sum)->nsumblk)
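A hypothetical predicate composed from the flag macros above: a partial segment that both begins and ends a log (SIMPLEX) and carries a super root can be treated, for illustration, as a self-contained checkpoint candidate:

/* Illustrative sketch: classify a partial segment from its on-memory
 * summary using the macros above. */
static inline int example_pseg_is_self_contained(
	const struct nilfs_segsum_info *sum)
{
	return NILFS_SEG_SIMPLEX(sum) && NILFS_SEG_HAS_SR(sum);
}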
66
67/**
68 * struct nilfs_segment_buffer - Segment buffer
69 * @sb_super: back pointer to a superblock struct
70 * @sb_list: List head to chain this structure
71 * @sb_sum: On-memory segment summary
72 * @sb_segnum: Index number of the full segment
73 * @sb_nextnum: Index number of the next full segment
74 * @sb_fseg_start: Start block number of the full segment
75 * @sb_fseg_end: End block number of the full segment
76 * @sb_pseg_start: Disk block number of partial segment
77 * @sb_rest_blocks: Number of residual blocks in the current segment
78 * @sb_segsum_buffers: List of buffers for segment summaries
79 * @sb_payload_buffers: List of buffers for segment payload
80 * @sb_io_error: I/O error status
81 */
82struct nilfs_segment_buffer {
83 struct super_block *sb_super;
84 struct list_head sb_list;
85
86 /* Segment information */
87 struct nilfs_segsum_info sb_sum;
88 __u64 sb_segnum;
89 __u64 sb_nextnum;
90 sector_t sb_fseg_start, sb_fseg_end;
91 sector_t sb_pseg_start;
92 unsigned sb_rest_blocks;
93
94 /* Buffers */
95 struct list_head sb_segsum_buffers;
96 struct list_head sb_payload_buffers; /* including super root */
97
98 /* io status */
99 int sb_io_error;
100};
101
102#define NILFS_LIST_SEGBUF(head) \
103 list_entry((head), struct nilfs_segment_buffer, sb_list)
104#define NILFS_NEXT_SEGBUF(segbuf) NILFS_LIST_SEGBUF((segbuf)->sb_list.next)
105#define NILFS_PREV_SEGBUF(segbuf) NILFS_LIST_SEGBUF((segbuf)->sb_list.prev)
106#define NILFS_LAST_SEGBUF(head) NILFS_LIST_SEGBUF((head)->prev)
107#define NILFS_FIRST_SEGBUF(head) NILFS_LIST_SEGBUF((head)->next)
108#define NILFS_SEGBUF_IS_LAST(segbuf, head) ((segbuf)->sb_list.next == (head))
109
110#define nilfs_for_each_segbuf_before(s, t, h) \
111 for ((s) = NILFS_FIRST_SEGBUF(h); (s) != (t); \
112 (s) = NILFS_NEXT_SEGBUF(s))
113
114#define NILFS_SEGBUF_FIRST_BH(head) \
115 (list_entry((head)->next, struct buffer_head, b_assoc_buffers))
116#define NILFS_SEGBUF_NEXT_BH(bh) \
117 (list_entry((bh)->b_assoc_buffers.next, struct buffer_head, \
118 b_assoc_buffers))
119#define NILFS_SEGBUF_BH_IS_LAST(bh, head) ((bh)->b_assoc_buffers.next == head)
120
121
122int __init nilfs_init_segbuf_cache(void);
123void nilfs_destroy_segbuf_cache(void);
124struct nilfs_segment_buffer *nilfs_segbuf_new(struct super_block *);
125void nilfs_segbuf_free(struct nilfs_segment_buffer *);
126void nilfs_segbuf_map(struct nilfs_segment_buffer *, __u64, unsigned long,
127 struct the_nilfs *);
128void nilfs_segbuf_set_next_segnum(struct nilfs_segment_buffer *, __u64,
129 struct the_nilfs *);
130int nilfs_segbuf_reset(struct nilfs_segment_buffer *, unsigned, time_t);
131int nilfs_segbuf_extend_segsum(struct nilfs_segment_buffer *);
132int nilfs_segbuf_extend_payload(struct nilfs_segment_buffer *,
133 struct buffer_head **);
134void nilfs_segbuf_fill_in_segsum(struct nilfs_segment_buffer *);
135void nilfs_segbuf_fill_in_segsum_crc(struct nilfs_segment_buffer *, u32);
136void nilfs_segbuf_fill_in_data_crc(struct nilfs_segment_buffer *, u32);
137
138static inline void
139nilfs_segbuf_add_segsum_buffer(struct nilfs_segment_buffer *segbuf,
140 struct buffer_head *bh)
141{
142 list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_segsum_buffers);
143 segbuf->sb_sum.nblocks++;
144 segbuf->sb_sum.nsumblk++;
145}
146
147static inline void
148nilfs_segbuf_add_payload_buffer(struct nilfs_segment_buffer *segbuf,
149 struct buffer_head *bh)
150{
151 list_add_tail(&bh->b_assoc_buffers, &segbuf->sb_payload_buffers);
152 segbuf->sb_sum.nblocks++;
153}
154
155static inline void
156nilfs_segbuf_add_file_buffer(struct nilfs_segment_buffer *segbuf,
157 struct buffer_head *bh)
158{
159 get_bh(bh);
160 nilfs_segbuf_add_payload_buffer(segbuf, bh);
161 segbuf->sb_sum.nfileblk++;
162}
163
164void nilfs_release_buffers(struct list_head *);
165
166static inline void nilfs_segbuf_clear(struct nilfs_segment_buffer *segbuf)
167{
168 nilfs_release_buffers(&segbuf->sb_segsum_buffers);
169 nilfs_release_buffers(&segbuf->sb_payload_buffers);
170}
171
172struct nilfs_write_info {
173 struct bio *bio;
174 int start, end; /* The region to be submitted */
175 int rest_blocks;
176 int max_pages;
177 int nr_vecs;
178 sector_t blocknr;
179
180 int nbio;
181 atomic_t err;
182 struct completion bio_event;
183 /* completion event of segment write */
184
185 /*
186 * The following fields must be set explicitly
187 */
188 struct super_block *sb;
189 struct backing_dev_info *bdi; /* backing dev info */
190 struct buffer_head *bh_sr;
191};
192
193
194void nilfs_segbuf_prepare_write(struct nilfs_segment_buffer *,
195 struct nilfs_write_info *);
196int nilfs_segbuf_write(struct nilfs_segment_buffer *,
197 struct nilfs_write_info *);
198int nilfs_segbuf_wait(struct nilfs_segment_buffer *,
199 struct nilfs_write_info *);
200
201#endif /* _NILFS_SEGBUF_H */
diff --git a/fs/nilfs2/seglist.h b/fs/nilfs2/seglist.h
new file mode 100644
index 000000000000..d39df9144e99
--- /dev/null
+++ b/fs/nilfs2/seglist.h
@@ -0,0 +1,85 @@
1/*
 2 * seglist.h - expedient structure and routines to handle lists of segments
 3 * (to be removed in a future release)
4 *
5 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
20 *
21 * Written by Ryusuke Konishi <ryusuke@osrg.net>
22 *
23 */
24#ifndef _NILFS_SEGLIST_H
25#define _NILFS_SEGLIST_H
26
27#include <linux/fs.h>
28#include <linux/buffer_head.h>
29#include <linux/nilfs2_fs.h>
30#include "sufile.h"
31
32struct nilfs_segment_entry {
33 __u64 segnum;
34
35#define NILFS_SLH_FREED 0x0001 /* The segment was freed provisionally.
36 It must be cancelled if
37 construction is aborted */
38
39 unsigned flags;
40 struct list_head list;
41 struct buffer_head *bh_su;
42 struct nilfs_segment_usage *raw_su;
43};
44
45
46void nilfs_dispose_segment_list(struct list_head *);
47
48static inline struct nilfs_segment_entry *
49nilfs_alloc_segment_entry(__u64 segnum)
50{
51 struct nilfs_segment_entry *ent = kmalloc(sizeof(*ent), GFP_NOFS);
52
53 if (likely(ent)) {
54 ent->segnum = segnum;
55 ent->flags = 0;
56 ent->bh_su = NULL;
57 ent->raw_su = NULL;
58 INIT_LIST_HEAD(&ent->list);
59 }
60 return ent;
61}
62
63static inline int nilfs_open_segment_entry(struct nilfs_segment_entry *ent,
64 struct inode *sufile)
65{
66 return nilfs_sufile_get_segment_usage(sufile, ent->segnum,
67 &ent->raw_su, &ent->bh_su);
68}
69
70static inline void nilfs_close_segment_entry(struct nilfs_segment_entry *ent,
71 struct inode *sufile)
72{
73 if (!ent->bh_su)
74 return;
75 nilfs_sufile_put_segment_usage(sufile, ent->segnum, ent->bh_su);
76 ent->bh_su = NULL;
77 ent->raw_su = NULL;
78}
79
80static inline void nilfs_free_segment_entry(struct nilfs_segment_entry *ent)
81{
82 kfree(ent);
83}
84
85#endif /* _NILFS_SEGLIST_H */
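For illustration, the expected lifecycle of a segment entry is alloc, open, mutate the raw usage, close, free; recovery.c above follows the same pattern. A minimal hypothetical sketch (error handling trimmed):

/* Illustrative sketch: dirty the usage record of one segment through
 * the helpers above. */
static int example_dirty_segment_usage(struct inode *sufile, __u64 segnum)
{
	struct nilfs_segment_entry *ent = nilfs_alloc_segment_entry(segnum);
	int err;

	if (!ent)
		return -ENOMEM;
	err = nilfs_open_segment_entry(ent, sufile);
	if (!err) {
		nilfs_segment_usage_set_dirty(ent->raw_su);
		nilfs_close_segment_entry(ent, sufile);
	}
	nilfs_free_segment_entry(ent);
	return err;
}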
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
new file mode 100644
index 000000000000..fb70ec3be20e
--- /dev/null
+++ b/fs/nilfs2/segment.c
@@ -0,0 +1,2977 @@
1/*
2 * segment.c - NILFS segment constructor.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#include <linux/pagemap.h>
25#include <linux/buffer_head.h>
26#include <linux/writeback.h>
27#include <linux/bio.h>
28#include <linux/completion.h>
29#include <linux/blkdev.h>
30#include <linux/backing-dev.h>
31#include <linux/freezer.h>
32#include <linux/kthread.h>
33#include <linux/crc32.h>
34#include <linux/pagevec.h>
35#include "nilfs.h"
36#include "btnode.h"
37#include "page.h"
38#include "segment.h"
39#include "sufile.h"
40#include "cpfile.h"
41#include "ifile.h"
42#include "seglist.h"
43#include "segbuf.h"
44
45
46/*
47 * Segment constructor
48 */
49#define SC_N_INODEVEC 16 /* Size of locally allocated inode vector */
50
51#define SC_MAX_SEGDELTA 64 /* Upper limit of the number of segments
 52 appended in the collection retry loop */
53
54/* Construction mode */
55enum {
56 SC_LSEG_SR = 1, /* Make a logical segment having a super root */
57 SC_LSEG_DSYNC, /* Flush data blocks of a given file and make
58 a logical segment without a super root */
 59 SC_FLUSH_FILE, /* Flush data files; this leads to segment writes
 60 without creating a checkpoint */
61 SC_FLUSH_DAT, /* Flush DAT file. This also creates segments without
62 a checkpoint */
63};
64
65/* Stage numbers of dirty block collection */
66enum {
67 NILFS_ST_INIT = 0,
68 NILFS_ST_GC, /* Collecting dirty blocks for GC */
69 NILFS_ST_FILE,
70 NILFS_ST_IFILE,
71 NILFS_ST_CPFILE,
72 NILFS_ST_SUFILE,
73 NILFS_ST_DAT,
74 NILFS_ST_SR, /* Super root */
75 NILFS_ST_DSYNC, /* Data sync blocks */
76 NILFS_ST_DONE,
77};
78
79/* State flags of collection */
80#define NILFS_CF_NODE 0x0001 /* Collecting node blocks */
81#define NILFS_CF_IFILE_STARTED 0x0002 /* IFILE stage has started */
82#define NILFS_CF_HISTORY_MASK (NILFS_CF_IFILE_STARTED)
83
84/* Operations depending on the construction mode and file type */
85struct nilfs_sc_operations {
86 int (*collect_data)(struct nilfs_sc_info *, struct buffer_head *,
87 struct inode *);
88 int (*collect_node)(struct nilfs_sc_info *, struct buffer_head *,
89 struct inode *);
90 int (*collect_bmap)(struct nilfs_sc_info *, struct buffer_head *,
91 struct inode *);
92 void (*write_data_binfo)(struct nilfs_sc_info *,
93 struct nilfs_segsum_pointer *,
94 union nilfs_binfo *);
95 void (*write_node_binfo)(struct nilfs_sc_info *,
96 struct nilfs_segsum_pointer *,
97 union nilfs_binfo *);
98};
99
100/*
101 * Other definitions
102 */
103static void nilfs_segctor_start_timer(struct nilfs_sc_info *);
104static void nilfs_segctor_do_flush(struct nilfs_sc_info *, int);
105static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *);
106static void nilfs_dispose_list(struct nilfs_sb_info *, struct list_head *,
107 int);
108
109#define nilfs_cnt32_gt(a, b) \
110 (typecheck(__u32, a) && typecheck(__u32, b) && \
111 ((__s32)(b) - (__s32)(a) < 0))
112#define nilfs_cnt32_ge(a, b) \
113 (typecheck(__u32, a) && typecheck(__u32, b) && \
114 ((__s32)(a) - (__s32)(b) >= 0))
115#define nilfs_cnt32_lt(a, b) nilfs_cnt32_gt(b, a)
116#define nilfs_cnt32_le(a, b) nilfs_cnt32_ge(b, a)
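/*
 * These macros follow the time_after()-style idiom: the subtraction is
 * evaluated as a signed 32-bit value, so the comparisons remain correct
 * across counter wraparound. A worked example (values chosen for
 * illustration):
 *
 *	nilfs_cnt32_gt(5, 0xfffffffe) is true, because
 *	(__s32)0xfffffffe - (__s32)5 == -7 < 0; the counter value 5 is
 *	treated as coming after 0xfffffffe across the 32-bit wrap.
 */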
117
118/*
119 * Transaction
120 */
121static struct kmem_cache *nilfs_transaction_cachep;
122
123/**
124 * nilfs_init_transaction_cache - create a cache for nilfs_transaction_info
125 *
126 * nilfs_init_transaction_cache() creates a slab cache for the struct
127 * nilfs_transaction_info.
128 *
129 * Return Value: On success, it returns 0. On error, one of the following
 130 * negative error codes is returned.
131 *
132 * %-ENOMEM - Insufficient memory available.
133 */
134int nilfs_init_transaction_cache(void)
135{
136 nilfs_transaction_cachep =
137 kmem_cache_create("nilfs2_transaction_cache",
138 sizeof(struct nilfs_transaction_info),
139 0, SLAB_RECLAIM_ACCOUNT, NULL);
140 return (nilfs_transaction_cachep == NULL) ? -ENOMEM : 0;
141}
142
143/**
 144 * nilfs_destroy_transaction_cache - destroy the cache for transaction info
145 *
146 * nilfs_destroy_transaction_cache() frees the slab cache for the struct
147 * nilfs_transaction_info.
148 */
149void nilfs_destroy_transaction_cache(void)
150{
151 kmem_cache_destroy(nilfs_transaction_cachep);
152}
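/*
 * A sketch of how the two cache routines above pair up at module
 * init/exit time (illustrative call sites only; the actual hookup is
 * done elsewhere in this patch):
 *
 *	static int __init example_init(void)
 *	{
 *		int err = nilfs_init_transaction_cache();
 *
 *		if (err)
 *			return err;
 *		...
 *		return 0;
 *	}
 *
 *	static void __exit example_exit(void)
 *	{
 *		...
 *		nilfs_destroy_transaction_cache();
 *	}
 */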
153
154static int nilfs_prepare_segment_lock(struct nilfs_transaction_info *ti)
155{
156 struct nilfs_transaction_info *cur_ti = current->journal_info;
157 void *save = NULL;
158
159 if (cur_ti) {
160 if (cur_ti->ti_magic == NILFS_TI_MAGIC)
161 return ++cur_ti->ti_count;
162 else {
163 /*
 164 * If the journal_info field is occupied by another FS,
165 * it is saved and will be restored on
166 * nilfs_transaction_commit().
167 */
168 printk(KERN_WARNING
169 "NILFS warning: journal info from a different "
170 "FS\n");
171 save = current->journal_info;
172 }
173 }
174 if (!ti) {
175 ti = kmem_cache_alloc(nilfs_transaction_cachep, GFP_NOFS);
176 if (!ti)
177 return -ENOMEM;
178 ti->ti_flags = NILFS_TI_DYNAMIC_ALLOC;
179 } else {
180 ti->ti_flags = 0;
181 }
182 ti->ti_count = 0;
183 ti->ti_save = save;
184 ti->ti_magic = NILFS_TI_MAGIC;
185 current->journal_info = ti;
186 return 0;
187}
188
189/**
190 * nilfs_transaction_begin - start indivisible file operations.
191 * @sb: super block
192 * @ti: nilfs_transaction_info
193 * @vacancy_check: flags for vacancy rate checks
194 *
195 * nilfs_transaction_begin() acquires a reader/writer semaphore, called
 196 * the segment semaphore, to make segment construction and write tasks
 197 * exclusive. The function is used in pairs with nilfs_transaction_commit().
 198 * The regions enclosed by these two functions can be nested. To avoid
 199 * deadlock, the semaphore is only acquired or released in the outermost call.
200 *
 201 * This function allocates a nilfs_transaction_info struct to keep context
 202 * information. It is initialized and hooked onto the current task in
 203 * the outermost call. If a pre-allocated struct is given to @ti, it is used
 204 * instead; otherwise a new struct is allocated from the slab cache.
205 *
 206 * When the @vacancy_check flag is set, this function checks the amount of
 207 * free space and waits for the GC to reclaim disk space if capacity is low.
208 *
209 * Return Value: On success, 0 is returned. On error, one of the following
 210 * negative error codes is returned.
211 *
212 * %-ENOMEM - Insufficient memory available.
213 *
 214 * %-ENOSPC - No space left on device.
215 */
216int nilfs_transaction_begin(struct super_block *sb,
217 struct nilfs_transaction_info *ti,
218 int vacancy_check)
219{
220 struct nilfs_sb_info *sbi;
221 struct the_nilfs *nilfs;
222 int ret = nilfs_prepare_segment_lock(ti);
223
224 if (unlikely(ret < 0))
225 return ret;
226 if (ret > 0)
227 return 0;
228
229 sbi = NILFS_SB(sb);
230 nilfs = sbi->s_nilfs;
231 down_read(&nilfs->ns_segctor_sem);
232 if (vacancy_check && nilfs_near_disk_full(nilfs)) {
233 up_read(&nilfs->ns_segctor_sem);
234 ret = -ENOSPC;
235 goto failed;
236 }
237 return 0;
238
239 failed:
240 ti = current->journal_info;
241 current->journal_info = ti->ti_save;
242 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
243 kmem_cache_free(nilfs_transaction_cachep, ti);
244 return ret;
245}
246
247/**
248 * nilfs_transaction_commit - commit indivisible file operations.
249 * @sb: super block
250 *
251 * nilfs_transaction_commit() releases the read semaphore which is
252 * acquired by nilfs_transaction_begin(). This is only performed
 253 * in the outermost call of this function. If the commit flag is set,
254 * nilfs_transaction_commit() sets a timer to start the segment
255 * constructor. If a sync flag is set, it starts construction
256 * directly.
257 */
258int nilfs_transaction_commit(struct super_block *sb)
259{
260 struct nilfs_transaction_info *ti = current->journal_info;
261 struct nilfs_sb_info *sbi;
262 struct nilfs_sc_info *sci;
263 int err = 0;
264
265 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
266 ti->ti_flags |= NILFS_TI_COMMIT;
267 if (ti->ti_count > 0) {
268 ti->ti_count--;
269 return 0;
270 }
271 sbi = NILFS_SB(sb);
272 sci = NILFS_SC(sbi);
273 if (sci != NULL) {
274 if (ti->ti_flags & NILFS_TI_COMMIT)
275 nilfs_segctor_start_timer(sci);
276 if (atomic_read(&sbi->s_nilfs->ns_ndirtyblks) >
277 sci->sc_watermark)
278 nilfs_segctor_do_flush(sci, 0);
279 }
280 up_read(&sbi->s_nilfs->ns_segctor_sem);
281 current->journal_info = ti->ti_save;
282
283 if (ti->ti_flags & NILFS_TI_SYNC)
284 err = nilfs_construct_segment(sb);
285 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
286 kmem_cache_free(nilfs_transaction_cachep, ti);
287 return err;
288}
289
290void nilfs_transaction_abort(struct super_block *sb)
291{
292 struct nilfs_transaction_info *ti = current->journal_info;
293
294 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
295 if (ti->ti_count > 0) {
296 ti->ti_count--;
297 return;
298 }
299 up_read(&NILFS_SB(sb)->s_nilfs->ns_segctor_sem);
300
301 current->journal_info = ti->ti_save;
302 if (ti->ti_flags & NILFS_TI_DYNAMIC_ALLOC)
303 kmem_cache_free(nilfs_transaction_cachep, ti);
304}
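/*
 * A sketch of the canonical caller pattern for the transaction API
 * above, using a hypothetical file operation (do_the_update() is
 * illustrative only):
 *
 *	static int nilfs_example_op(struct inode *inode)
 *	{
 *		struct nilfs_transaction_info ti;
 *		int err;
 *
 *		err = nilfs_transaction_begin(inode->i_sb, &ti, 1);
 *		if (err)
 *			return err;		(-ENOMEM or -ENOSPC)
 *
 *		err = do_the_update(inode);
 *		if (err) {
 *			nilfs_transaction_abort(inode->i_sb);
 *			return err;
 *		}
 *		return nilfs_transaction_commit(inode->i_sb);
 *	}
 */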
305
306void nilfs_relax_pressure_in_lock(struct super_block *sb)
307{
308 struct nilfs_sb_info *sbi = NILFS_SB(sb);
309 struct nilfs_sc_info *sci = NILFS_SC(sbi);
310 struct the_nilfs *nilfs = sbi->s_nilfs;
311
312 if (!sci || !sci->sc_flush_request)
313 return;
314
315 set_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags);
316 up_read(&nilfs->ns_segctor_sem);
317
318 down_write(&nilfs->ns_segctor_sem);
319 if (sci->sc_flush_request &&
320 test_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags)) {
321 struct nilfs_transaction_info *ti = current->journal_info;
322
323 ti->ti_flags |= NILFS_TI_WRITER;
324 nilfs_segctor_do_immediate_flush(sci);
325 ti->ti_flags &= ~NILFS_TI_WRITER;
326 }
327 downgrade_write(&nilfs->ns_segctor_sem);
328}
329
330static void nilfs_transaction_lock(struct nilfs_sb_info *sbi,
331 struct nilfs_transaction_info *ti,
332 int gcflag)
333{
334 struct nilfs_transaction_info *cur_ti = current->journal_info;
335
336 WARN_ON(cur_ti);
337 ti->ti_flags = NILFS_TI_WRITER;
338 ti->ti_count = 0;
339 ti->ti_save = cur_ti;
340 ti->ti_magic = NILFS_TI_MAGIC;
341 INIT_LIST_HEAD(&ti->ti_garbage);
342 current->journal_info = ti;
343
344 for (;;) {
345 down_write(&sbi->s_nilfs->ns_segctor_sem);
346 if (!test_bit(NILFS_SC_PRIOR_FLUSH, &NILFS_SC(sbi)->sc_flags))
347 break;
348
349 nilfs_segctor_do_immediate_flush(NILFS_SC(sbi));
350
351 up_write(&sbi->s_nilfs->ns_segctor_sem);
352 yield();
353 }
354 if (gcflag)
355 ti->ti_flags |= NILFS_TI_GC;
356}
357
358static void nilfs_transaction_unlock(struct nilfs_sb_info *sbi)
359{
360 struct nilfs_transaction_info *ti = current->journal_info;
361
362 BUG_ON(ti == NULL || ti->ti_magic != NILFS_TI_MAGIC);
363 BUG_ON(ti->ti_count > 0);
364
365 up_write(&sbi->s_nilfs->ns_segctor_sem);
366 current->journal_info = ti->ti_save;
367 if (!list_empty(&ti->ti_garbage))
368 nilfs_dispose_list(sbi, &ti->ti_garbage, 0);
369}
370
371static void *nilfs_segctor_map_segsum_entry(struct nilfs_sc_info *sci,
372 struct nilfs_segsum_pointer *ssp,
373 unsigned bytes)
374{
375 struct nilfs_segment_buffer *segbuf = sci->sc_curseg;
376 unsigned blocksize = sci->sc_super->s_blocksize;
377 void *p;
378
379 if (unlikely(ssp->offset + bytes > blocksize)) {
380 ssp->offset = 0;
381 BUG_ON(NILFS_SEGBUF_BH_IS_LAST(ssp->bh,
382 &segbuf->sb_segsum_buffers));
383 ssp->bh = NILFS_SEGBUF_NEXT_BH(ssp->bh);
384 }
385 p = ssp->bh->b_data + ssp->offset;
386 ssp->offset += bytes;
387 return p;
388}
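/*
 * A worked example of the summary cursor above (values assumed for
 * illustration): with a 4096-byte block size, ssp->offset == 4090 and
 * bytes == 8, the entry no longer fits in the current summary block, so
 * the cursor advances to the next summary buffer and the function
 * returns a pointer at offset 0, leaving ssp->offset == 8.
 */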
389
390/**
391 * nilfs_segctor_reset_segment_buffer - reset the current segment buffer
392 * @sci: nilfs_sc_info
393 */
394static int nilfs_segctor_reset_segment_buffer(struct nilfs_sc_info *sci)
395{
396 struct nilfs_segment_buffer *segbuf = sci->sc_curseg;
397 struct buffer_head *sumbh;
398 unsigned sumbytes;
399 unsigned flags = 0;
400 int err;
401
402 if (nilfs_doing_gc())
403 flags = NILFS_SS_GC;
404 err = nilfs_segbuf_reset(segbuf, flags, sci->sc_seg_ctime);
405 if (unlikely(err))
406 return err;
407
408 sumbh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers);
409 sumbytes = segbuf->sb_sum.sumbytes;
410 sci->sc_finfo_ptr.bh = sumbh; sci->sc_finfo_ptr.offset = sumbytes;
411 sci->sc_binfo_ptr.bh = sumbh; sci->sc_binfo_ptr.offset = sumbytes;
412 sci->sc_blk_cnt = sci->sc_datablk_cnt = 0;
413 return 0;
414}
415
416static int nilfs_segctor_feed_segment(struct nilfs_sc_info *sci)
417{
418 sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks;
419 if (NILFS_SEGBUF_IS_LAST(sci->sc_curseg, &sci->sc_segbufs))
420 return -E2BIG; /* The current segment is filled up
421 (internal code) */
422 sci->sc_curseg = NILFS_NEXT_SEGBUF(sci->sc_curseg);
423 return nilfs_segctor_reset_segment_buffer(sci);
424}
425
426static int nilfs_segctor_add_super_root(struct nilfs_sc_info *sci)
427{
428 struct nilfs_segment_buffer *segbuf = sci->sc_curseg;
429 int err;
430
431 if (segbuf->sb_sum.nblocks >= segbuf->sb_rest_blocks) {
432 err = nilfs_segctor_feed_segment(sci);
433 if (err)
434 return err;
435 segbuf = sci->sc_curseg;
436 }
437 err = nilfs_segbuf_extend_payload(segbuf, &sci->sc_super_root);
438 if (likely(!err))
439 segbuf->sb_sum.flags |= NILFS_SS_SR;
440 return err;
441}
442
443/*
444 * Functions for making segment summary and payloads
445 */
446static int nilfs_segctor_segsum_block_required(
447 struct nilfs_sc_info *sci, const struct nilfs_segsum_pointer *ssp,
448 unsigned binfo_size)
449{
450 unsigned blocksize = sci->sc_super->s_blocksize;
 451 /* The sizes of finfo and binfo are small enough relative to blocksize */
452
453 return ssp->offset + binfo_size +
454 (!sci->sc_blk_cnt ? sizeof(struct nilfs_finfo) : 0) >
455 blocksize;
456}
457
458static void nilfs_segctor_begin_finfo(struct nilfs_sc_info *sci,
459 struct inode *inode)
460{
461 sci->sc_curseg->sb_sum.nfinfo++;
462 sci->sc_binfo_ptr = sci->sc_finfo_ptr;
463 nilfs_segctor_map_segsum_entry(
464 sci, &sci->sc_binfo_ptr, sizeof(struct nilfs_finfo));
465
466 if (inode->i_sb && !test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
467 set_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
468 /* skip finfo */
469}
470
471static void nilfs_segctor_end_finfo(struct nilfs_sc_info *sci,
472 struct inode *inode)
473{
474 struct nilfs_finfo *finfo;
475 struct nilfs_inode_info *ii;
476 struct nilfs_segment_buffer *segbuf;
477
478 if (sci->sc_blk_cnt == 0)
479 return;
480
481 ii = NILFS_I(inode);
482 finfo = nilfs_segctor_map_segsum_entry(sci, &sci->sc_finfo_ptr,
483 sizeof(*finfo));
484 finfo->fi_ino = cpu_to_le64(inode->i_ino);
485 finfo->fi_nblocks = cpu_to_le32(sci->sc_blk_cnt);
486 finfo->fi_ndatablk = cpu_to_le32(sci->sc_datablk_cnt);
487 finfo->fi_cno = cpu_to_le64(ii->i_cno);
488
489 segbuf = sci->sc_curseg;
490 segbuf->sb_sum.sumbytes = sci->sc_binfo_ptr.offset +
491 sci->sc_super->s_blocksize * (segbuf->sb_sum.nsumblk - 1);
492 sci->sc_finfo_ptr = sci->sc_binfo_ptr;
493 sci->sc_blk_cnt = sci->sc_datablk_cnt = 0;
494}
495
496static int nilfs_segctor_add_file_block(struct nilfs_sc_info *sci,
497 struct buffer_head *bh,
498 struct inode *inode,
499 unsigned binfo_size)
500{
501 struct nilfs_segment_buffer *segbuf;
502 int required, err = 0;
503
504 retry:
505 segbuf = sci->sc_curseg;
506 required = nilfs_segctor_segsum_block_required(
507 sci, &sci->sc_binfo_ptr, binfo_size);
508 if (segbuf->sb_sum.nblocks + required + 1 > segbuf->sb_rest_blocks) {
509 nilfs_segctor_end_finfo(sci, inode);
510 err = nilfs_segctor_feed_segment(sci);
511 if (err)
512 return err;
513 goto retry;
514 }
515 if (unlikely(required)) {
516 err = nilfs_segbuf_extend_segsum(segbuf);
517 if (unlikely(err))
518 goto failed;
519 }
520 if (sci->sc_blk_cnt == 0)
521 nilfs_segctor_begin_finfo(sci, inode);
522
523 nilfs_segctor_map_segsum_entry(sci, &sci->sc_binfo_ptr, binfo_size);
 524 /* Assignment of the vblocknr is delayed until update_blocknr() */
525 nilfs_segbuf_add_file_buffer(segbuf, bh);
526 sci->sc_blk_cnt++;
527 failed:
528 return err;
529}
530
531static int nilfs_handle_bmap_error(int err, const char *fname,
532 struct inode *inode, struct super_block *sb)
533{
534 if (err == -EINVAL) {
535 nilfs_error(sb, fname, "broken bmap (inode=%lu)\n",
536 inode->i_ino);
537 err = -EIO;
538 }
539 return err;
540}
541
542/*
543 * Callback functions that enumerate, mark, and collect dirty blocks
544 */
545static int nilfs_collect_file_data(struct nilfs_sc_info *sci,
546 struct buffer_head *bh, struct inode *inode)
547{
548 int err;
549
550 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
551 if (unlikely(err < 0))
552 return nilfs_handle_bmap_error(err, __func__, inode,
553 sci->sc_super);
554
555 err = nilfs_segctor_add_file_block(sci, bh, inode,
556 sizeof(struct nilfs_binfo_v));
557 if (!err)
558 sci->sc_datablk_cnt++;
559 return err;
560}
561
562static int nilfs_collect_file_node(struct nilfs_sc_info *sci,
563 struct buffer_head *bh,
564 struct inode *inode)
565{
566 int err;
567
568 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
569 if (unlikely(err < 0))
570 return nilfs_handle_bmap_error(err, __func__, inode,
571 sci->sc_super);
572 return 0;
573}
574
575static int nilfs_collect_file_bmap(struct nilfs_sc_info *sci,
576 struct buffer_head *bh,
577 struct inode *inode)
578{
579 WARN_ON(!buffer_dirty(bh));
580 return nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64));
581}
582
583static void nilfs_write_file_data_binfo(struct nilfs_sc_info *sci,
584 struct nilfs_segsum_pointer *ssp,
585 union nilfs_binfo *binfo)
586{
587 struct nilfs_binfo_v *binfo_v = nilfs_segctor_map_segsum_entry(
588 sci, ssp, sizeof(*binfo_v));
589 *binfo_v = binfo->bi_v;
590}
591
592static void nilfs_write_file_node_binfo(struct nilfs_sc_info *sci,
593 struct nilfs_segsum_pointer *ssp,
594 union nilfs_binfo *binfo)
595{
596 __le64 *vblocknr = nilfs_segctor_map_segsum_entry(
597 sci, ssp, sizeof(*vblocknr));
598 *vblocknr = binfo->bi_v.bi_vblocknr;
599}
600
601struct nilfs_sc_operations nilfs_sc_file_ops = {
602 .collect_data = nilfs_collect_file_data,
603 .collect_node = nilfs_collect_file_node,
604 .collect_bmap = nilfs_collect_file_bmap,
605 .write_data_binfo = nilfs_write_file_data_binfo,
606 .write_node_binfo = nilfs_write_file_node_binfo,
607};
608
609static int nilfs_collect_dat_data(struct nilfs_sc_info *sci,
610 struct buffer_head *bh, struct inode *inode)
611{
612 int err;
613
614 err = nilfs_bmap_propagate(NILFS_I(inode)->i_bmap, bh);
615 if (unlikely(err < 0))
616 return nilfs_handle_bmap_error(err, __func__, inode,
617 sci->sc_super);
618
619 err = nilfs_segctor_add_file_block(sci, bh, inode, sizeof(__le64));
620 if (!err)
621 sci->sc_datablk_cnt++;
622 return err;
623}
624
625static int nilfs_collect_dat_bmap(struct nilfs_sc_info *sci,
626 struct buffer_head *bh, struct inode *inode)
627{
628 WARN_ON(!buffer_dirty(bh));
629 return nilfs_segctor_add_file_block(sci, bh, inode,
630 sizeof(struct nilfs_binfo_dat));
631}
632
633static void nilfs_write_dat_data_binfo(struct nilfs_sc_info *sci,
634 struct nilfs_segsum_pointer *ssp,
635 union nilfs_binfo *binfo)
636{
637 __le64 *blkoff = nilfs_segctor_map_segsum_entry(sci, ssp,
638 sizeof(*blkoff));
639 *blkoff = binfo->bi_dat.bi_blkoff;
640}
641
642static void nilfs_write_dat_node_binfo(struct nilfs_sc_info *sci,
643 struct nilfs_segsum_pointer *ssp,
644 union nilfs_binfo *binfo)
645{
646 struct nilfs_binfo_dat *binfo_dat =
647 nilfs_segctor_map_segsum_entry(sci, ssp, sizeof(*binfo_dat));
648 *binfo_dat = binfo->bi_dat;
649}
650
651struct nilfs_sc_operations nilfs_sc_dat_ops = {
652 .collect_data = nilfs_collect_dat_data,
653 .collect_node = nilfs_collect_file_node,
654 .collect_bmap = nilfs_collect_dat_bmap,
655 .write_data_binfo = nilfs_write_dat_data_binfo,
656 .write_node_binfo = nilfs_write_dat_node_binfo,
657};
658
659struct nilfs_sc_operations nilfs_sc_dsync_ops = {
660 .collect_data = nilfs_collect_file_data,
661 .collect_node = NULL,
662 .collect_bmap = NULL,
663 .write_data_binfo = nilfs_write_file_data_binfo,
664 .write_node_binfo = NULL,
665};
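/*
 * The three operation vectors above are selected per inode and per
 * construction mode; a condensed sketch of the dispatch, mirroring the
 * selection logic in nilfs_segctor_update_payload_blocknr() below:
 *
 *	if (mode == SC_LSEG_DSYNC)
 *		sc_op = &nilfs_sc_dsync_ops;	(data-only sync)
 *	else if (ino == NILFS_DAT_INO)
 *		sc_op = &nilfs_sc_dat_ops;	(DAT uses real block addresses)
 *	else
 *		sc_op = &nilfs_sc_file_ops;	(regular files and metadata)
 */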
666
667static size_t nilfs_lookup_dirty_data_buffers(struct inode *inode,
668 struct list_head *listp,
669 size_t nlimit,
670 loff_t start, loff_t end)
671{
672 struct address_space *mapping = inode->i_mapping;
673 struct pagevec pvec;
674 pgoff_t index = 0, last = ULONG_MAX;
675 size_t ndirties = 0;
676 int i;
677
678 if (unlikely(start != 0 || end != LLONG_MAX)) {
679 /*
680 * A valid range is given for sync-ing data pages. The
 681 * range is rounded to page boundaries; extra dirty buffers
682 * may be included if blocksize < pagesize.
683 */
684 index = start >> PAGE_SHIFT;
685 last = end >> PAGE_SHIFT;
686 }
687 pagevec_init(&pvec, 0);
688 repeat:
689 if (unlikely(index > last) ||
690 !pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
691 min_t(pgoff_t, last - index,
692 PAGEVEC_SIZE - 1) + 1))
693 return ndirties;
694
695 for (i = 0; i < pagevec_count(&pvec); i++) {
696 struct buffer_head *bh, *head;
697 struct page *page = pvec.pages[i];
698
699 if (unlikely(page->index > last))
700 break;
701
702 if (mapping->host) {
703 lock_page(page);
704 if (!page_has_buffers(page))
705 create_empty_buffers(page,
706 1 << inode->i_blkbits, 0);
707 unlock_page(page);
708 }
709
710 bh = head = page_buffers(page);
711 do {
712 if (!buffer_dirty(bh))
713 continue;
714 get_bh(bh);
715 list_add_tail(&bh->b_assoc_buffers, listp);
716 ndirties++;
717 if (unlikely(ndirties >= nlimit)) {
718 pagevec_release(&pvec);
719 cond_resched();
720 return ndirties;
721 }
722 } while (bh = bh->b_this_page, bh != head);
723 }
724 pagevec_release(&pvec);
725 cond_resched();
726 goto repeat;
727}
728
729static void nilfs_lookup_dirty_node_buffers(struct inode *inode,
730 struct list_head *listp)
731{
732 struct nilfs_inode_info *ii = NILFS_I(inode);
733 struct address_space *mapping = &ii->i_btnode_cache;
734 struct pagevec pvec;
735 struct buffer_head *bh, *head;
736 unsigned int i;
737 pgoff_t index = 0;
738
739 pagevec_init(&pvec, 0);
740
741 while (pagevec_lookup_tag(&pvec, mapping, &index, PAGECACHE_TAG_DIRTY,
742 PAGEVEC_SIZE)) {
743 for (i = 0; i < pagevec_count(&pvec); i++) {
744 bh = head = page_buffers(pvec.pages[i]);
745 do {
746 if (buffer_dirty(bh)) {
747 get_bh(bh);
748 list_add_tail(&bh->b_assoc_buffers,
749 listp);
750 }
751 bh = bh->b_this_page;
752 } while (bh != head);
753 }
754 pagevec_release(&pvec);
755 cond_resched();
756 }
757}
758
759static void nilfs_dispose_list(struct nilfs_sb_info *sbi,
760 struct list_head *head, int force)
761{
762 struct nilfs_inode_info *ii, *n;
763 struct nilfs_inode_info *ivec[SC_N_INODEVEC], **pii;
764 unsigned nv = 0;
765
766 while (!list_empty(head)) {
767 spin_lock(&sbi->s_inode_lock);
768 list_for_each_entry_safe(ii, n, head, i_dirty) {
769 list_del_init(&ii->i_dirty);
770 if (force) {
771 if (unlikely(ii->i_bh)) {
772 brelse(ii->i_bh);
773 ii->i_bh = NULL;
774 }
775 } else if (test_bit(NILFS_I_DIRTY, &ii->i_state)) {
776 set_bit(NILFS_I_QUEUED, &ii->i_state);
777 list_add_tail(&ii->i_dirty,
778 &sbi->s_dirty_files);
779 continue;
780 }
781 ivec[nv++] = ii;
782 if (nv == SC_N_INODEVEC)
783 break;
784 }
785 spin_unlock(&sbi->s_inode_lock);
786
787 for (pii = ivec; nv > 0; pii++, nv--)
788 iput(&(*pii)->vfs_inode);
789 }
790}
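/*
 * Note on the batching above: iput() may sleep, so it must not be
 * called under s_inode_lock. The function therefore detaches up to
 * SC_N_INODEVEC (16) inodes per pass into a local vector while holding
 * the spinlock, then drops the lock and calls iput() outside it.
 */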
791
792static int nilfs_test_metadata_dirty(struct nilfs_sb_info *sbi)
793{
794 struct the_nilfs *nilfs = sbi->s_nilfs;
795 int ret = 0;
796
797 if (nilfs_mdt_fetch_dirty(sbi->s_ifile))
798 ret++;
799 if (nilfs_mdt_fetch_dirty(nilfs->ns_cpfile))
800 ret++;
801 if (nilfs_mdt_fetch_dirty(nilfs->ns_sufile))
802 ret++;
803 if (ret || nilfs_doing_gc())
804 if (nilfs_mdt_fetch_dirty(nilfs_dat_inode(nilfs)))
805 ret++;
806 return ret;
807}
808
809static int nilfs_segctor_clean(struct nilfs_sc_info *sci)
810{
811 return list_empty(&sci->sc_dirty_files) &&
812 !test_bit(NILFS_SC_DIRTY, &sci->sc_flags) &&
813 list_empty(&sci->sc_cleaning_segments) &&
814 (!nilfs_doing_gc() || list_empty(&sci->sc_gc_inodes));
815}
816
817static int nilfs_segctor_confirm(struct nilfs_sc_info *sci)
818{
819 struct nilfs_sb_info *sbi = sci->sc_sbi;
820 int ret = 0;
821
822 if (nilfs_test_metadata_dirty(sbi))
823 set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
824
825 spin_lock(&sbi->s_inode_lock);
826 if (list_empty(&sbi->s_dirty_files) && nilfs_segctor_clean(sci))
827 ret++;
828
829 spin_unlock(&sbi->s_inode_lock);
830 return ret;
831}
832
833static void nilfs_segctor_clear_metadata_dirty(struct nilfs_sc_info *sci)
834{
835 struct nilfs_sb_info *sbi = sci->sc_sbi;
836 struct the_nilfs *nilfs = sbi->s_nilfs;
837
838 nilfs_mdt_clear_dirty(sbi->s_ifile);
839 nilfs_mdt_clear_dirty(nilfs->ns_cpfile);
840 nilfs_mdt_clear_dirty(nilfs->ns_sufile);
841 nilfs_mdt_clear_dirty(nilfs_dat_inode(nilfs));
842}
843
844static int nilfs_segctor_create_checkpoint(struct nilfs_sc_info *sci)
845{
846 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
847 struct buffer_head *bh_cp;
848 struct nilfs_checkpoint *raw_cp;
849 int err;
850
851 /* XXX: this interface will be changed */
852 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 1,
853 &raw_cp, &bh_cp);
854 if (likely(!err)) {
 855 /* The following code duplicates logic in cpfile, but it is
856 needed to collect the checkpoint even if it was not newly
857 created */
858 nilfs_mdt_mark_buffer_dirty(bh_cp);
859 nilfs_mdt_mark_dirty(nilfs->ns_cpfile);
860 nilfs_cpfile_put_checkpoint(
861 nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
862 } else
863 WARN_ON(err == -EINVAL || err == -ENOENT);
864
865 return err;
866}
867
868static int nilfs_segctor_fill_in_checkpoint(struct nilfs_sc_info *sci)
869{
870 struct nilfs_sb_info *sbi = sci->sc_sbi;
871 struct the_nilfs *nilfs = sbi->s_nilfs;
872 struct buffer_head *bh_cp;
873 struct nilfs_checkpoint *raw_cp;
874 int err;
875
876 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0,
877 &raw_cp, &bh_cp);
878 if (unlikely(err)) {
879 WARN_ON(err == -EINVAL || err == -ENOENT);
880 goto failed_ibh;
881 }
882 raw_cp->cp_snapshot_list.ssl_next = 0;
883 raw_cp->cp_snapshot_list.ssl_prev = 0;
884 raw_cp->cp_inodes_count =
885 cpu_to_le64(atomic_read(&sbi->s_inodes_count));
886 raw_cp->cp_blocks_count =
887 cpu_to_le64(atomic_read(&sbi->s_blocks_count));
888 raw_cp->cp_nblk_inc =
889 cpu_to_le64(sci->sc_nblk_inc + sci->sc_nblk_this_inc);
890 raw_cp->cp_create = cpu_to_le64(sci->sc_seg_ctime);
891 raw_cp->cp_cno = cpu_to_le64(nilfs->ns_cno);
892
893 if (test_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags))
894 nilfs_checkpoint_clear_minor(raw_cp);
895 else
896 nilfs_checkpoint_set_minor(raw_cp);
897
898 nilfs_write_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode, 1);
899 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
900 return 0;
901
902 failed_ibh:
903 return err;
904}
905
906static void nilfs_fill_in_file_bmap(struct inode *ifile,
907 struct nilfs_inode_info *ii)
908
909{
910 struct buffer_head *ibh;
911 struct nilfs_inode *raw_inode;
912
913 if (test_bit(NILFS_I_BMAP, &ii->i_state)) {
914 ibh = ii->i_bh;
915 BUG_ON(!ibh);
916 raw_inode = nilfs_ifile_map_inode(ifile, ii->vfs_inode.i_ino,
917 ibh);
918 nilfs_bmap_write(ii->i_bmap, raw_inode);
919 nilfs_ifile_unmap_inode(ifile, ii->vfs_inode.i_ino, ibh);
920 }
921}
922
923static void nilfs_segctor_fill_in_file_bmap(struct nilfs_sc_info *sci,
924 struct inode *ifile)
925{
926 struct nilfs_inode_info *ii;
927
928 list_for_each_entry(ii, &sci->sc_dirty_files, i_dirty) {
929 nilfs_fill_in_file_bmap(ifile, ii);
930 set_bit(NILFS_I_COLLECTED, &ii->i_state);
931 }
932}
933
934/*
935 * CRC calculation routines
936 */
937static void nilfs_fill_in_super_root_crc(struct buffer_head *bh_sr, u32 seed)
938{
939 struct nilfs_super_root *raw_sr =
940 (struct nilfs_super_root *)bh_sr->b_data;
941 u32 crc;
942
943 crc = crc32_le(seed,
944 (unsigned char *)raw_sr + sizeof(raw_sr->sr_sum),
945 NILFS_SR_BYTES - sizeof(raw_sr->sr_sum));
946 raw_sr->sr_sum = cpu_to_le32(crc);
947}
948
949static void nilfs_segctor_fill_in_checksums(struct nilfs_sc_info *sci,
950 u32 seed)
951{
952 struct nilfs_segment_buffer *segbuf;
953
954 if (sci->sc_super_root)
955 nilfs_fill_in_super_root_crc(sci->sc_super_root, seed);
956
957 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
958 nilfs_segbuf_fill_in_segsum_crc(segbuf, seed);
959 nilfs_segbuf_fill_in_data_crc(segbuf, seed);
960 }
961}
962
963static void nilfs_segctor_fill_in_super_root(struct nilfs_sc_info *sci,
964 struct the_nilfs *nilfs)
965{
966 struct buffer_head *bh_sr = sci->sc_super_root;
967 struct nilfs_super_root *raw_sr =
968 (struct nilfs_super_root *)bh_sr->b_data;
969 unsigned isz = nilfs->ns_inode_size;
970
971 raw_sr->sr_bytes = cpu_to_le16(NILFS_SR_BYTES);
972 raw_sr->sr_nongc_ctime
973 = cpu_to_le64(nilfs_doing_gc() ?
974 nilfs->ns_nongc_ctime : sci->sc_seg_ctime);
975 raw_sr->sr_flags = 0;
976
977 nilfs_mdt_write_inode_direct(
978 nilfs_dat_inode(nilfs), bh_sr, NILFS_SR_DAT_OFFSET(isz));
979 nilfs_mdt_write_inode_direct(
980 nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(isz));
981 nilfs_mdt_write_inode_direct(
982 nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(isz));
983}
984
985static void nilfs_redirty_inodes(struct list_head *head)
986{
987 struct nilfs_inode_info *ii;
988
989 list_for_each_entry(ii, head, i_dirty) {
990 if (test_bit(NILFS_I_COLLECTED, &ii->i_state))
991 clear_bit(NILFS_I_COLLECTED, &ii->i_state);
992 }
993}
994
995static void nilfs_drop_collected_inodes(struct list_head *head)
996{
997 struct nilfs_inode_info *ii;
998
999 list_for_each_entry(ii, head, i_dirty) {
1000 if (!test_and_clear_bit(NILFS_I_COLLECTED, &ii->i_state))
1001 continue;
1002
1003 clear_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
1004 set_bit(NILFS_I_UPDATED, &ii->i_state);
1005 }
1006}
1007
1008static void nilfs_segctor_cancel_free_segments(struct nilfs_sc_info *sci,
1009 struct inode *sufile)
1010
1011{
1012 struct list_head *head = &sci->sc_cleaning_segments;
1013 struct nilfs_segment_entry *ent;
1014 int err;
1015
1016 list_for_each_entry(ent, head, list) {
1017 if (!(ent->flags & NILFS_SLH_FREED))
1018 break;
1019 err = nilfs_sufile_cancel_free(sufile, ent->segnum);
 1020 WARN_ON(err); /* should not happen */
1021 ent->flags &= ~NILFS_SLH_FREED;
1022 }
1023}
1024
1025static int nilfs_segctor_prepare_free_segments(struct nilfs_sc_info *sci,
1026 struct inode *sufile)
1027{
1028 struct list_head *head = &sci->sc_cleaning_segments;
1029 struct nilfs_segment_entry *ent;
1030 int err;
1031
1032 list_for_each_entry(ent, head, list) {
1033 err = nilfs_sufile_free(sufile, ent->segnum);
1034 if (unlikely(err))
1035 return err;
1036 ent->flags |= NILFS_SLH_FREED;
1037 }
1038 return 0;
1039}
1040
1041static void nilfs_segctor_commit_free_segments(struct nilfs_sc_info *sci)
1042{
1043 nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
1044}
1045
1046static int nilfs_segctor_apply_buffers(struct nilfs_sc_info *sci,
1047 struct inode *inode,
1048 struct list_head *listp,
1049 int (*collect)(struct nilfs_sc_info *,
1050 struct buffer_head *,
1051 struct inode *))
1052{
1053 struct buffer_head *bh, *n;
1054 int err = 0;
1055
1056 if (collect) {
1057 list_for_each_entry_safe(bh, n, listp, b_assoc_buffers) {
1058 list_del_init(&bh->b_assoc_buffers);
1059 err = collect(sci, bh, inode);
1060 brelse(bh);
1061 if (unlikely(err))
1062 goto dispose_buffers;
1063 }
1064 return 0;
1065 }
1066
1067 dispose_buffers:
1068 while (!list_empty(listp)) {
1069 bh = list_entry(listp->next, struct buffer_head,
1070 b_assoc_buffers);
1071 list_del_init(&bh->b_assoc_buffers);
1072 brelse(bh);
1073 }
1074 return err;
1075}
1076
1077static size_t nilfs_segctor_buffer_rest(struct nilfs_sc_info *sci)
1078{
1079 /* Remaining number of blocks within segment buffer */
1080 return sci->sc_segbuf_nblocks -
1081 (sci->sc_nblk_this_inc + sci->sc_curseg->sb_sum.nblocks);
1082}
1083
1084static int nilfs_segctor_scan_file(struct nilfs_sc_info *sci,
1085 struct inode *inode,
1086 struct nilfs_sc_operations *sc_ops)
1087{
1088 LIST_HEAD(data_buffers);
1089 LIST_HEAD(node_buffers);
1090 int err;
1091
1092 if (!(sci->sc_stage.flags & NILFS_CF_NODE)) {
1093 size_t n, rest = nilfs_segctor_buffer_rest(sci);
1094
1095 n = nilfs_lookup_dirty_data_buffers(
1096 inode, &data_buffers, rest + 1, 0, LLONG_MAX);
1097 if (n > rest) {
1098 err = nilfs_segctor_apply_buffers(
1099 sci, inode, &data_buffers,
1100 sc_ops->collect_data);
 1101 BUG_ON(!err); /* always receives -E2BIG or a real error */
1102 goto break_or_fail;
1103 }
1104 }
1105 nilfs_lookup_dirty_node_buffers(inode, &node_buffers);
1106
1107 if (!(sci->sc_stage.flags & NILFS_CF_NODE)) {
1108 err = nilfs_segctor_apply_buffers(
1109 sci, inode, &data_buffers, sc_ops->collect_data);
1110 if (unlikely(err)) {
1111 /* dispose node list */
1112 nilfs_segctor_apply_buffers(
1113 sci, inode, &node_buffers, NULL);
1114 goto break_or_fail;
1115 }
1116 sci->sc_stage.flags |= NILFS_CF_NODE;
1117 }
1118 /* Collect node */
1119 err = nilfs_segctor_apply_buffers(
1120 sci, inode, &node_buffers, sc_ops->collect_node);
1121 if (unlikely(err))
1122 goto break_or_fail;
1123
1124 nilfs_bmap_lookup_dirty_buffers(NILFS_I(inode)->i_bmap, &node_buffers);
1125 err = nilfs_segctor_apply_buffers(
1126 sci, inode, &node_buffers, sc_ops->collect_bmap);
1127 if (unlikely(err))
1128 goto break_or_fail;
1129
1130 nilfs_segctor_end_finfo(sci, inode);
1131 sci->sc_stage.flags &= ~NILFS_CF_NODE;
1132
1133 break_or_fail:
1134 return err;
1135}
1136
1137static int nilfs_segctor_scan_file_dsync(struct nilfs_sc_info *sci,
1138 struct inode *inode)
1139{
1140 LIST_HEAD(data_buffers);
1141 size_t n, rest = nilfs_segctor_buffer_rest(sci);
1142 int err;
1143
1144 n = nilfs_lookup_dirty_data_buffers(inode, &data_buffers, rest + 1,
1145 sci->sc_dsync_start,
1146 sci->sc_dsync_end);
1147
1148 err = nilfs_segctor_apply_buffers(sci, inode, &data_buffers,
1149 nilfs_collect_file_data);
1150 if (!err) {
1151 nilfs_segctor_end_finfo(sci, inode);
1152 BUG_ON(n > rest);
 1153 /* always receives -E2BIG or a real error if n > rest */
1154 }
1155 return err;
1156}
1157
1158static int nilfs_segctor_collect_blocks(struct nilfs_sc_info *sci, int mode)
1159{
1160 struct nilfs_sb_info *sbi = sci->sc_sbi;
1161 struct the_nilfs *nilfs = sbi->s_nilfs;
1162 struct list_head *head;
1163 struct nilfs_inode_info *ii;
1164 int err = 0;
1165
1166 switch (sci->sc_stage.scnt) {
1167 case NILFS_ST_INIT:
1168 /* Pre-processes */
1169 sci->sc_stage.flags = 0;
1170
1171 if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags)) {
1172 sci->sc_nblk_inc = 0;
1173 sci->sc_curseg->sb_sum.flags = NILFS_SS_LOGBGN;
1174 if (mode == SC_LSEG_DSYNC) {
1175 sci->sc_stage.scnt = NILFS_ST_DSYNC;
1176 goto dsync_mode;
1177 }
1178 }
1179
1180 sci->sc_stage.dirty_file_ptr = NULL;
1181 sci->sc_stage.gc_inode_ptr = NULL;
1182 if (mode == SC_FLUSH_DAT) {
1183 sci->sc_stage.scnt = NILFS_ST_DAT;
1184 goto dat_stage;
1185 }
1186 sci->sc_stage.scnt++; /* Fall through */
1187 case NILFS_ST_GC:
1188 if (nilfs_doing_gc()) {
1189 head = &sci->sc_gc_inodes;
1190 ii = list_prepare_entry(sci->sc_stage.gc_inode_ptr,
1191 head, i_dirty);
1192 list_for_each_entry_continue(ii, head, i_dirty) {
1193 err = nilfs_segctor_scan_file(
1194 sci, &ii->vfs_inode,
1195 &nilfs_sc_file_ops);
1196 if (unlikely(err)) {
1197 sci->sc_stage.gc_inode_ptr = list_entry(
1198 ii->i_dirty.prev,
1199 struct nilfs_inode_info,
1200 i_dirty);
1201 goto break_or_fail;
1202 }
1203 set_bit(NILFS_I_COLLECTED, &ii->i_state);
1204 }
1205 sci->sc_stage.gc_inode_ptr = NULL;
1206 }
1207 sci->sc_stage.scnt++; /* Fall through */
1208 case NILFS_ST_FILE:
1209 head = &sci->sc_dirty_files;
1210 ii = list_prepare_entry(sci->sc_stage.dirty_file_ptr, head,
1211 i_dirty);
1212 list_for_each_entry_continue(ii, head, i_dirty) {
1213 clear_bit(NILFS_I_DIRTY, &ii->i_state);
1214
1215 err = nilfs_segctor_scan_file(sci, &ii->vfs_inode,
1216 &nilfs_sc_file_ops);
1217 if (unlikely(err)) {
1218 sci->sc_stage.dirty_file_ptr =
1219 list_entry(ii->i_dirty.prev,
1220 struct nilfs_inode_info,
1221 i_dirty);
1222 goto break_or_fail;
1223 }
1224 /* sci->sc_stage.dirty_file_ptr = NILFS_I(inode); */
1225 /* XXX: required ? */
1226 }
1227 sci->sc_stage.dirty_file_ptr = NULL;
1228 if (mode == SC_FLUSH_FILE) {
1229 sci->sc_stage.scnt = NILFS_ST_DONE;
1230 return 0;
1231 }
1232 sci->sc_stage.scnt++;
1233 sci->sc_stage.flags |= NILFS_CF_IFILE_STARTED;
1234 /* Fall through */
1235 case NILFS_ST_IFILE:
1236 err = nilfs_segctor_scan_file(sci, sbi->s_ifile,
1237 &nilfs_sc_file_ops);
1238 if (unlikely(err))
1239 break;
1240 sci->sc_stage.scnt++;
1241 /* Creating a checkpoint */
1242 err = nilfs_segctor_create_checkpoint(sci);
1243 if (unlikely(err))
1244 break;
1245 /* Fall through */
1246 case NILFS_ST_CPFILE:
1247 err = nilfs_segctor_scan_file(sci, nilfs->ns_cpfile,
1248 &nilfs_sc_file_ops);
1249 if (unlikely(err))
1250 break;
1251 sci->sc_stage.scnt++; /* Fall through */
1252 case NILFS_ST_SUFILE:
1253 err = nilfs_segctor_prepare_free_segments(sci,
1254 nilfs->ns_sufile);
1255 if (unlikely(err))
1256 break;
1257 err = nilfs_segctor_scan_file(sci, nilfs->ns_sufile,
1258 &nilfs_sc_file_ops);
1259 if (unlikely(err))
1260 break;
1261 sci->sc_stage.scnt++; /* Fall through */
1262 case NILFS_ST_DAT:
1263 dat_stage:
1264 err = nilfs_segctor_scan_file(sci, nilfs_dat_inode(nilfs),
1265 &nilfs_sc_dat_ops);
1266 if (unlikely(err))
1267 break;
1268 if (mode == SC_FLUSH_DAT) {
1269 sci->sc_stage.scnt = NILFS_ST_DONE;
1270 return 0;
1271 }
1272 sci->sc_stage.scnt++; /* Fall through */
1273 case NILFS_ST_SR:
1274 if (mode == SC_LSEG_SR) {
1275 /* Appending a super root */
1276 err = nilfs_segctor_add_super_root(sci);
1277 if (unlikely(err))
1278 break;
1279 }
1280 /* End of a logical segment */
1281 sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
1282 sci->sc_stage.scnt = NILFS_ST_DONE;
1283 return 0;
1284 case NILFS_ST_DSYNC:
1285 dsync_mode:
1286 sci->sc_curseg->sb_sum.flags |= NILFS_SS_SYNDT;
1287 ii = sci->sc_dsync_inode;
1288 if (!test_bit(NILFS_I_BUSY, &ii->i_state))
1289 break;
1290
1291 err = nilfs_segctor_scan_file_dsync(sci, &ii->vfs_inode);
1292 if (unlikely(err))
1293 break;
1294 sci->sc_curseg->sb_sum.flags |= NILFS_SS_LOGEND;
1295 sci->sc_stage.scnt = NILFS_ST_DONE;
1296 return 0;
1297 case NILFS_ST_DONE:
1298 return 0;
1299 default:
1300 BUG();
1301 }
1302
1303 break_or_fail:
1304 return err;
1305}
1306
1307static int nilfs_touch_segusage(struct inode *sufile, __u64 segnum)
1308{
1309 struct buffer_head *bh_su;
1310 struct nilfs_segment_usage *raw_su;
1311 int err;
1312
1313 err = nilfs_sufile_get_segment_usage(sufile, segnum, &raw_su, &bh_su);
1314 if (unlikely(err))
1315 return err;
1316 nilfs_mdt_mark_buffer_dirty(bh_su);
1317 nilfs_mdt_mark_dirty(sufile);
1318 nilfs_sufile_put_segment_usage(sufile, segnum, bh_su);
1319 return 0;
1320}
1321
1322static int nilfs_segctor_begin_construction(struct nilfs_sc_info *sci,
1323 struct the_nilfs *nilfs)
1324{
1325 struct nilfs_segment_buffer *segbuf, *n;
1326 __u64 nextnum;
1327 int err;
1328
1329 if (list_empty(&sci->sc_segbufs)) {
1330 segbuf = nilfs_segbuf_new(sci->sc_super);
1331 if (unlikely(!segbuf))
1332 return -ENOMEM;
1333 list_add(&segbuf->sb_list, &sci->sc_segbufs);
1334 } else
1335 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1336
1337 nilfs_segbuf_map(segbuf, nilfs->ns_segnum, nilfs->ns_pseg_offset,
1338 nilfs);
1339
1340 if (segbuf->sb_rest_blocks < NILFS_PSEG_MIN_BLOCKS) {
1341 nilfs_shift_to_next_segment(nilfs);
1342 nilfs_segbuf_map(segbuf, nilfs->ns_segnum, 0, nilfs);
1343 }
1344 sci->sc_segbuf_nblocks = segbuf->sb_rest_blocks;
1345
1346 err = nilfs_touch_segusage(nilfs->ns_sufile, segbuf->sb_segnum);
1347 if (unlikely(err))
1348 return err;
1349
1350 if (nilfs->ns_segnum == nilfs->ns_nextnum) {
1351 /* Start from the head of a new full segment */
1352 err = nilfs_sufile_alloc(nilfs->ns_sufile, &nextnum);
1353 if (unlikely(err))
1354 return err;
1355 } else
1356 nextnum = nilfs->ns_nextnum;
1357
1358 segbuf->sb_sum.seg_seq = nilfs->ns_seg_seq;
1359 nilfs_segbuf_set_next_segnum(segbuf, nextnum, nilfs);
1360
1361 /* truncating segment buffers */
1362 list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs,
1363 sb_list) {
1364 list_del_init(&segbuf->sb_list);
1365 nilfs_segbuf_free(segbuf);
1366 }
1367 return 0;
1368}
1369
1370static int nilfs_segctor_extend_segments(struct nilfs_sc_info *sci,
1371 struct the_nilfs *nilfs, int nadd)
1372{
1373 struct nilfs_segment_buffer *segbuf, *prev, *n;
1374 struct inode *sufile = nilfs->ns_sufile;
1375 __u64 nextnextnum;
1376 LIST_HEAD(list);
1377 int err, ret, i;
1378
1379 prev = NILFS_LAST_SEGBUF(&sci->sc_segbufs);
1380 /*
1381 * Since the segment specified with nextnum might be allocated during
1382 * the previous construction, the buffer including its segusage may
1383 * not be dirty. The following call ensures that the buffer is dirty
 1384 * and will pin the buffer in memory until the sufile is written.
1385 */
1386 err = nilfs_touch_segusage(sufile, prev->sb_nextnum);
1387 if (unlikely(err))
1388 return err;
1389
1390 for (i = 0; i < nadd; i++) {
1391 /* extend segment info */
1392 err = -ENOMEM;
1393 segbuf = nilfs_segbuf_new(sci->sc_super);
1394 if (unlikely(!segbuf))
1395 goto failed;
1396
1397 /* map this buffer to region of segment on-disk */
1398 nilfs_segbuf_map(segbuf, prev->sb_nextnum, 0, nilfs);
1399 sci->sc_segbuf_nblocks += segbuf->sb_rest_blocks;
1400
1401 /* allocate the next next full segment */
1402 err = nilfs_sufile_alloc(sufile, &nextnextnum);
1403 if (unlikely(err))
1404 goto failed_segbuf;
1405
1406 segbuf->sb_sum.seg_seq = prev->sb_sum.seg_seq + 1;
1407 nilfs_segbuf_set_next_segnum(segbuf, nextnextnum, nilfs);
1408
1409 list_add_tail(&segbuf->sb_list, &list);
1410 prev = segbuf;
1411 }
1412 list_splice(&list, sci->sc_segbufs.prev);
1413 return 0;
1414
1415 failed_segbuf:
1416 nilfs_segbuf_free(segbuf);
1417 failed:
1418 list_for_each_entry_safe(segbuf, n, &list, sb_list) {
1419 ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
1420 WARN_ON(ret); /* never fails */
1421 list_del_init(&segbuf->sb_list);
1422 nilfs_segbuf_free(segbuf);
1423 }
1424 return err;
1425}
1426
1427static void nilfs_segctor_free_incomplete_segments(struct nilfs_sc_info *sci,
1428 struct the_nilfs *nilfs)
1429{
1430 struct nilfs_segment_buffer *segbuf;
1431 int ret, done = 0;
1432
1433 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1434 if (nilfs->ns_nextnum != segbuf->sb_nextnum) {
1435 ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum);
1436 WARN_ON(ret); /* never fails */
1437 }
1438 if (segbuf->sb_io_error) {
1439 /* Case 1: The first segment failed */
1440 if (segbuf->sb_pseg_start != segbuf->sb_fseg_start)
1441 /* Case 1a: Partial segment appended into an existing
1442 segment */
1443 nilfs_terminate_segment(nilfs, segbuf->sb_fseg_start,
1444 segbuf->sb_fseg_end);
1445 else /* Case 1b: New full segment */
1446 set_nilfs_discontinued(nilfs);
1447 done++;
1448 }
1449
1450 list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) {
1451 ret = nilfs_sufile_free(nilfs->ns_sufile, segbuf->sb_nextnum);
1452 WARN_ON(ret); /* never fails */
1453 if (!done && segbuf->sb_io_error) {
1454 if (segbuf->sb_segnum != nilfs->ns_nextnum)
1455 /* Case 2: extended segment (!= next) failed */
1456 nilfs_sufile_set_error(nilfs->ns_sufile,
1457 segbuf->sb_segnum);
1458 done++;
1459 }
1460 }
1461}
1462
1463static void nilfs_segctor_clear_segment_buffers(struct nilfs_sc_info *sci)
1464{
1465 struct nilfs_segment_buffer *segbuf;
1466
1467 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list)
1468 nilfs_segbuf_clear(segbuf);
1469 sci->sc_super_root = NULL;
1470}
1471
1472static void nilfs_segctor_destroy_segment_buffers(struct nilfs_sc_info *sci)
1473{
1474 struct nilfs_segment_buffer *segbuf;
1475
1476 while (!list_empty(&sci->sc_segbufs)) {
1477 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1478 list_del_init(&segbuf->sb_list);
1479 nilfs_segbuf_free(segbuf);
1480 }
1481 /* sci->sc_curseg = NULL; */
1482}
1483
1484static void nilfs_segctor_end_construction(struct nilfs_sc_info *sci,
1485 struct the_nilfs *nilfs, int err)
1486{
1487 if (unlikely(err)) {
1488 nilfs_segctor_free_incomplete_segments(sci, nilfs);
1489 nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile);
1490 }
1491 nilfs_segctor_clear_segment_buffers(sci);
1492}
1493
1494static void nilfs_segctor_update_segusage(struct nilfs_sc_info *sci,
1495 struct inode *sufile)
1496{
1497 struct nilfs_segment_buffer *segbuf;
1498 struct buffer_head *bh_su;
1499 struct nilfs_segment_usage *raw_su;
1500 unsigned long live_blocks;
1501 int ret;
1502
1503 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1504 ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
1505 &raw_su, &bh_su);
 1506 WARN_ON(ret); /* always succeeds because bh_su is dirty */
1507 live_blocks = segbuf->sb_sum.nblocks +
1508 (segbuf->sb_pseg_start - segbuf->sb_fseg_start);
1509 raw_su->su_lastmod = cpu_to_le64(sci->sc_seg_ctime);
1510 raw_su->su_nblocks = cpu_to_le32(live_blocks);
1511 nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum,
1512 bh_su);
1513 }
1514}
1515
1516static void nilfs_segctor_cancel_segusage(struct nilfs_sc_info *sci,
1517 struct inode *sufile)
1518{
1519 struct nilfs_segment_buffer *segbuf;
1520 struct buffer_head *bh_su;
1521 struct nilfs_segment_usage *raw_su;
1522 int ret;
1523
1524 segbuf = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1525 ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
1526 &raw_su, &bh_su);
 1527 WARN_ON(ret); /* always succeeds because bh_su is dirty */
1528 raw_su->su_nblocks = cpu_to_le32(segbuf->sb_pseg_start -
1529 segbuf->sb_fseg_start);
1530 nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum, bh_su);
1531
1532 list_for_each_entry_continue(segbuf, &sci->sc_segbufs, sb_list) {
1533 ret = nilfs_sufile_get_segment_usage(sufile, segbuf->sb_segnum,
1534 &raw_su, &bh_su);
 1535 WARN_ON(ret); /* always succeeds */
1536 raw_su->su_nblocks = 0;
1537 nilfs_sufile_put_segment_usage(sufile, segbuf->sb_segnum,
1538 bh_su);
1539 }
1540}
1541
1542static void nilfs_segctor_truncate_segments(struct nilfs_sc_info *sci,
1543 struct nilfs_segment_buffer *last,
1544 struct inode *sufile)
1545{
1546 struct nilfs_segment_buffer *segbuf = last, *n;
1547 int ret;
1548
1549 list_for_each_entry_safe_continue(segbuf, n, &sci->sc_segbufs,
1550 sb_list) {
1551 list_del_init(&segbuf->sb_list);
1552 sci->sc_segbuf_nblocks -= segbuf->sb_rest_blocks;
1553 ret = nilfs_sufile_free(sufile, segbuf->sb_nextnum);
1554 WARN_ON(ret);
1555 nilfs_segbuf_free(segbuf);
1556 }
1557}
1558
1559
1560static int nilfs_segctor_collect(struct nilfs_sc_info *sci,
1561 struct the_nilfs *nilfs, int mode)
1562{
1563 struct nilfs_cstage prev_stage = sci->sc_stage;
1564 int err, nadd = 1;
1565
1566 /* Collection retry loop */
1567 for (;;) {
1568 sci->sc_super_root = NULL;
1569 sci->sc_nblk_this_inc = 0;
1570 sci->sc_curseg = NILFS_FIRST_SEGBUF(&sci->sc_segbufs);
1571
1572 err = nilfs_segctor_reset_segment_buffer(sci);
1573 if (unlikely(err))
1574 goto failed;
1575
1576 err = nilfs_segctor_collect_blocks(sci, mode);
1577 sci->sc_nblk_this_inc += sci->sc_curseg->sb_sum.nblocks;
1578 if (!err)
1579 break;
1580
1581 if (unlikely(err != -E2BIG))
1582 goto failed;
1583
1584 /* The current segment is filled up */
1585 if (mode != SC_LSEG_SR || sci->sc_stage.scnt < NILFS_ST_CPFILE)
1586 break;
1587
1588 nilfs_segctor_cancel_free_segments(sci, nilfs->ns_sufile);
1589 nilfs_segctor_clear_segment_buffers(sci);
1590
1591 err = nilfs_segctor_extend_segments(sci, nilfs, nadd);
1592 if (unlikely(err))
1593 return err;
1594
1595 nadd = min_t(int, nadd << 1, SC_MAX_SEGDELTA);
1596 sci->sc_stage = prev_stage;
1597 }
1598 nilfs_segctor_truncate_segments(sci, sci->sc_curseg, nilfs->ns_sufile);
1599 return 0;
1600
1601 failed:
1602 return err;
1603}
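/*
 * Note on the retry loop above: when collection overflows the current
 * set of segment buffers (-E2BIG), the set is extended by nadd segments
 * and the collection stage is rolled back to prev_stage. nadd doubles on
 * every retry (1, 2, 4, ...) but is capped at SC_MAX_SEGDELTA, bounding
 * how many segments a single retry may add.
 */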
1604
1605static void nilfs_list_replace_buffer(struct buffer_head *old_bh,
1606 struct buffer_head *new_bh)
1607{
1608 BUG_ON(!list_empty(&new_bh->b_assoc_buffers));
1609
1610 list_replace_init(&old_bh->b_assoc_buffers, &new_bh->b_assoc_buffers);
1611 /* The caller must release old_bh */
1612}
1613
1614static int
1615nilfs_segctor_update_payload_blocknr(struct nilfs_sc_info *sci,
1616 struct nilfs_segment_buffer *segbuf,
1617 int mode)
1618{
1619 struct inode *inode = NULL;
1620 sector_t blocknr;
1621 unsigned long nfinfo = segbuf->sb_sum.nfinfo;
1622 unsigned long nblocks = 0, ndatablk = 0;
1623 struct nilfs_sc_operations *sc_op = NULL;
1624 struct nilfs_segsum_pointer ssp;
1625 struct nilfs_finfo *finfo = NULL;
1626 union nilfs_binfo binfo;
1627 struct buffer_head *bh, *bh_org;
1628 ino_t ino = 0;
1629 int err = 0;
1630
1631 if (!nfinfo)
1632 goto out;
1633
1634 blocknr = segbuf->sb_pseg_start + segbuf->sb_sum.nsumblk;
1635 ssp.bh = NILFS_SEGBUF_FIRST_BH(&segbuf->sb_segsum_buffers);
1636 ssp.offset = sizeof(struct nilfs_segment_summary);
1637
1638 list_for_each_entry(bh, &segbuf->sb_payload_buffers, b_assoc_buffers) {
1639 if (bh == sci->sc_super_root)
1640 break;
1641 if (!finfo) {
1642 finfo = nilfs_segctor_map_segsum_entry(
1643 sci, &ssp, sizeof(*finfo));
1644 ino = le64_to_cpu(finfo->fi_ino);
1645 nblocks = le32_to_cpu(finfo->fi_nblocks);
1646 ndatablk = le32_to_cpu(finfo->fi_ndatablk);
1647
1648 if (buffer_nilfs_node(bh))
1649 inode = NILFS_BTNC_I(bh->b_page->mapping);
1650 else
1651 inode = NILFS_AS_I(bh->b_page->mapping);
1652
1653 if (mode == SC_LSEG_DSYNC)
1654 sc_op = &nilfs_sc_dsync_ops;
1655 else if (ino == NILFS_DAT_INO)
1656 sc_op = &nilfs_sc_dat_ops;
1657 else /* file blocks */
1658 sc_op = &nilfs_sc_file_ops;
1659 }
1660 bh_org = bh;
1661 get_bh(bh_org);
1662 err = nilfs_bmap_assign(NILFS_I(inode)->i_bmap, &bh, blocknr,
1663 &binfo);
1664 if (bh != bh_org)
1665 nilfs_list_replace_buffer(bh_org, bh);
1666 brelse(bh_org);
1667 if (unlikely(err))
1668 goto failed_bmap;
1669
1670 if (ndatablk > 0)
1671 sc_op->write_data_binfo(sci, &ssp, &binfo);
1672 else
1673 sc_op->write_node_binfo(sci, &ssp, &binfo);
1674
1675 blocknr++;
1676 if (--nblocks == 0) {
1677 finfo = NULL;
1678 if (--nfinfo == 0)
1679 break;
1680 } else if (ndatablk > 0)
1681 ndatablk--;
1682 }
1683 out:
1684 return 0;
1685
1686 failed_bmap:
1687 err = nilfs_handle_bmap_error(err, __func__, inode, sci->sc_super);
1688 return err;
1689}
1690
1691static int nilfs_segctor_assign(struct nilfs_sc_info *sci, int mode)
1692{
1693 struct nilfs_segment_buffer *segbuf;
1694 int err;
1695
1696 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1697 err = nilfs_segctor_update_payload_blocknr(sci, segbuf, mode);
1698 if (unlikely(err))
1699 return err;
1700 nilfs_segbuf_fill_in_segsum(segbuf);
1701 }
1702 return 0;
1703}
1704
1705static int
1706nilfs_copy_replace_page_buffers(struct page *page, struct list_head *out)
1707{
1708 struct page *clone_page;
1709 struct buffer_head *bh, *head, *bh2;
1710 void *kaddr;
1711
1712 bh = head = page_buffers(page);
1713
1714 clone_page = nilfs_alloc_private_page(bh->b_bdev, bh->b_size, 0);
1715 if (unlikely(!clone_page))
1716 return -ENOMEM;
1717
1718 bh2 = page_buffers(clone_page);
1719 kaddr = kmap_atomic(page, KM_USER0);
1720 do {
1721 if (list_empty(&bh->b_assoc_buffers))
1722 continue;
1723 get_bh(bh2);
1724 page_cache_get(clone_page); /* for each bh */
1725 memcpy(bh2->b_data, kaddr + bh_offset(bh), bh2->b_size);
1726 bh2->b_blocknr = bh->b_blocknr;
1727 list_replace(&bh->b_assoc_buffers, &bh2->b_assoc_buffers);
1728 list_add_tail(&bh->b_assoc_buffers, out);
1729 } while (bh = bh->b_this_page, bh2 = bh2->b_this_page, bh != head);
1730 kunmap_atomic(kaddr, KM_USER0);
1731
1732 if (!TestSetPageWriteback(clone_page))
1733 inc_zone_page_state(clone_page, NR_WRITEBACK);
1734 unlock_page(clone_page);
1735
1736 return 0;
1737}
1738
1739static int nilfs_test_page_to_be_frozen(struct page *page)
1740{
1741 struct address_space *mapping = page->mapping;
1742
1743 if (!mapping || !mapping->host || S_ISDIR(mapping->host->i_mode))
1744 return 0;
1745
1746 if (page_mapped(page)) {
1747 ClearPageChecked(page);
1748 return 1;
1749 }
1750 return PageChecked(page);
1751}
1752
1753static int nilfs_begin_page_io(struct page *page, struct list_head *out)
1754{
1755 if (!page || PageWriteback(page))
1756 /* For split b-tree node pages, this function may be called
 1757 twice. We ignore the second and later calls via this check. */
1758 return 0;
1759
1760 lock_page(page);
1761 clear_page_dirty_for_io(page);
1762 set_page_writeback(page);
1763 unlock_page(page);
1764
1765 if (nilfs_test_page_to_be_frozen(page)) {
1766 int err = nilfs_copy_replace_page_buffers(page, out);
1767 if (unlikely(err))
1768 return err;
1769 }
1770 return 0;
1771}
1772
1773static int nilfs_segctor_prepare_write(struct nilfs_sc_info *sci,
1774 struct page **failed_page)
1775{
1776 struct nilfs_segment_buffer *segbuf;
1777 struct page *bd_page = NULL, *fs_page = NULL;
1778 struct list_head *list = &sci->sc_copied_buffers;
1779 int err;
1780
1781 *failed_page = NULL;
1782 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1783 struct buffer_head *bh;
1784
1785 list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
1786 b_assoc_buffers) {
1787 if (bh->b_page != bd_page) {
1788 if (bd_page) {
1789 lock_page(bd_page);
1790 clear_page_dirty_for_io(bd_page);
1791 set_page_writeback(bd_page);
1792 unlock_page(bd_page);
1793 }
1794 bd_page = bh->b_page;
1795 }
1796 }
1797
1798 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
1799 b_assoc_buffers) {
1800 if (bh == sci->sc_super_root) {
1801 if (bh->b_page != bd_page) {
1802 lock_page(bd_page);
1803 clear_page_dirty_for_io(bd_page);
1804 set_page_writeback(bd_page);
1805 unlock_page(bd_page);
1806 bd_page = bh->b_page;
1807 }
1808 break;
1809 }
1810 if (bh->b_page != fs_page) {
1811 err = nilfs_begin_page_io(fs_page, list);
1812 if (unlikely(err)) {
1813 *failed_page = fs_page;
1814 goto out;
1815 }
1816 fs_page = bh->b_page;
1817 }
1818 }
1819 }
1820 if (bd_page) {
1821 lock_page(bd_page);
1822 clear_page_dirty_for_io(bd_page);
1823 set_page_writeback(bd_page);
1824 unlock_page(bd_page);
1825 }
1826 err = nilfs_begin_page_io(fs_page, list);
1827 if (unlikely(err))
1828 *failed_page = fs_page;
1829 out:
1830 return err;
1831}
1832
1833static int nilfs_segctor_write(struct nilfs_sc_info *sci,
1834 struct backing_dev_info *bdi)
1835{
1836 struct nilfs_segment_buffer *segbuf;
1837 struct nilfs_write_info wi;
1838 int err, res;
1839
1840 wi.sb = sci->sc_super;
1841 wi.bh_sr = sci->sc_super_root;
1842 wi.bdi = bdi;
1843
1844 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1845 nilfs_segbuf_prepare_write(segbuf, &wi);
1846 err = nilfs_segbuf_write(segbuf, &wi);
1847
1848 res = nilfs_segbuf_wait(segbuf, &wi);
1849 err = unlikely(err) ? : res;
1850 if (unlikely(err))
1851 return err;
1852 }
1853 return 0;
1854}
1855
1856static int nilfs_page_has_uncleared_buffer(struct page *page)
1857{
1858 struct buffer_head *head, *bh;
1859
1860 head = bh = page_buffers(page);
1861 do {
1862 if (buffer_dirty(bh) && !list_empty(&bh->b_assoc_buffers))
1863 return 1;
1864 bh = bh->b_this_page;
1865 } while (bh != head);
1866 return 0;
1867}
1868
1869static void __nilfs_end_page_io(struct page *page, int err)
1870{
1871 if (!err) {
1872 if (!nilfs_page_buffers_clean(page))
1873 __set_page_dirty_nobuffers(page);
1874 ClearPageError(page);
1875 } else {
1876 __set_page_dirty_nobuffers(page);
1877 SetPageError(page);
1878 }
1879
1880 if (buffer_nilfs_allocated(page_buffers(page))) {
1881 if (TestClearPageWriteback(page))
1882 dec_zone_page_state(page, NR_WRITEBACK);
1883 } else
1884 end_page_writeback(page);
1885}
1886
1887static void nilfs_end_page_io(struct page *page, int err)
1888{
1889 if (!page)
1890 return;
1891
1892 if (buffer_nilfs_node(page_buffers(page)) &&
1893 nilfs_page_has_uncleared_buffer(page))
1894 /* For b-tree node pages, this function may be called twice
1895 or more because they might be split in a segment.
 1896 This check ensures that cleanup has been done for all
1897 buffers in a split btnode page. */
1898 return;
1899
1900 __nilfs_end_page_io(page, err);
1901}
1902
1903static void nilfs_clear_copied_buffers(struct list_head *list, int err)
1904{
1905 struct buffer_head *bh, *head;
1906 struct page *page;
1907
1908 while (!list_empty(list)) {
1909 bh = list_entry(list->next, struct buffer_head,
1910 b_assoc_buffers);
1911 page = bh->b_page;
1912 page_cache_get(page);
1913 head = bh = page_buffers(page);
1914 do {
1915 if (!list_empty(&bh->b_assoc_buffers)) {
1916 list_del_init(&bh->b_assoc_buffers);
1917 if (!err) {
1918 set_buffer_uptodate(bh);
1919 clear_buffer_dirty(bh);
1920 clear_buffer_nilfs_volatile(bh);
1921 }
1922 brelse(bh); /* for b_assoc_buffers */
1923 }
1924 } while ((bh = bh->b_this_page) != head);
1925
1926 __nilfs_end_page_io(page, err);
1927 page_cache_release(page);
1928 }
1929}
1930
1931static void nilfs_segctor_abort_write(struct nilfs_sc_info *sci,
1932 struct page *failed_page, int err)
1933{
1934 struct nilfs_segment_buffer *segbuf;
1935 struct page *bd_page = NULL, *fs_page = NULL;
1936
1937 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1938 struct buffer_head *bh;
1939
1940 list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
1941 b_assoc_buffers) {
1942 if (bh->b_page != bd_page) {
1943 if (bd_page)
1944 end_page_writeback(bd_page);
1945 bd_page = bh->b_page;
1946 }
1947 }
1948
1949 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
1950 b_assoc_buffers) {
1951 if (bh == sci->sc_super_root) {
1952 if (bh->b_page != bd_page) {
1953 end_page_writeback(bd_page);
1954 bd_page = bh->b_page;
1955 }
1956 break;
1957 }
1958 if (bh->b_page != fs_page) {
1959 nilfs_end_page_io(fs_page, err);
1960 if (unlikely(fs_page == failed_page))
1961 goto done;
1962 fs_page = bh->b_page;
1963 }
1964 }
1965 }
1966 if (bd_page)
1967 end_page_writeback(bd_page);
1968
1969 nilfs_end_page_io(fs_page, err);
1970 done:
1971 nilfs_clear_copied_buffers(&sci->sc_copied_buffers, err);
1972}
1973
1974static void nilfs_set_next_segment(struct the_nilfs *nilfs,
1975 struct nilfs_segment_buffer *segbuf)
1976{
1977 nilfs->ns_segnum = segbuf->sb_segnum;
1978 nilfs->ns_nextnum = segbuf->sb_nextnum;
1979 nilfs->ns_pseg_offset = segbuf->sb_pseg_start - segbuf->sb_fseg_start
1980 + segbuf->sb_sum.nblocks;
1981 nilfs->ns_seg_seq = segbuf->sb_sum.seg_seq;
1982 nilfs->ns_ctime = segbuf->sb_sum.ctime;
1983}
1984
1985static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci)
1986{
1987 struct nilfs_segment_buffer *segbuf;
1988 struct page *bd_page = NULL, *fs_page = NULL;
1989 struct nilfs_sb_info *sbi = sci->sc_sbi;
1990 struct the_nilfs *nilfs = sbi->s_nilfs;
1991 int update_sr = (sci->sc_super_root != NULL);
1992
1993 list_for_each_entry(segbuf, &sci->sc_segbufs, sb_list) {
1994 struct buffer_head *bh;
1995
1996 list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
1997 b_assoc_buffers) {
1998 set_buffer_uptodate(bh);
1999 clear_buffer_dirty(bh);
2000 if (bh->b_page != bd_page) {
2001 if (bd_page)
2002 end_page_writeback(bd_page);
2003 bd_page = bh->b_page;
2004 }
2005 }
2006 /*
2007 * We assume that buffers belonging to the same page are
2008 * contiguous in the buffer list.  Under this assumption,
2009 * the last buffer of each page is identifiable by a
2010 * discontinuity of bh->b_page
2011 * (page != fs_page).
2012 *
2013 * For B-tree node blocks, however, this assumption is not
2014 * guaranteed. The cleanup code of B-tree node pages needs
2015 * special care.
2016 */
2017 list_for_each_entry(bh, &segbuf->sb_payload_buffers,
2018 b_assoc_buffers) {
2019 set_buffer_uptodate(bh);
2020 clear_buffer_dirty(bh);
2021 clear_buffer_nilfs_volatile(bh);
2022 if (bh == sci->sc_super_root) {
2023 if (bh->b_page != bd_page) {
2024 end_page_writeback(bd_page);
2025 bd_page = bh->b_page;
2026 }
2027 break;
2028 }
2029 if (bh->b_page != fs_page) {
2030 nilfs_end_page_io(fs_page, 0);
2031 fs_page = bh->b_page;
2032 }
2033 }
2034
2035 if (!NILFS_SEG_SIMPLEX(&segbuf->sb_sum)) {
2036 if (NILFS_SEG_LOGBGN(&segbuf->sb_sum)) {
2037 set_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
2038 sci->sc_lseg_stime = jiffies;
2039 }
2040 if (NILFS_SEG_LOGEND(&segbuf->sb_sum))
2041 clear_bit(NILFS_SC_UNCLOSED, &sci->sc_flags);
2042 }
2043 }
2044 /*
2045 * Since pages may continue over multiple segment buffers,
2046 * end of the last page must be checked outside of the loop.
2047 */
2048 if (bd_page)
2049 end_page_writeback(bd_page);
2050
2051 nilfs_end_page_io(fs_page, 0);
2052
2053 nilfs_clear_copied_buffers(&sci->sc_copied_buffers, 0);
2054
2055 nilfs_drop_collected_inodes(&sci->sc_dirty_files);
2056
2057 if (nilfs_doing_gc()) {
2058 nilfs_drop_collected_inodes(&sci->sc_gc_inodes);
2059 if (update_sr)
2060 nilfs_commit_gcdat_inode(nilfs);
2061 } else
2062 nilfs->ns_nongc_ctime = sci->sc_seg_ctime;
2063
2064 sci->sc_nblk_inc += sci->sc_nblk_this_inc;
2065
2066 segbuf = NILFS_LAST_SEGBUF(&sci->sc_segbufs);
2067 nilfs_set_next_segment(nilfs, segbuf);
2068
2069 if (update_sr) {
2070 nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start,
2071 segbuf->sb_sum.seg_seq, nilfs->ns_cno++);
2072 sbi->s_super->s_dirt = 1;
2073
2074 clear_bit(NILFS_SC_HAVE_DELTA, &sci->sc_flags);
2075 clear_bit(NILFS_SC_DIRTY, &sci->sc_flags);
2076 set_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags);
2077 } else
2078 clear_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags);
2079}
2080
2081static int nilfs_segctor_check_in_files(struct nilfs_sc_info *sci,
2082 struct nilfs_sb_info *sbi)
2083{
2084 struct nilfs_inode_info *ii, *n;
2085 __u64 cno = sbi->s_nilfs->ns_cno;
2086
2087 spin_lock(&sbi->s_inode_lock);
2088 retry:
2089 list_for_each_entry_safe(ii, n, &sbi->s_dirty_files, i_dirty) {
2090 if (!ii->i_bh) {
2091 struct buffer_head *ibh;
2092 int err;
2093
2094 spin_unlock(&sbi->s_inode_lock);
2095 err = nilfs_ifile_get_inode_block(
2096 sbi->s_ifile, ii->vfs_inode.i_ino, &ibh);
2097 if (unlikely(err)) {
2098 nilfs_warning(sbi->s_super, __func__,
2099 "failed to get inode block.\n");
2100 return err;
2101 }
2102 nilfs_mdt_mark_buffer_dirty(ibh);
2103 nilfs_mdt_mark_dirty(sbi->s_ifile);
2104 spin_lock(&sbi->s_inode_lock);
2105 if (likely(!ii->i_bh))
2106 ii->i_bh = ibh;
2107 else
2108 brelse(ibh);
2109 goto retry;
2110 }
2111 ii->i_cno = cno;
2112
2113 clear_bit(NILFS_I_QUEUED, &ii->i_state);
2114 set_bit(NILFS_I_BUSY, &ii->i_state);
2115 list_del(&ii->i_dirty);
2116 list_add_tail(&ii->i_dirty, &sci->sc_dirty_files);
2117 }
2118 spin_unlock(&sbi->s_inode_lock);
2119
2120 NILFS_I(sbi->s_ifile)->i_cno = cno;
2121
2122 return 0;
2123}
2124
2125static void nilfs_segctor_check_out_files(struct nilfs_sc_info *sci,
2126 struct nilfs_sb_info *sbi)
2127{
2128 struct nilfs_transaction_info *ti = current->journal_info;
2129 struct nilfs_inode_info *ii, *n;
2130 __u64 cno = sbi->s_nilfs->ns_cno;
2131
2132 spin_lock(&sbi->s_inode_lock);
2133 list_for_each_entry_safe(ii, n, &sci->sc_dirty_files, i_dirty) {
2134 if (!test_and_clear_bit(NILFS_I_UPDATED, &ii->i_state) ||
2135 test_bit(NILFS_I_DIRTY, &ii->i_state)) {
2136 /* The current checkpoint number (=nilfs->ns_cno) is
2137 changed between check-in and check-out only if the
2138 super root is written out. So, we can update i_cno
2139 for the inodes that remain in the dirty list. */
2140 ii->i_cno = cno;
2141 continue;
2142 }
2143 clear_bit(NILFS_I_BUSY, &ii->i_state);
2144 brelse(ii->i_bh);
2145 ii->i_bh = NULL;
2146 list_del(&ii->i_dirty);
2147 list_add_tail(&ii->i_dirty, &ti->ti_garbage);
2148 }
2149 spin_unlock(&sbi->s_inode_lock);
2150}
2151
2152/*
2153 * Main procedure of segment constructor
2154 */
2155static int nilfs_segctor_do_construct(struct nilfs_sc_info *sci, int mode)
2156{
2157 struct nilfs_sb_info *sbi = sci->sc_sbi;
2158 struct the_nilfs *nilfs = sbi->s_nilfs;
2159 struct page *failed_page;
2160 int err, has_sr = 0;
2161
2162 sci->sc_stage.scnt = NILFS_ST_INIT;
2163
2164 err = nilfs_segctor_check_in_files(sci, sbi);
2165 if (unlikely(err))
2166 goto out;
2167
2168 if (nilfs_test_metadata_dirty(sbi))
2169 set_bit(NILFS_SC_DIRTY, &sci->sc_flags);
2170
2171 if (nilfs_segctor_clean(sci))
2172 goto out;
2173
2174 do {
2175 sci->sc_stage.flags &= ~NILFS_CF_HISTORY_MASK;
2176
2177 err = nilfs_segctor_begin_construction(sci, nilfs);
2178 if (unlikely(err))
2179 goto out;
2180
2181 /* Update time stamp */
2182 sci->sc_seg_ctime = get_seconds();
2183
2184 err = nilfs_segctor_collect(sci, nilfs, mode);
2185 if (unlikely(err))
2186 goto failed;
2187
2188 has_sr = (sci->sc_super_root != NULL);
2189
2190 /* Avoid empty segment */
2191 if (sci->sc_stage.scnt == NILFS_ST_DONE &&
2192 NILFS_SEG_EMPTY(&sci->sc_curseg->sb_sum)) {
2193 nilfs_segctor_end_construction(sci, nilfs, 1);
2194 goto out;
2195 }
2196
2197 err = nilfs_segctor_assign(sci, mode);
2198 if (unlikely(err))
2199 goto failed;
2200
2201 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
2202 nilfs_segctor_fill_in_file_bmap(sci, sbi->s_ifile);
2203
2204 if (has_sr) {
2205 err = nilfs_segctor_fill_in_checkpoint(sci);
2206 if (unlikely(err))
2207 goto failed_to_make_up;
2208
2209 nilfs_segctor_fill_in_super_root(sci, nilfs);
2210 }
2211 nilfs_segctor_update_segusage(sci, nilfs->ns_sufile);
2212
2213 /* Write partial segments */
2214 err = nilfs_segctor_prepare_write(sci, &failed_page);
2215 if (unlikely(err))
2216 goto failed_to_write;
2217
2218 nilfs_segctor_fill_in_checksums(sci, nilfs->ns_crc_seed);
2219
2220 err = nilfs_segctor_write(sci, nilfs->ns_bdi);
2221 if (unlikely(err))
2222 goto failed_to_write;
2223
2224 nilfs_segctor_complete_write(sci);
2225
2226 /* Commit segments */
2227 if (has_sr) {
2228 nilfs_segctor_commit_free_segments(sci);
2229 nilfs_segctor_clear_metadata_dirty(sci);
2230 }
2231
2232 nilfs_segctor_end_construction(sci, nilfs, 0);
2233
2234 } while (sci->sc_stage.scnt != NILFS_ST_DONE);
2235
2236 out:
2237 nilfs_segctor_destroy_segment_buffers(sci);
2238 nilfs_segctor_check_out_files(sci, sbi);
2239 return err;
2240
2241 failed_to_write:
2242 nilfs_segctor_abort_write(sci, failed_page, err);
2243 nilfs_segctor_cancel_segusage(sci, nilfs->ns_sufile);
2244
2245 failed_to_make_up:
2246 if (sci->sc_stage.flags & NILFS_CF_IFILE_STARTED)
2247 nilfs_redirty_inodes(&sci->sc_dirty_files);
2248
2249 failed:
2250 if (nilfs_doing_gc())
2251 nilfs_redirty_inodes(&sci->sc_gc_inodes);
2252 nilfs_segctor_end_construction(sci, nilfs, err);
2253 goto out;
2254}
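/*
 * Flow sketch (informal summary added for readability; not part of the
 * patch): one pass of the do-loop above is
 *
 *   begin_construction -> collect -> assign
 *     -> fill in checkpoint / super root (when has_sr)
 *     -> update segment usage -> prepare_write -> fill in checksums
 *     -> write -> complete_write -> end_construction
 *
 * and the loop repeats until the collection stage reaches NILFS_ST_DONE.
 * On failure, the labels below unwind these steps in reverse order.
 */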
2255
2256/**
2257 * nilfs_segctor_start_timer - set timer of background write
2258 * @sci: nilfs_sc_info
2259 *
2260 * If the timer has already been set, it ignores the new request.
2261 * This function MUST be called within a section locking the segment
2262 * semaphore.
2263 */
2264static void nilfs_segctor_start_timer(struct nilfs_sc_info *sci)
2265{
2266 spin_lock(&sci->sc_state_lock);
2267 if (sci->sc_timer && !(sci->sc_state & NILFS_SEGCTOR_COMMIT)) {
2268 sci->sc_timer->expires = jiffies + sci->sc_interval;
2269 add_timer(sci->sc_timer);
2270 sci->sc_state |= NILFS_SEGCTOR_COMMIT;
2271 }
2272 spin_unlock(&sci->sc_state_lock);
2273}
2274
2275static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn)
2276{
2277 spin_lock(&sci->sc_state_lock);
2278 if (!(sci->sc_flush_request & (1 << bn))) {
2279 unsigned long prev_req = sci->sc_flush_request;
2280
2281 sci->sc_flush_request |= (1 << bn);
2282 if (!prev_req)
2283 wake_up(&sci->sc_wait_daemon);
2284 }
2285 spin_unlock(&sci->sc_state_lock);
2286}
2287
2288/**
2289 * nilfs_flush_segment - trigger a segment construction for resource control
2290 * @sb: super block
2291 * @ino: inode number of the file to be flushed out.
2292 */
2293void nilfs_flush_segment(struct super_block *sb, ino_t ino)
2294{
2295 struct nilfs_sb_info *sbi = NILFS_SB(sb);
2296 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2297
2298 if (!sci || nilfs_doing_construction())
2299 return;
2300 nilfs_segctor_do_flush(sci, NILFS_MDT_INODE(sb, ino) ? ino : 0);
2301 /* assign bit 0 to data files */
2302}
2303
2304int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *sci,
2305 __u64 *segnum, size_t nsegs)
2306{
2307 struct nilfs_segment_entry *ent;
2308 struct the_nilfs *nilfs = sci->sc_sbi->s_nilfs;
2309 struct inode *sufile = nilfs->ns_sufile;
2310 LIST_HEAD(list);
2311 __u64 *pnum;
2312 size_t i;
2313 int err;
2314
2315 for (pnum = segnum, i = 0; i < nsegs; pnum++, i++) {
2316 ent = nilfs_alloc_segment_entry(*pnum);
2317 if (unlikely(!ent)) {
2318 err = -ENOMEM;
2319 goto failed;
2320 }
2321 list_add_tail(&ent->list, &list);
2322
2323 err = nilfs_open_segment_entry(ent, sufile);
2324 if (unlikely(err))
2325 goto failed;
2326
2327 if (unlikely(!nilfs_segment_usage_dirty(ent->raw_su)))
2328 printk(KERN_WARNING "NILFS: unused segment is "
2329 "requested to be cleaned (segnum=%llu)\n",
2330 (unsigned long long)ent->segnum);
2331 nilfs_close_segment_entry(ent, sufile);
2332 }
2333 list_splice(&list, sci->sc_cleaning_segments.prev);
2334 return 0;
2335
2336 failed:
2337 nilfs_dispose_segment_list(&list);
2338 return err;
2339}
2340
2341void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *sci)
2342{
2343 nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
2344}
2345
2346struct nilfs_segctor_wait_request {
2347 wait_queue_t wq;
2348 __u32 seq;
2349 int err;
2350 atomic_t done;
2351};
2352
2353static int nilfs_segctor_sync(struct nilfs_sc_info *sci)
2354{
2355 struct nilfs_segctor_wait_request wait_req;
2356 int err = 0;
2357
2358 spin_lock(&sci->sc_state_lock);
2359 init_wait(&wait_req.wq);
2360 wait_req.err = 0;
2361 atomic_set(&wait_req.done, 0);
2362 wait_req.seq = ++sci->sc_seq_request;
2363 spin_unlock(&sci->sc_state_lock);
2364
2365 init_waitqueue_entry(&wait_req.wq, current);
2366 add_wait_queue(&sci->sc_wait_request, &wait_req.wq);
2367 set_current_state(TASK_INTERRUPTIBLE);
2368 wake_up(&sci->sc_wait_daemon);
2369
2370 for (;;) {
2371 if (atomic_read(&wait_req.done)) {
2372 err = wait_req.err;
2373 break;
2374 }
2375 if (!signal_pending(current)) {
2376 schedule();
2377 continue;
2378 }
2379 err = -ERESTARTSYS;
2380 break;
2381 }
2382 finish_wait(&sci->sc_wait_request, &wait_req.wq);
2383 return err;
2384}
2385
2386static void nilfs_segctor_wakeup(struct nilfs_sc_info *sci, int err)
2387{
2388 struct nilfs_segctor_wait_request *wrq, *n;
2389 unsigned long flags;
2390
2391 spin_lock_irqsave(&sci->sc_wait_request.lock, flags);
2392 list_for_each_entry_safe(wrq, n, &sci->sc_wait_request.task_list,
2393 wq.task_list) {
2394 if (!atomic_read(&wrq->done) &&
2395 nilfs_cnt32_ge(sci->sc_seq_done, wrq->seq)) {
2396 wrq->err = err;
2397 atomic_set(&wrq->done, 1);
2398 }
2399 if (atomic_read(&wrq->done)) {
2400 wrq->wq.func(&wrq->wq,
2401 TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE,
2402 0, NULL);
2403 }
2404 }
2405 spin_unlock_irqrestore(&sci->sc_wait_request.lock, flags);
2406}
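/*
 * Illustrative sketch (not part of the patch): the sync path above is a
 * ticket protocol.  nilfs_segctor_sync() takes a ticket by bumping
 * sc_seq_request, the daemon advances sc_seq_done when a construction
 * finishes, and nilfs_segctor_wakeup() completes every waiter whose
 * ticket is covered.  nilfs_cnt32_ge() is assumed to be a wraparound-safe
 * 32-bit sequence comparison defined earlier in this file.
 */
#if 0 /* example only */
static int example_request_is_done(struct nilfs_sc_info *sci, __u32 seq)
{
	/* true once the done counter has reached the waiter's ticket */
	return nilfs_cnt32_ge(sci->sc_seq_done, seq);
}
#endif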
2407
2408/**
2409 * nilfs_construct_segment - construct a logical segment
2410 * @sb: super block
2411 *
2412 * Return Value: On success, 0 is returned. On errors, one of the following
2413 * negative error codes is returned.
2414 *
2415 * %-EROFS - Read only filesystem.
2416 *
2417 * %-EIO - I/O error
2418 *
2419 * %-ENOSPC - No space left on device (only in a panic state).
2420 *
2421 * %-ERESTARTSYS - Interrupted.
2422 *
2423 * %-ENOMEM - Insufficient memory available.
2424 */
2425int nilfs_construct_segment(struct super_block *sb)
2426{
2427 struct nilfs_sb_info *sbi = NILFS_SB(sb);
2428 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2429 struct nilfs_transaction_info *ti;
2430 int err;
2431
2432 if (!sci)
2433 return -EROFS;
2434
2435 /* A call inside transactions causes a deadlock. */
2436 BUG_ON((ti = current->journal_info) && ti->ti_magic == NILFS_TI_MAGIC);
2437
2438 err = nilfs_segctor_sync(sci);
2439 return err;
2440}
2441
2442/**
2443 * nilfs_construct_dsync_segment - construct a data-only logical segment
2444 * @sb: super block
2445 * @inode: inode whose data blocks should be written out
2446 * @start: start byte offset
2447 * @end: end byte offset (inclusive)
2448 *
2449 * Return Value: On success, 0 is returned. On errors, one of the following
2450 * negative error codes is returned.
2451 *
2452 * %-EROFS - Read only filesystem.
2453 *
2454 * %-EIO - I/O error
2455 *
2456 * %-ENOSPC - No space left on device (only in a panic state).
2457 *
2458 * %-ERESTARTSYS - Interrupted.
2459 *
2460 * %-ENOMEM - Insufficient memory available.
2461 */
2462int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
2463 loff_t start, loff_t end)
2464{
2465 struct nilfs_sb_info *sbi = NILFS_SB(sb);
2466 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2467 struct nilfs_inode_info *ii;
2468 struct nilfs_transaction_info ti;
2469 int err = 0;
2470
2471 if (!sci)
2472 return -EROFS;
2473
2474 nilfs_transaction_lock(sbi, &ti, 0);
2475
2476 ii = NILFS_I(inode);
2477 if (test_bit(NILFS_I_INODE_DIRTY, &ii->i_state) ||
2478 nilfs_test_opt(sbi, STRICT_ORDER) ||
2479 test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) ||
2480 nilfs_discontinued(sbi->s_nilfs)) {
2481 nilfs_transaction_unlock(sbi);
2482 err = nilfs_segctor_sync(sci);
2483 return err;
2484 }
2485
2486 spin_lock(&sbi->s_inode_lock);
2487 if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
2488 !test_bit(NILFS_I_BUSY, &ii->i_state)) {
2489 spin_unlock(&sbi->s_inode_lock);
2490 nilfs_transaction_unlock(sbi);
2491 return 0;
2492 }
2493 spin_unlock(&sbi->s_inode_lock);
2494 sci->sc_dsync_inode = ii;
2495 sci->sc_dsync_start = start;
2496 sci->sc_dsync_end = end;
2497
2498 err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC);
2499
2500 nilfs_transaction_unlock(sbi);
2501 return err;
2502}
2503
2504struct nilfs_segctor_req {
2505 int mode;
2506 __u32 seq_accepted;
2507 int sc_err; /* construction failure */
2508 int sb_err; /* super block writeback failure */
2509};
2510
2511#define FLUSH_FILE_BIT (0x1) /* data file only */
2512#define FLUSH_DAT_BIT (1 << NILFS_DAT_INO) /* DAT only */
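/*
 * Illustrative note (not part of the patch): sc_flush_request is a bitmap
 * keyed by metadata inode number, with bit 0 standing in for ordinary
 * data files; nilfs_flush_segment() above applies exactly this mapping.
 */
#if 0 /* example only */
static unsigned long example_flush_bit(struct super_block *sb, ino_t ino)
{
	return 1UL << (NILFS_MDT_INODE(sb, ino) ? ino : 0);
}
#endif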
2513
2514static void nilfs_segctor_accept(struct nilfs_sc_info *sci,
2515 struct nilfs_segctor_req *req)
2516{
2517 req->sc_err = req->sb_err = 0;
2518 spin_lock(&sci->sc_state_lock);
2519 req->seq_accepted = sci->sc_seq_request;
2520 spin_unlock(&sci->sc_state_lock);
2521
2522 if (sci->sc_timer)
2523 del_timer_sync(sci->sc_timer);
2524}
2525
2526static void nilfs_segctor_notify(struct nilfs_sc_info *sci,
2527 struct nilfs_segctor_req *req)
2528{
2529 /* Clear requests (even when the construction failed) */
2530 spin_lock(&sci->sc_state_lock);
2531
2532 sci->sc_state &= ~NILFS_SEGCTOR_COMMIT;
2533
2534 if (req->mode == SC_LSEG_SR) {
2535 sci->sc_seq_done = req->seq_accepted;
2536 nilfs_segctor_wakeup(sci, req->sc_err ? : req->sb_err);
2537 sci->sc_flush_request = 0;
2538 } else if (req->mode == SC_FLUSH_FILE)
2539 sci->sc_flush_request &= ~FLUSH_FILE_BIT;
2540 else if (req->mode == SC_FLUSH_DAT)
2541 sci->sc_flush_request &= ~FLUSH_DAT_BIT;
2542
2543 spin_unlock(&sci->sc_state_lock);
2544}
2545
2546static int nilfs_segctor_construct(struct nilfs_sc_info *sci,
2547 struct nilfs_segctor_req *req)
2548{
2549 struct nilfs_sb_info *sbi = sci->sc_sbi;
2550 struct the_nilfs *nilfs = sbi->s_nilfs;
2551 int err = 0;
2552
2553 if (nilfs_discontinued(nilfs))
2554 req->mode = SC_LSEG_SR;
2555 if (!nilfs_segctor_confirm(sci)) {
2556 err = nilfs_segctor_do_construct(sci, req->mode);
2557 req->sc_err = err;
2558 }
2559 if (likely(!err)) {
2560 if (req->mode != SC_FLUSH_DAT)
2561 atomic_set(&nilfs->ns_ndirtyblks, 0);
2562 if (test_bit(NILFS_SC_SUPER_ROOT, &sci->sc_flags) &&
2563 nilfs_discontinued(nilfs)) {
2564 down_write(&nilfs->ns_sem);
2565 req->sb_err = nilfs_commit_super(sbi, 0);
2566 up_write(&nilfs->ns_sem);
2567 }
2568 }
2569 return err;
2570}
2571
2572static void nilfs_construction_timeout(unsigned long data)
2573{
2574 struct task_struct *p = (struct task_struct *)data;
2575 wake_up_process(p);
2576}
2577
2578static void
2579nilfs_remove_written_gcinodes(struct the_nilfs *nilfs, struct list_head *head)
2580{
2581 struct nilfs_inode_info *ii, *n;
2582
2583 list_for_each_entry_safe(ii, n, head, i_dirty) {
2584 if (!test_bit(NILFS_I_UPDATED, &ii->i_state))
2585 continue;
2586 hlist_del_init(&ii->vfs_inode.i_hash);
2587 list_del_init(&ii->i_dirty);
2588 nilfs_clear_gcinode(&ii->vfs_inode);
2589 }
2590}
2591
2592int nilfs_clean_segments(struct super_block *sb, void __user *argp)
2593{
2594 struct nilfs_sb_info *sbi = NILFS_SB(sb);
2595 struct nilfs_sc_info *sci = NILFS_SC(sbi);
2596 struct the_nilfs *nilfs = sbi->s_nilfs;
2597 struct nilfs_transaction_info ti;
2598 struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
2599 int err;
2600
2601 if (unlikely(!sci))
2602 return -EROFS;
2603
2604 nilfs_transaction_lock(sbi, &ti, 1);
2605
2606 err = nilfs_init_gcdat_inode(nilfs);
2607 if (unlikely(err))
2608 goto out_unlock;
2609 err = nilfs_ioctl_prepare_clean_segments(nilfs, argp);
2610 if (unlikely(err))
2611 goto out_unlock;
2612
2613 list_splice_init(&nilfs->ns_gc_inodes, sci->sc_gc_inodes.prev);
2614
2615 for (;;) {
2616 nilfs_segctor_accept(sci, &req);
2617 err = nilfs_segctor_construct(sci, &req);
2618 nilfs_remove_written_gcinodes(nilfs, &sci->sc_gc_inodes);
2619 nilfs_segctor_notify(sci, &req);
2620
2621 if (likely(!err))
2622 break;
2623
2624 nilfs_warning(sb, __func__,
2625 "segment construction failed. (err=%d)", err);
2626 set_current_state(TASK_INTERRUPTIBLE);
2627 schedule_timeout(sci->sc_interval);
2628 }
2629
2630 out_unlock:
2631 nilfs_clear_gcdat_inode(nilfs);
2632 nilfs_transaction_unlock(sbi);
2633 return err;
2634}
2635
2636static void nilfs_segctor_thread_construct(struct nilfs_sc_info *sci, int mode)
2637{
2638 struct nilfs_sb_info *sbi = sci->sc_sbi;
2639 struct nilfs_transaction_info ti;
2640 struct nilfs_segctor_req req = { .mode = mode };
2641
2642 nilfs_transaction_lock(sbi, &ti, 0);
2643
2644 nilfs_segctor_accept(sci, &req);
2645 nilfs_segctor_construct(sci, &req);
2646 nilfs_segctor_notify(sci, &req);
2647
2648 /*
2649 * An unclosed segment should be retried.  We do this using sc_timer:
2650 * a timeout of sc_timer invokes a complete construction, which
2651 * closes the current logical segment.
2652 */
2653 if (test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags))
2654 nilfs_segctor_start_timer(sci);
2655
2656 nilfs_transaction_unlock(sbi);
2657}
2658
2659static void nilfs_segctor_do_immediate_flush(struct nilfs_sc_info *sci)
2660{
2661 int mode = 0;
2662 int err;
2663
2664 spin_lock(&sci->sc_state_lock);
2665 mode = (sci->sc_flush_request & FLUSH_DAT_BIT) ?
2666 SC_FLUSH_DAT : SC_FLUSH_FILE;
2667 spin_unlock(&sci->sc_state_lock);
2668
2669 if (mode) {
2670 err = nilfs_segctor_do_construct(sci, mode);
2671
2672 spin_lock(&sci->sc_state_lock);
2673 sci->sc_flush_request &= (mode == SC_FLUSH_FILE) ?
2674 ~FLUSH_FILE_BIT : ~FLUSH_DAT_BIT;
2675 spin_unlock(&sci->sc_state_lock);
2676 }
2677 clear_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags);
2678}
2679
2680static int nilfs_segctor_flush_mode(struct nilfs_sc_info *sci)
2681{
2682 if (!test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) ||
2683 time_before(jiffies, sci->sc_lseg_stime + sci->sc_mjcp_freq)) {
2684 if (!(sci->sc_flush_request & ~FLUSH_FILE_BIT))
2685 return SC_FLUSH_FILE;
2686 else if (!(sci->sc_flush_request & ~FLUSH_DAT_BIT))
2687 return SC_FLUSH_DAT;
2688 }
2689 return SC_LSEG_SR;
2690}
2691
2692/**
2693 * nilfs_segctor_thread - main loop of the segment constructor thread.
2694 * @arg: pointer to a struct nilfs_sc_info.
2695 *
2696 * nilfs_segctor_thread() initializes a timer and serves as a daemon
2697 * to execute segment constructions.
2698 */
2699static int nilfs_segctor_thread(void *arg)
2700{
2701 struct nilfs_sc_info *sci = (struct nilfs_sc_info *)arg;
2702 struct timer_list timer;
2703 int timeout = 0;
2704
2705 init_timer(&timer);
2706 timer.data = (unsigned long)current;
2707 timer.function = nilfs_construction_timeout;
2708 sci->sc_timer = &timer;
2709
2710 /* start sync. */
2711 sci->sc_task = current;
2712 wake_up(&sci->sc_wait_task); /* for nilfs_segctor_start_thread() */
2713 printk(KERN_INFO
2714 "segctord starting. Construction interval = %lu seconds, "
2715 "CP frequency < %lu seconds\n",
2716 sci->sc_interval / HZ, sci->sc_mjcp_freq / HZ);
2717
2718 spin_lock(&sci->sc_state_lock);
2719 loop:
2720 for (;;) {
2721 int mode;
2722
2723 if (sci->sc_state & NILFS_SEGCTOR_QUIT)
2724 goto end_thread;
2725
2726 if (timeout || sci->sc_seq_request != sci->sc_seq_done)
2727 mode = SC_LSEG_SR;
2728 else if (!sci->sc_flush_request)
2729 break;
2730 else
2731 mode = nilfs_segctor_flush_mode(sci);
2732
2733 spin_unlock(&sci->sc_state_lock);
2734 nilfs_segctor_thread_construct(sci, mode);
2735 spin_lock(&sci->sc_state_lock);
2736 timeout = 0;
2737 }
2738
2739
2740 if (freezing(current)) {
2741 spin_unlock(&sci->sc_state_lock);
2742 refrigerator();
2743 spin_lock(&sci->sc_state_lock);
2744 } else {
2745 DEFINE_WAIT(wait);
2746 int should_sleep = 1;
2747
2748 prepare_to_wait(&sci->sc_wait_daemon, &wait,
2749 TASK_INTERRUPTIBLE);
2750
2751 if (sci->sc_seq_request != sci->sc_seq_done)
2752 should_sleep = 0;
2753 else if (sci->sc_flush_request)
2754 should_sleep = 0;
2755 else if (sci->sc_state & NILFS_SEGCTOR_COMMIT)
2756 should_sleep = time_before(jiffies,
2757 sci->sc_timer->expires);
2758
2759 if (should_sleep) {
2760 spin_unlock(&sci->sc_state_lock);
2761 schedule();
2762 spin_lock(&sci->sc_state_lock);
2763 }
2764 finish_wait(&sci->sc_wait_daemon, &wait);
2765 timeout = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) &&
2766 time_after_eq(jiffies, sci->sc_timer->expires));
2767 }
2768 goto loop;
2769
2770 end_thread:
2771 spin_unlock(&sci->sc_state_lock);
2772 del_timer_sync(sci->sc_timer);
2773 sci->sc_timer = NULL;
2774
2775 /* end sync. */
2776 sci->sc_task = NULL;
2777 wake_up(&sci->sc_wait_task); /* for nilfs_segctor_kill_thread() */
2778 return 0;
2779}
2780
2781static int nilfs_segctor_start_thread(struct nilfs_sc_info *sci)
2782{
2783 struct task_struct *t;
2784
2785 t = kthread_run(nilfs_segctor_thread, sci, "segctord");
2786 if (IS_ERR(t)) {
2787 int err = PTR_ERR(t);
2788
2789 printk(KERN_ERR "NILFS: error %d creating segctord thread\n",
2790 err);
2791 return err;
2792 }
2793 wait_event(sci->sc_wait_task, sci->sc_task != NULL);
2794 return 0;
2795}
2796
2797static void nilfs_segctor_kill_thread(struct nilfs_sc_info *sci)
2798{
2799 sci->sc_state |= NILFS_SEGCTOR_QUIT;
2800
2801 while (sci->sc_task) {
2802 wake_up(&sci->sc_wait_daemon);
2803 spin_unlock(&sci->sc_state_lock);
2804 wait_event(sci->sc_wait_task, sci->sc_task == NULL);
2805 spin_lock(&sci->sc_state_lock);
2806 }
2807}
2808
2809static int nilfs_segctor_init(struct nilfs_sc_info *sci)
2810{
2811 sci->sc_seq_done = sci->sc_seq_request;
2812
2813 return nilfs_segctor_start_thread(sci);
2814}
2815
2816/*
2817 * Setup & clean-up functions
2818 */
2819static struct nilfs_sc_info *nilfs_segctor_new(struct nilfs_sb_info *sbi)
2820{
2821 struct nilfs_sc_info *sci;
2822
2823 sci = kzalloc(sizeof(*sci), GFP_KERNEL);
2824 if (!sci)
2825 return NULL;
2826
2827 sci->sc_sbi = sbi;
2828 sci->sc_super = sbi->s_super;
2829
2830 init_waitqueue_head(&sci->sc_wait_request);
2831 init_waitqueue_head(&sci->sc_wait_daemon);
2832 init_waitqueue_head(&sci->sc_wait_task);
2833 spin_lock_init(&sci->sc_state_lock);
2834 INIT_LIST_HEAD(&sci->sc_dirty_files);
2835 INIT_LIST_HEAD(&sci->sc_segbufs);
2836 INIT_LIST_HEAD(&sci->sc_gc_inodes);
2837 INIT_LIST_HEAD(&sci->sc_cleaning_segments);
2838 INIT_LIST_HEAD(&sci->sc_copied_buffers);
2839
2840 sci->sc_interval = HZ * NILFS_SC_DEFAULT_TIMEOUT;
2841 sci->sc_mjcp_freq = HZ * NILFS_SC_DEFAULT_SR_FREQ;
2842 sci->sc_watermark = NILFS_SC_DEFAULT_WATERMARK;
2843
2844 if (sbi->s_interval)
2845 sci->sc_interval = sbi->s_interval;
2846 if (sbi->s_watermark)
2847 sci->sc_watermark = sbi->s_watermark;
2848 return sci;
2849}
2850
2851static void nilfs_segctor_write_out(struct nilfs_sc_info *sci)
2852{
2853 int ret, retrycount = NILFS_SC_CLEANUP_RETRY;
2854
2855 /* The segctord thread was stopped and its timer was removed.
2856 But some tasks remain. */
2857 do {
2858 struct nilfs_sb_info *sbi = sci->sc_sbi;
2859 struct nilfs_transaction_info ti;
2860 struct nilfs_segctor_req req = { .mode = SC_LSEG_SR };
2861
2862 nilfs_transaction_lock(sbi, &ti, 0);
2863 nilfs_segctor_accept(sci, &req);
2864 ret = nilfs_segctor_construct(sci, &req);
2865 nilfs_segctor_notify(sci, &req);
2866 nilfs_transaction_unlock(sbi);
2867
2868 } while (ret && retrycount-- > 0);
2869}
2870
2871/**
2872 * nilfs_segctor_destroy - destroy the segment constructor.
2873 * @sci: nilfs_sc_info
2874 *
2875 * nilfs_segctor_destroy() kills the segctord thread and frees
2876 * the nilfs_sc_info struct.
2877 * Caller must hold the segment semaphore.
2878 */
2879static void nilfs_segctor_destroy(struct nilfs_sc_info *sci)
2880{
2881 struct nilfs_sb_info *sbi = sci->sc_sbi;
2882 int flag;
2883
2884 up_write(&sbi->s_nilfs->ns_segctor_sem);
2885
2886 spin_lock(&sci->sc_state_lock);
2887 nilfs_segctor_kill_thread(sci);
2888 flag = ((sci->sc_state & NILFS_SEGCTOR_COMMIT) || sci->sc_flush_request
2889 || sci->sc_seq_request != sci->sc_seq_done);
2890 spin_unlock(&sci->sc_state_lock);
2891
2892 if (flag || nilfs_segctor_confirm(sci))
2893 nilfs_segctor_write_out(sci);
2894
2895 WARN_ON(!list_empty(&sci->sc_copied_buffers));
2896
2897 if (!list_empty(&sci->sc_dirty_files)) {
2898 nilfs_warning(sbi->s_super, __func__,
2899 "dirty file(s) after the final construction\n");
2900 nilfs_dispose_list(sbi, &sci->sc_dirty_files, 1);
2901 }
2902
2903 if (!list_empty(&sci->sc_cleaning_segments))
2904 nilfs_dispose_segment_list(&sci->sc_cleaning_segments);
2905
2906 WARN_ON(!list_empty(&sci->sc_segbufs));
2907
2908 down_write(&sbi->s_nilfs->ns_segctor_sem);
2909
2910 kfree(sci);
2911}
2912
2913/**
2914 * nilfs_attach_segment_constructor - attach a segment constructor
2915 * @sbi: nilfs_sb_info
2916 *
2917 * nilfs_attach_segment_constructor() allocates a struct nilfs_sc_info,
2918 * initializes it, and starts the segment constructor.
2919 *
2920 * Return Value: On success, 0 is returned. On error, one of the following
2921 * negative error codes is returned.
2922 *
2923 * %-ENOMEM - Insufficient memory available.
2924 */
2925int nilfs_attach_segment_constructor(struct nilfs_sb_info *sbi)
2926{
2927 struct the_nilfs *nilfs = sbi->s_nilfs;
2928 int err;
2929
2930 /* Each field of nilfs_segctor is cleared through the initialization
2931 of super-block info */
2932 sbi->s_sc_info = nilfs_segctor_new(sbi);
2933 if (!sbi->s_sc_info)
2934 return -ENOMEM;
2935
2936 nilfs_attach_writer(nilfs, sbi);
2937 err = nilfs_segctor_init(NILFS_SC(sbi));
2938 if (err) {
2939 nilfs_detach_writer(nilfs, sbi);
2940 kfree(sbi->s_sc_info);
2941 sbi->s_sc_info = NULL;
2942 }
2943 return err;
2944}
2945
2946/**
2947 * nilfs_detach_segment_constructor - destroy the segment constructor
2948 * @sbi: nilfs_sb_info
2949 *
2950 * nilfs_detach_segment_constructor() kills the segment constructor daemon,
2951 * frees the struct nilfs_sc_info, and destroys the dirty file list.
2952 */
2953void nilfs_detach_segment_constructor(struct nilfs_sb_info *sbi)
2954{
2955 struct the_nilfs *nilfs = sbi->s_nilfs;
2956 LIST_HEAD(garbage_list);
2957
2958 down_write(&nilfs->ns_segctor_sem);
2959 if (NILFS_SC(sbi)) {
2960 nilfs_segctor_destroy(NILFS_SC(sbi));
2961 sbi->s_sc_info = NULL;
2962 }
2963
2964 /* Forcibly free the list of dirty files */
2965 spin_lock(&sbi->s_inode_lock);
2966 if (!list_empty(&sbi->s_dirty_files)) {
2967 list_splice_init(&sbi->s_dirty_files, &garbage_list);
2968 nilfs_warning(sbi->s_super, __func__,
2969 "Non-empty dirty list after the last "
2970 "segment construction\n");
2971 }
2972 spin_unlock(&sbi->s_inode_lock);
2973 up_write(&nilfs->ns_segctor_sem);
2974
2975 nilfs_dispose_list(sbi, &garbage_list, 1);
2976 nilfs_detach_writer(nilfs, sbi);
2977}
diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h
new file mode 100644
index 000000000000..a98fc1ed0bbb
--- /dev/null
+++ b/fs/nilfs2/segment.h
@@ -0,0 +1,243 @@
1/*
2 * segment.h - NILFS Segment constructor prototypes and definitions
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23#ifndef _NILFS_SEGMENT_H
24#define _NILFS_SEGMENT_H
25
26#include <linux/types.h>
27#include <linux/fs.h>
28#include <linux/buffer_head.h>
29#include <linux/nilfs2_fs.h>
30#include "sb.h"
31
32/**
33 * struct nilfs_recovery_info - Recovery information
34 * @ri_need_recovery: Recovery status
35 * @ri_super_root: Block number of the last super root
36 * @ri_cno: Number of the last checkpoint
37 * @ri_lsegs_start: Region for roll-forwarding (start block number)
38 * @ri_lsegs_end: Region for roll-forwarding (end block number)
39 * @ri_lsegs_start_seq: Sequence value of the segment at ri_lsegs_start
40 * @ri_used_segments: List of segments to be marked active
41 * @ri_pseg_start: Block number of the last partial segment
42 * @ri_seq: Sequence number on the last partial segment
43 * @ri_segnum: Segment number on the last partial segment
44 * @ri_nextnum: Next segment number on the last partial segment
45 */
46struct nilfs_recovery_info {
47 int ri_need_recovery;
48 sector_t ri_super_root;
49 __u64 ri_cno;
50
51 sector_t ri_lsegs_start;
52 sector_t ri_lsegs_end;
53 u64 ri_lsegs_start_seq;
54 struct list_head ri_used_segments;
55 sector_t ri_pseg_start;
56 u64 ri_seq;
57 __u64 ri_segnum;
58 __u64 ri_nextnum;
59};
60
61/* ri_need_recovery */
62#define NILFS_RECOVERY_SR_UPDATED 1 /* The super root was updated */
63#define NILFS_RECOVERY_ROLLFORWARD_DONE 2 /* Rollforward was carried out */
64
65/**
66 * struct nilfs_cstage - Context of collection stage
67 * @scnt: Stage count
68 * @flags: State flags
69 * @dirty_file_ptr: Pointer into the dirty_files list, or inode of a target file
70 * @gc_inode_ptr: Pointer into the list of gc-inodes
71 */
72struct nilfs_cstage {
73 int scnt;
74 unsigned flags;
75 struct nilfs_inode_info *dirty_file_ptr;
76 struct nilfs_inode_info *gc_inode_ptr;
77};
78
79struct nilfs_segment_buffer;
80
81struct nilfs_segsum_pointer {
82 struct buffer_head *bh;
83 unsigned offset; /* offset in bytes */
84};
85
86/**
87 * struct nilfs_sc_info - Segment constructor information
88 * @sc_super: Back pointer to super_block struct
89 * @sc_sbi: Back pointer to nilfs_sb_info struct
90 * @sc_nblk_inc: Block count of current generation
91 * @sc_dirty_files: List of files to be written
92 * @sc_gc_inodes: List of GC inodes having blocks to be written
93 * @sc_cleaning_segments: List of segments to be freed through construction
94 * @sc_copied_buffers: List of copied buffers (buffer heads) to freeze data
95 * @sc_dsync_inode: inode whose data pages are written for a sync operation
96 * @sc_dsync_start: start byte offset of data pages
97 * @sc_dsync_end: end byte offset of data pages (inclusive)
98 * @sc_segbufs: List of segment buffers
99 * @sc_segbuf_nblocks: Number of available blocks in segment buffers.
100 * @sc_curseg: Current segment buffer
101 * @sc_super_root: Pointer to the super root buffer
102 * @sc_stage: Collection stage
103 * @sc_finfo_ptr: pointer to the current finfo struct in the segment summary
104 * @sc_binfo_ptr: pointer to the current binfo struct in the segment summary
105 * @sc_blk_cnt: Block count of a file
106 * @sc_datablk_cnt: Data block count of a file
107 * @sc_nblk_this_inc: Number of blocks included in the current logical segment
108 * @sc_seg_ctime: Creation time
109 * @sc_flags: Internal flags
110 * @sc_state_lock: spinlock for sc_state and so on
111 * @sc_state: Segctord state flags
112 * @sc_flush_request: inode bitmap of metadata files to be flushed
113 * @sc_wait_request: Client request queue
114 * @sc_wait_daemon: Daemon wait queue
115 * @sc_wait_task: Start/end wait queue to control segctord task
116 * @sc_seq_request: Request counter
117 * @sc_seq_done: Completion counter
118 * @sc_sync: Request of explicit sync operation
119 * @sc_interval: Timeout value of background construction
120 * @sc_mjcp_freq: Frequency of creating checkpoints
121 * @sc_lseg_stime: Start time of the latest logical segment
122 * @sc_watermark: Watermark for the number of dirty buffers
123 * @sc_timer: Timer for segctord
124 * @sc_task: current thread of segctord
125 */
126struct nilfs_sc_info {
127 struct super_block *sc_super;
128 struct nilfs_sb_info *sc_sbi;
129
130 unsigned long sc_nblk_inc;
131
132 struct list_head sc_dirty_files;
133 struct list_head sc_gc_inodes;
134 struct list_head sc_cleaning_segments;
135 struct list_head sc_copied_buffers;
136
137 struct nilfs_inode_info *sc_dsync_inode;
138 loff_t sc_dsync_start;
139 loff_t sc_dsync_end;
140
141 /* Segment buffers */
142 struct list_head sc_segbufs;
143 unsigned long sc_segbuf_nblocks;
144 struct nilfs_segment_buffer *sc_curseg;
145 struct buffer_head *sc_super_root;
146
147 struct nilfs_cstage sc_stage;
148
149 struct nilfs_segsum_pointer sc_finfo_ptr;
150 struct nilfs_segsum_pointer sc_binfo_ptr;
151 unsigned long sc_blk_cnt;
152 unsigned long sc_datablk_cnt;
153 unsigned long sc_nblk_this_inc;
154 time_t sc_seg_ctime;
155
156 unsigned long sc_flags;
157
158 spinlock_t sc_state_lock;
159 unsigned long sc_state;
160 unsigned long sc_flush_request;
161
162 wait_queue_head_t sc_wait_request;
163 wait_queue_head_t sc_wait_daemon;
164 wait_queue_head_t sc_wait_task;
165
166 __u32 sc_seq_request;
167 __u32 sc_seq_done;
168
169 int sc_sync;
170 unsigned long sc_interval;
171 unsigned long sc_mjcp_freq;
172 unsigned long sc_lseg_stime; /* in 1/HZ seconds */
173 unsigned long sc_watermark;
174
175 struct timer_list *sc_timer;
176 struct task_struct *sc_task;
177};
178
179/* sc_flags */
180enum {
181 NILFS_SC_DIRTY, /* One or more dirty meta-data blocks exist */
182 NILFS_SC_UNCLOSED, /* Logical segment is not closed */
183 NILFS_SC_SUPER_ROOT, /* The latest segment has a super root */
184 NILFS_SC_PRIOR_FLUSH, /* Requesting immediate flush without making a
185 checkpoint */
186 NILFS_SC_HAVE_DELTA, /* Next checkpoint will have update of files
187 other than DAT, cpfile, sufile, or files
188 moved by GC */
189};
190
191/* sc_state */
192#define NILFS_SEGCTOR_QUIT 0x0001 /* segctord is being destroyed */
193#define NILFS_SEGCTOR_COMMIT 0x0004 /* committed transaction exists */
194
195/*
196 * Constant parameters
197 */
198#define NILFS_SC_CLEANUP_RETRY 3 /* Retry count of construction when
199 destroying segctord */
200
201/*
202 * Default values of timeout, in seconds.
203 */
204#define NILFS_SC_DEFAULT_TIMEOUT 5 /* Timeout value of dirty blocks.
205 It triggers construction of a
206 logical segment with a super root */
207#define NILFS_SC_DEFAULT_SR_FREQ 30 /* Maximum frequency of super root
208 creation */
209
210/*
211 * The default threshold amount of data, in block counts.
212 */
213#define NILFS_SC_DEFAULT_WATERMARK 3600
214
215
216/* segment.c */
217extern int nilfs_init_transaction_cache(void);
218extern void nilfs_destroy_transaction_cache(void);
219extern void nilfs_relax_pressure_in_lock(struct super_block *);
220
221extern int nilfs_construct_segment(struct super_block *);
222extern int nilfs_construct_dsync_segment(struct super_block *, struct inode *,
223 loff_t, loff_t);
224extern void nilfs_flush_segment(struct super_block *, ino_t);
225extern int nilfs_clean_segments(struct super_block *, void __user *);
226
227extern int nilfs_segctor_add_segments_to_be_freed(struct nilfs_sc_info *,
228 __u64 *, size_t);
229extern void nilfs_segctor_clear_segments_to_be_freed(struct nilfs_sc_info *);
230
231extern int nilfs_attach_segment_constructor(struct nilfs_sb_info *);
232extern void nilfs_detach_segment_constructor(struct nilfs_sb_info *);
233
234/* recovery.c */
235extern int nilfs_read_super_root_block(struct super_block *, sector_t,
236 struct buffer_head **, int);
237extern int nilfs_search_super_root(struct the_nilfs *, struct nilfs_sb_info *,
238 struct nilfs_recovery_info *);
239extern int nilfs_recover_logical_segments(struct the_nilfs *,
240 struct nilfs_sb_info *,
241 struct nilfs_recovery_info *);
242
243#endif /* _NILFS_SEGMENT_H */
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
new file mode 100644
index 000000000000..c774cf397e2f
--- /dev/null
+++ b/fs/nilfs2/sufile.c
@@ -0,0 +1,640 @@
1/*
2 * sufile.c - NILFS segment usage file.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#include <linux/kernel.h>
24#include <linux/fs.h>
25#include <linux/string.h>
26#include <linux/buffer_head.h>
27#include <linux/errno.h>
28#include <linux/nilfs2_fs.h>
29#include "mdt.h"
30#include "sufile.h"
31
32
33static inline unsigned long
34nilfs_sufile_segment_usages_per_block(const struct inode *sufile)
35{
36 return NILFS_MDT(sufile)->mi_entries_per_block;
37}
38
39static unsigned long
40nilfs_sufile_get_blkoff(const struct inode *sufile, __u64 segnum)
41{
42 __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset;
43 do_div(t, nilfs_sufile_segment_usages_per_block(sufile));
44 return (unsigned long)t;
45}
46
47static unsigned long
48nilfs_sufile_get_offset(const struct inode *sufile, __u64 segnum)
49{
50 __u64 t = segnum + NILFS_MDT(sufile)->mi_first_entry_offset;
51 return do_div(t, nilfs_sufile_segment_usages_per_block(sufile));
52}
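/*
 * Worked example (illustrative; the sizes below are assumptions, not
 * taken from this patch): with 4 KiB blocks and 16-byte segment usage
 * entries, mi_entries_per_block = 256.  For mi_first_entry_offset = 1
 * and segnum = 1000, t = 1001, so the entry lives in sufile block
 * 1001 / 256 = 3 at in-block index 1001 % 256 = 233.
 */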
53
54static unsigned long
55nilfs_sufile_segment_usages_in_block(const struct inode *sufile, __u64 curr,
56 __u64 max)
57{
58 return min_t(unsigned long,
59 nilfs_sufile_segment_usages_per_block(sufile) -
60 nilfs_sufile_get_offset(sufile, curr),
61 max - curr + 1);
62}
63
64static inline struct nilfs_sufile_header *
65nilfs_sufile_block_get_header(const struct inode *sufile,
66 struct buffer_head *bh,
67 void *kaddr)
68{
69 return kaddr + bh_offset(bh);
70}
71
72static struct nilfs_segment_usage *
73nilfs_sufile_block_get_segment_usage(const struct inode *sufile, __u64 segnum,
74 struct buffer_head *bh, void *kaddr)
75{
76 return kaddr + bh_offset(bh) +
77 nilfs_sufile_get_offset(sufile, segnum) *
78 NILFS_MDT(sufile)->mi_entry_size;
79}
80
81static inline int nilfs_sufile_get_header_block(struct inode *sufile,
82 struct buffer_head **bhp)
83{
84 return nilfs_mdt_get_block(sufile, 0, 0, NULL, bhp);
85}
86
87static inline int
88nilfs_sufile_get_segment_usage_block(struct inode *sufile, __u64 segnum,
89 int create, struct buffer_head **bhp)
90{
91 return nilfs_mdt_get_block(sufile,
92 nilfs_sufile_get_blkoff(sufile, segnum),
93 create, NULL, bhp);
94}
95
96/**
97 * nilfs_sufile_alloc - allocate a segment
98 * @sufile: inode of segment usage file
99 * @segnump: pointer to segment number
100 *
101 * Description: nilfs_sufile_alloc() allocates a clean segment.
102 *
103 * Return Value: On success, 0 is returned and the segment number of the
104 * allocated segment is stored in the place pointed by @segnump. On error, one
105 * of the following negative error codes is returned.
106 *
107 * %-EIO - I/O error.
108 *
109 * %-ENOMEM - Insufficient amount of memory available.
110 *
111 * %-ENOSPC - No clean segment left.
112 */
113int nilfs_sufile_alloc(struct inode *sufile, __u64 *segnump)
114{
115 struct buffer_head *header_bh, *su_bh;
116 struct the_nilfs *nilfs;
117 struct nilfs_sufile_header *header;
118 struct nilfs_segment_usage *su;
119 size_t susz = NILFS_MDT(sufile)->mi_entry_size;
120 __u64 segnum, maxsegnum, last_alloc;
121 void *kaddr;
122 unsigned long nsegments, ncleansegs, nsus;
123 int ret, i, j;
124
125 down_write(&NILFS_MDT(sufile)->mi_sem);
126
127 nilfs = NILFS_MDT(sufile)->mi_nilfs;
128
129 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
130 if (ret < 0)
131 goto out_sem;
132 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
133 header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
134 ncleansegs = le64_to_cpu(header->sh_ncleansegs);
135 last_alloc = le64_to_cpu(header->sh_last_alloc);
136 kunmap_atomic(kaddr, KM_USER0);
137
138 nsegments = nilfs_sufile_get_nsegments(sufile);
139 segnum = last_alloc + 1;
140 maxsegnum = nsegments - 1;
141 for (i = 0; i < nsegments; i += nsus) {
142 if (segnum >= nsegments) {
143 /* wrap around */
144 segnum = 0;
145 maxsegnum = last_alloc;
146 }
147 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1,
148 &su_bh);
149 if (ret < 0)
150 goto out_header;
151 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
152 su = nilfs_sufile_block_get_segment_usage(
153 sufile, segnum, su_bh, kaddr);
154
155 nsus = nilfs_sufile_segment_usages_in_block(
156 sufile, segnum, maxsegnum);
157 for (j = 0; j < nsus; j++, su = (void *)su + susz, segnum++) {
158 if (!nilfs_segment_usage_clean(su))
159 continue;
160 /* found a clean segment */
161 nilfs_segment_usage_set_dirty(su);
162 kunmap_atomic(kaddr, KM_USER0);
163
164 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
165 header = nilfs_sufile_block_get_header(
166 sufile, header_bh, kaddr);
167 le64_add_cpu(&header->sh_ncleansegs, -1);
168 le64_add_cpu(&header->sh_ndirtysegs, 1);
169 header->sh_last_alloc = cpu_to_le64(segnum);
170 kunmap_atomic(kaddr, KM_USER0);
171
172 nilfs_mdt_mark_buffer_dirty(header_bh);
173 nilfs_mdt_mark_buffer_dirty(su_bh);
174 nilfs_mdt_mark_dirty(sufile);
175 brelse(su_bh);
176 *segnump = segnum;
177 goto out_header;
178 }
179
180 kunmap_atomic(kaddr, KM_USER0);
181 brelse(su_bh);
182 }
183
184 /* no segments left */
185 ret = -ENOSPC;
186
187 out_header:
188 brelse(header_bh);
189
190 out_sem:
191 up_write(&NILFS_MDT(sufile)->mi_sem);
192 return ret;
193}
194
195/**
196 * nilfs_sufile_cancel_free - cancel freeing of a segment
197 * @sufile: inode of segment usage file
198 * @segnum: segment number
199 *
200 * Description: nilfs_sufile_cancel_free() moves the clean segment specified
201 * by @segnum back to the dirty state, updating the clean/dirty segment counts.
202 * Return Value: On success, 0 is returned. On error, one of the following
203 * negative error codes is returned.
204 *
205 * %-EIO - I/O error.
206 *
207 * %-ENOMEM - Insufficient amount of memory available.
208 */
209int nilfs_sufile_cancel_free(struct inode *sufile, __u64 segnum)
210{
211 struct buffer_head *header_bh, *su_bh;
212 struct the_nilfs *nilfs;
213 struct nilfs_sufile_header *header;
214 struct nilfs_segment_usage *su;
215 void *kaddr;
216 int ret;
217
218 down_write(&NILFS_MDT(sufile)->mi_sem);
219
220 nilfs = NILFS_MDT(sufile)->mi_nilfs;
221
222 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
223 if (ret < 0)
224 goto out_sem;
225
226 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &su_bh);
227 if (ret < 0)
228 goto out_header;
229
230 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
231 su = nilfs_sufile_block_get_segment_usage(
232 sufile, segnum, su_bh, kaddr);
233 if (unlikely(!nilfs_segment_usage_clean(su))) {
234 printk(KERN_WARNING "%s: segment %llu must be clean\n",
235 __func__, (unsigned long long)segnum);
236 kunmap_atomic(kaddr, KM_USER0);
237 goto out_su_bh;
238 }
239 nilfs_segment_usage_set_dirty(su);
240 kunmap_atomic(kaddr, KM_USER0);
241
242 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
243 header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
244 le64_add_cpu(&header->sh_ncleansegs, -1);
245 le64_add_cpu(&header->sh_ndirtysegs, 1);
246 kunmap_atomic(kaddr, KM_USER0);
247
248 nilfs_mdt_mark_buffer_dirty(header_bh);
249 nilfs_mdt_mark_buffer_dirty(su_bh);
250 nilfs_mdt_mark_dirty(sufile);
251
252 out_su_bh:
253 brelse(su_bh);
254 out_header:
255 brelse(header_bh);
256 out_sem:
257 up_write(&NILFS_MDT(sufile)->mi_sem);
258 return ret;
259}
260
261/**
262 * nilfs_sufile_freev - free segments
263 * @sufile: inode of segment usage file
264 * @segnum: array of segment numbers
265 * @nsegs: number of segments
266 *
267 * Description: nilfs_sufile_freev() frees segments specified by @segnum and
268 * @nsegs, which must have been returned by a previous call to
269 * nilfs_sufile_alloc().
270 *
271 * Return Value: On success, 0 is returned. On error, one of the following
272 * negative error codes is returned.
273 *
274 * %-EIO - I/O error.
275 *
276 * %-ENOMEM - Insufficient amount of memory available.
277 */
278#define NILFS_SUFILE_FREEV_PREALLOC 16
279int nilfs_sufile_freev(struct inode *sufile, __u64 *segnum, size_t nsegs)
280{
281 struct buffer_head *header_bh, **su_bh,
282 *su_bh_prealloc[NILFS_SUFILE_FREEV_PREALLOC];
283 struct the_nilfs *nilfs;
284 struct nilfs_sufile_header *header;
285 struct nilfs_segment_usage *su;
286 void *kaddr;
287 int ret, i;
288
289 down_write(&NILFS_MDT(sufile)->mi_sem);
290
291 nilfs = NILFS_MDT(sufile)->mi_nilfs;
292
293 /* prepare resources */
294 if (nsegs <= NILFS_SUFILE_FREEV_PREALLOC)
295 su_bh = su_bh_prealloc;
296 else {
297 su_bh = kmalloc(sizeof(*su_bh) * nsegs, GFP_NOFS);
298 if (su_bh == NULL) {
299 ret = -ENOMEM;
300 goto out_sem;
301 }
302 }
303
304 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
305 if (ret < 0)
306 goto out_su_bh;
307 for (i = 0; i < nsegs; i++) {
308 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum[i],
309 0, &su_bh[i]);
310 if (ret < 0)
311 goto out_bh;
312 }
313
314 /* free segments */
315 for (i = 0; i < nsegs; i++) {
316 kaddr = kmap_atomic(su_bh[i]->b_page, KM_USER0);
317 su = nilfs_sufile_block_get_segment_usage(
318 sufile, segnum[i], su_bh[i], kaddr);
319 WARN_ON(nilfs_segment_usage_error(su));
320 nilfs_segment_usage_set_clean(su);
321 kunmap_atomic(kaddr, KM_USER0);
322 nilfs_mdt_mark_buffer_dirty(su_bh[i]);
323 }
324 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
325 header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
326 le64_add_cpu(&header->sh_ncleansegs, nsegs);
327 le64_add_cpu(&header->sh_ndirtysegs, -(u64)nsegs);
328 kunmap_atomic(kaddr, KM_USER0);
329 nilfs_mdt_mark_buffer_dirty(header_bh);
330 nilfs_mdt_mark_dirty(sufile);
331
332 out_bh:
333 for (i--; i >= 0; i--)
334 brelse(su_bh[i]);
335 brelse(header_bh);
336
337 out_su_bh:
338 if (su_bh != su_bh_prealloc)
339 kfree(su_bh);
340
341 out_sem:
342 up_write(&NILFS_MDT(sufile)->mi_sem);
343 return ret;
344}
345
346/**
347 * nilfs_sufile_free - free a single segment
348 * @sufile: inode of segment usage file
349 * @segnum: segment number to be freed
350 */
351int nilfs_sufile_free(struct inode *sufile, __u64 segnum)
352{
353 return nilfs_sufile_freev(sufile, &segnum, 1);
354}
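/*
 * Usage sketch (hypothetical caller, not part of this patch): a clean
 * segment is taken with nilfs_sufile_alloc() and handed back with
 * nilfs_sufile_free() if it turns out to be unneeded.
 */
#if 0 /* example only */
static int example_alloc_then_release(struct inode *sufile)
{
	__u64 segnum;
	int err;

	err = nilfs_sufile_alloc(sufile, &segnum);
	if (err)
		return err;	/* e.g. -ENOSPC when no clean segment is left */
	/* ... decide the segment is not needed after all ... */
	return nilfs_sufile_free(sufile, segnum);
}
#endif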
355
356/**
357 * nilfs_sufile_get_segment_usage - get a segment usage
358 * @sufile: inode of segment usage file
359 * @segnum: segment number
360 * @sup: pointer to segment usage
361 * @bhp: pointer to buffer head
362 *
363 * Description: nilfs_sufile_get_segment_usage() acquires the segment usage
364 * specified by @segnum.
365 *
366 * Return Value: On success, 0 is returned, and the segment usage and the
367 * buffer head of the buffer on which the segment usage is located are stored
368 * in the place pointed by @sup and @bhp, respectively. On error, one of the
369 * following negative error codes is returned.
370 *
371 * %-EIO - I/O error.
372 *
373 * %-ENOMEM - Insufficient amount of memory available.
374 *
375 * %-EINVAL - Invalid segment usage number.
376 */
377int nilfs_sufile_get_segment_usage(struct inode *sufile, __u64 segnum,
378 struct nilfs_segment_usage **sup,
379 struct buffer_head **bhp)
380{
381 struct buffer_head *bh;
382 struct nilfs_segment_usage *su;
383 void *kaddr;
384 int ret;
385
386 /* segnum is 0 origin */
387 if (segnum >= nilfs_sufile_get_nsegments(sufile))
388 return -EINVAL;
389 down_write(&NILFS_MDT(sufile)->mi_sem);
390 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 1, &bh);
391 if (ret < 0)
392 goto out_sem;
393 kaddr = kmap(bh->b_page);
394 su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
395 if (nilfs_segment_usage_error(su)) {
396 kunmap(bh->b_page);
397 brelse(bh);
398 ret = -EINVAL;
399 goto out_sem;
400 }
401
402 if (sup != NULL)
403 *sup = su;
404 *bhp = bh;
405
406 out_sem:
407 up_write(&NILFS_MDT(sufile)->mi_sem);
408 return ret;
409}
410
411/**
412 * nilfs_sufile_put_segment_usage - put a segment usage
413 * @sufile: inode of segment usage file
414 * @segnum: segment number
415 * @bh: buffer head
416 *
417 * Description: nilfs_sufile_put_segment_usage() releases the segment usage
418 * specified by @segnum. @bh must be the buffer head which has been returned
419 * by a previous call to nilfs_sufile_get_segment_usage() with @segnum.
420 */
421void nilfs_sufile_put_segment_usage(struct inode *sufile, __u64 segnum,
422 struct buffer_head *bh)
423{
424 kunmap(bh->b_page);
425 brelse(bh);
426}
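/*
 * Pairing sketch (hypothetical caller, not part of this patch): the
 * get/put pair brackets access to a mapped segment usage entry; the page
 * stays kmapped and the buffer head referenced until the put call.
 */
#if 0 /* example only */
static int example_inspect_usage(struct inode *sufile, __u64 segnum)
{
	struct nilfs_segment_usage *su;
	struct buffer_head *bh;
	int err;

	err = nilfs_sufile_get_segment_usage(sufile, segnum, &su, &bh);
	if (err)
		return err;
	/* ... read or update *su while the mapping is held ... */
	nilfs_sufile_put_segment_usage(sufile, segnum, bh);
	return 0;
}
#endif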
427
428/**
429 * nilfs_sufile_get_stat - get segment usage statistics
430 * @sufile: inode of segment usage file
431 * @sustat: pointer to a structure of segment usage statistics
432 *
433 * Description: nilfs_sufile_get_stat() returns information about segment
434 * usage.
435 *
436 * Return Value: On success, 0 is returned, and segment usage information is
437 * stored in the place pointed by @sustat. On error, one of the following
438 * negative error codes is returned.
439 *
440 * %-EIO - I/O error.
441 *
442 * %-ENOMEM - Insufficient amount of memory available.
443 */
444int nilfs_sufile_get_stat(struct inode *sufile, struct nilfs_sustat *sustat)
445{
446 struct buffer_head *header_bh;
447 struct nilfs_sufile_header *header;
448 struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs;
449 void *kaddr;
450 int ret;
451
452 down_read(&NILFS_MDT(sufile)->mi_sem);
453
454 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
455 if (ret < 0)
456 goto out_sem;
457
458 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
459 header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
460 sustat->ss_nsegs = nilfs_sufile_get_nsegments(sufile);
461 sustat->ss_ncleansegs = le64_to_cpu(header->sh_ncleansegs);
462 sustat->ss_ndirtysegs = le64_to_cpu(header->sh_ndirtysegs);
463 sustat->ss_ctime = nilfs->ns_ctime;
464 sustat->ss_nongc_ctime = nilfs->ns_nongc_ctime;
465 spin_lock(&nilfs->ns_last_segment_lock);
466 sustat->ss_prot_seq = nilfs->ns_prot_seq;
467 spin_unlock(&nilfs->ns_last_segment_lock);
468 kunmap_atomic(kaddr, KM_USER0);
469 brelse(header_bh);
470
471 out_sem:
472 up_read(&NILFS_MDT(sufile)->mi_sem);
473 return ret;
474}
475
476/**
477 * nilfs_sufile_get_ncleansegs - get the number of clean segments
478 * @sufile: inode of segment usage file
479 * @nsegsp: pointer to the number of clean segments
480 *
481 * Description: nilfs_sufile_get_ncleansegs() acquires the number of clean
482 * segments.
483 *
484 * Return Value: On success, 0 is returned and the number of clean segments is
485 * stored in the place pointed by @nsegsp. On error, one of the following
486 * negative error codes is returned.
487 *
488 * %-EIO - I/O error.
489 *
490 * %-ENOMEM - Insufficient amount of memory available.
491 */
492int nilfs_sufile_get_ncleansegs(struct inode *sufile, unsigned long *nsegsp)
493{
494 struct nilfs_sustat sustat;
495 int ret;
496
497 ret = nilfs_sufile_get_stat(sufile, &sustat);
498 if (ret == 0)
499 *nsegsp = sustat.ss_ncleansegs;
500 return ret;
501}
502
503/**
504 * nilfs_sufile_set_error - mark a segment as erroneous
505 * @sufile: inode of segment usage file
506 * @segnum: segment number
507 *
508 * Description: nilfs_sufile_set_error() marks the segment specified by
509 * @segnum as erroneous. The error segment will never be used again.
510 *
511 * Return Value: On success, 0 is returned. On error, one of the following
512 * negative error codes is returned.
513 *
514 * %-EIO - I/O error.
515 *
516 * %-ENOMEM - Insufficient amount of memory available.
517 *
518 * %-EINVAL - Invalid segment usage number.
519 */
520int nilfs_sufile_set_error(struct inode *sufile, __u64 segnum)
521{
522 struct buffer_head *header_bh, *su_bh;
523 struct nilfs_segment_usage *su;
524 struct nilfs_sufile_header *header;
525 void *kaddr;
526 int ret;
527
528 if (unlikely(segnum >= nilfs_sufile_get_nsegments(sufile))) {
529 printk(KERN_WARNING "%s: invalid segment number: %llu\n",
530 __func__, (unsigned long long)segnum);
531 return -EINVAL;
532 }
533 down_write(&NILFS_MDT(sufile)->mi_sem);
534
535 ret = nilfs_sufile_get_header_block(sufile, &header_bh);
536 if (ret < 0)
537 goto out_sem;
538 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &su_bh);
539 if (ret < 0)
540 goto out_header;
541
542 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
543 su = nilfs_sufile_block_get_segment_usage(sufile, segnum, su_bh, kaddr);
544 if (nilfs_segment_usage_error(su)) {
545 kunmap_atomic(kaddr, KM_USER0);
546 brelse(su_bh);
547 goto out_header;
548 }
549
550 nilfs_segment_usage_set_error(su);
551 kunmap_atomic(kaddr, KM_USER0);
552 brelse(su_bh);
553
554 kaddr = kmap_atomic(header_bh->b_page, KM_USER0);
555 header = nilfs_sufile_block_get_header(sufile, header_bh, kaddr);
556 le64_add_cpu(&header->sh_ndirtysegs, -1);
557 kunmap_atomic(kaddr, KM_USER0);
558 nilfs_mdt_mark_buffer_dirty(header_bh);
559 nilfs_mdt_mark_buffer_dirty(su_bh);
560 nilfs_mdt_mark_dirty(sufile);
561 brelse(su_bh);
562
563 out_header:
564 brelse(header_bh);
565
566 out_sem:
567 up_write(&NILFS_MDT(sufile)->mi_sem);
568 return ret;
569}
570
571/**
572 * nilfs_sufile_get_suinfo - get segment usage information
573 * @sufile: inode of segment usage file
574 * @segnum: segment number to start looking
575 * @si: array of suinfo
576 * @nsi: size of suinfo array
577 *
578 * Description: nilfs_sufile_get_suinfo() fetches usage information of up to
579 * @nsi segments starting at @segnum and stores it in the @si array.
580 * Return Value: On success, the number of entries stored in @si is returned.
581 * On error, one of the following negative error codes is returned.
582 *
583 * %-EIO - I/O error.
584 *
585 * %-ENOMEM - Insufficient amount of memory available.
586 */
587ssize_t nilfs_sufile_get_suinfo(struct inode *sufile, __u64 segnum,
588 struct nilfs_suinfo *si, size_t nsi)
589{
590 struct buffer_head *su_bh;
591 struct nilfs_segment_usage *su;
592 size_t susz = NILFS_MDT(sufile)->mi_entry_size;
593 struct the_nilfs *nilfs = NILFS_MDT(sufile)->mi_nilfs;
594 void *kaddr;
595 unsigned long nsegs, segusages_per_block;
596 ssize_t n;
597 int ret, i, j;
598
599 down_read(&NILFS_MDT(sufile)->mi_sem);
600
601 segusages_per_block = nilfs_sufile_segment_usages_per_block(sufile);
602 nsegs = min_t(unsigned long,
603 nilfs_sufile_get_nsegments(sufile) - segnum,
604 nsi);
605 for (i = 0; i < nsegs; i += n, segnum += n) {
606 n = min_t(unsigned long,
607 segusages_per_block -
608 nilfs_sufile_get_offset(sufile, segnum),
609 nsegs - i);
610 ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0,
611 &su_bh);
612 if (ret < 0) {
613 if (ret != -ENOENT)
614 goto out;
615 /* hole */
616 memset(&si[i], 0, sizeof(struct nilfs_suinfo) * n);
617 continue;
618 }
619
620 kaddr = kmap_atomic(su_bh->b_page, KM_USER0);
621 su = nilfs_sufile_block_get_segment_usage(
622 sufile, segnum, su_bh, kaddr);
623 for (j = 0; j < n; j++, su = (void *)su + susz) {
624 si[i + j].sui_lastmod = le64_to_cpu(su->su_lastmod);
625 si[i + j].sui_nblocks = le32_to_cpu(su->su_nblocks);
626 si[i + j].sui_flags = le32_to_cpu(su->su_flags) &
627 ~(1UL << NILFS_SEGMENT_USAGE_ACTIVE);
628 if (nilfs_segment_is_active(nilfs, segnum + i + j))
629 si[i + j].sui_flags |=
630 (1UL << NILFS_SEGMENT_USAGE_ACTIVE);
631 }
632 kunmap_atomic(kaddr, KM_USER0);
633 brelse(su_bh);
634 }
635 ret = nsegs;
636
637 out:
638 up_read(&NILFS_MDT(sufile)->mi_sem);
639 return ret;
640}
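A hedged caller sketch (not part of this patch) of the array-filling contract above; 'sufile' is assumed to be a valid segment usage file inode:

	struct nilfs_suinfo si[8];
	ssize_t n, k;

	n = nilfs_sufile_get_suinfo(sufile, 0, si, ARRAY_SIZE(si));
	for (k = 0; k < n; k++)	/* n < 0 is an error code; loop is skipped */
		printk(KERN_DEBUG "segment %zd: %u blocks, flags=0x%x\n",
		       k, si[k].sui_nblocks, si[k].sui_flags);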
diff --git a/fs/nilfs2/sufile.h b/fs/nilfs2/sufile.h
new file mode 100644
index 000000000000..d595f33a768d
--- /dev/null
+++ b/fs/nilfs2/sufile.h
@@ -0,0 +1,54 @@
1/*
2 * sufile.h - NILFS segment usage file.
3 *
4 * Copyright (C) 2006-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Koji Sato <koji@osrg.net>.
21 */
22
23#ifndef _NILFS_SUFILE_H
24#define _NILFS_SUFILE_H
25
26#include <linux/fs.h>
27#include <linux/buffer_head.h>
28#include <linux/nilfs2_fs.h>
29#include "mdt.h"
30
31#define NILFS_SUFILE_GFP NILFS_MDT_GFP
32
33static inline unsigned long nilfs_sufile_get_nsegments(struct inode *sufile)
34{
35 return NILFS_MDT(sufile)->mi_nilfs->ns_nsegments;
36}
37
38int nilfs_sufile_alloc(struct inode *, __u64 *);
39int nilfs_sufile_cancel_free(struct inode *, __u64);
40int nilfs_sufile_freev(struct inode *, __u64 *, size_t);
41int nilfs_sufile_free(struct inode *, __u64);
42int nilfs_sufile_get_segment_usage(struct inode *, __u64,
43 struct nilfs_segment_usage **,
44 struct buffer_head **);
45void nilfs_sufile_put_segment_usage(struct inode *, __u64,
46 struct buffer_head *);
47int nilfs_sufile_get_stat(struct inode *, struct nilfs_sustat *);
48int nilfs_sufile_get_ncleansegs(struct inode *, unsigned long *);
49int nilfs_sufile_set_error(struct inode *, __u64);
50ssize_t nilfs_sufile_get_suinfo(struct inode *, __u64, struct nilfs_suinfo *,
51 size_t);
52
53
54#endif /* _NILFS_SUFILE_H */
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
new file mode 100644
index 000000000000..e117e1ea9bff
--- /dev/null
+++ b/fs/nilfs2/super.c
@@ -0,0 +1,1323 @@
1/*
2 * super.c - NILFS module and super block management.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 */
22/*
23 * linux/fs/ext2/super.c
24 *
25 * Copyright (C) 1992, 1993, 1994, 1995
26 * Remy Card (card@masi.ibp.fr)
27 * Laboratoire MASI - Institut Blaise Pascal
28 * Universite Pierre et Marie Curie (Paris VI)
29 *
30 * from
31 *
32 * linux/fs/minix/inode.c
33 *
34 * Copyright (C) 1991, 1992 Linus Torvalds
35 *
36 * Big-endian to little-endian byte-swapping/bitmaps by
37 * David S. Miller (davem@caip.rutgers.edu), 1995
38 */
39
40#include <linux/module.h>
41#include <linux/string.h>
42#include <linux/slab.h>
43#include <linux/init.h>
44#include <linux/blkdev.h>
45#include <linux/parser.h>
46#include <linux/random.h>
47#include <linux/crc32.h>
48#include <linux/smp_lock.h>
49#include <linux/vfs.h>
50#include <linux/writeback.h>
51#include <linux/kobject.h>
52#include <linux/exportfs.h>
53#include "nilfs.h"
54#include "mdt.h"
55#include "alloc.h"
56#include "page.h"
57#include "cpfile.h"
58#include "ifile.h"
59#include "dat.h"
60#include "segment.h"
61#include "segbuf.h"
62
63MODULE_AUTHOR("NTT Corp.");
64MODULE_DESCRIPTION("A New Implementation of the Log-structured Filesystem "
65 "(NILFS)");
66MODULE_VERSION(NILFS_VERSION);
67MODULE_LICENSE("GPL");
68
69static int nilfs_remount(struct super_block *sb, int *flags, char *data);
70static int test_exclusive_mount(struct file_system_type *fs_type,
71 struct block_device *bdev, int flags);
72
73/**
74 * nilfs_error() - report failure condition on a filesystem
75 *
76 * nilfs_error() sets an ERROR_FS flag on the superblock as well as
77 * reporting an error message. It should be called when NILFS detects
79 * inconsistencies or defects of metadata on disk. For recoverable
80 * errors such as a one-off I/O error, nilfs_warning() or plain
81 * printk() should be used instead.
81 *
82 * The segment constructor must not call this function because it can
83 * kill itself.
84 */
85void nilfs_error(struct super_block *sb, const char *function,
86 const char *fmt, ...)
87{
88 struct nilfs_sb_info *sbi = NILFS_SB(sb);
89 va_list args;
90
91 va_start(args, fmt);
92 printk(KERN_CRIT "NILFS error (device %s): %s: ", sb->s_id, function);
93 vprintk(fmt, args);
94 printk("\n");
95 va_end(args);
96
97 if (!(sb->s_flags & MS_RDONLY)) {
98 struct the_nilfs *nilfs = sbi->s_nilfs;
99
100 if (!nilfs_test_opt(sbi, ERRORS_CONT))
101 nilfs_detach_segment_constructor(sbi);
102
103 down_write(&nilfs->ns_sem);
104 if (!(nilfs->ns_mount_state & NILFS_ERROR_FS)) {
105 nilfs->ns_mount_state |= NILFS_ERROR_FS;
106 nilfs->ns_sbp[0]->s_state |=
107 cpu_to_le16(NILFS_ERROR_FS);
108 nilfs_commit_super(sbi, 1);
109 }
110 up_write(&nilfs->ns_sem);
111
112 if (nilfs_test_opt(sbi, ERRORS_RO)) {
113 printk(KERN_CRIT "Remounting filesystem read-only\n");
114 sb->s_flags |= MS_RDONLY;
115 }
116 }
117
118 if (nilfs_test_opt(sbi, ERRORS_PANIC))
119 panic("NILFS (device %s): panic forced after error\n",
120 sb->s_id);
121}
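As a usage illustration (a sketch, not code from this patch), a caller that detects on-disk metadata corruption would report it through nilfs_error() so the errors= policy above takes effect; the predicate below is an assumed placeholder:

	/* hypothetical sketch: 'nilfs_bmap_is_broken' is assumed, not real */
	if (nilfs_bmap_is_broken(inode))
		nilfs_error(inode->i_sb, __func__,
			    "broken bmap (inode number=%lu)", inode->i_ino);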
122
123void nilfs_warning(struct super_block *sb, const char *function,
124 const char *fmt, ...)
125{
126 va_list args;
127
128 va_start(args, fmt);
129 printk(KERN_WARNING "NILFS warning (device %s): %s: ",
130 sb->s_id, function);
131 vprintk(fmt, args);
132 printk("\n");
133 va_end(args);
134}
135
136static struct kmem_cache *nilfs_inode_cachep;
137
138struct inode *nilfs_alloc_inode(struct super_block *sb)
139{
140 struct nilfs_inode_info *ii;
141
142 ii = kmem_cache_alloc(nilfs_inode_cachep, GFP_NOFS);
143 if (!ii)
144 return NULL;
145 ii->i_bh = NULL;
146 ii->i_state = 0;
147 ii->vfs_inode.i_version = 1;
148 nilfs_btnode_cache_init(&ii->i_btnode_cache);
149 return &ii->vfs_inode;
150}
151
152void nilfs_destroy_inode(struct inode *inode)
153{
154 kmem_cache_free(nilfs_inode_cachep, NILFS_I(inode));
155}
156
157static void init_once(void *obj)
158{
159 struct nilfs_inode_info *ii = obj;
160
161 INIT_LIST_HEAD(&ii->i_dirty);
162#ifdef CONFIG_NILFS_XATTR
163 init_rwsem(&ii->xattr_sem);
164#endif
165 nilfs_btnode_cache_init_once(&ii->i_btnode_cache);
166 ii->i_bmap = (struct nilfs_bmap *)&ii->i_bmap_union;
167 inode_init_once(&ii->vfs_inode);
168}
169
170static int nilfs_init_inode_cache(void)
171{
172 nilfs_inode_cachep = kmem_cache_create("nilfs2_inode_cache",
173 sizeof(struct nilfs_inode_info),
174 0, SLAB_RECLAIM_ACCOUNT,
175 init_once);
176
177 return (nilfs_inode_cachep == NULL) ? -ENOMEM : 0;
178}
179
180static inline void nilfs_destroy_inode_cache(void)
181{
182 kmem_cache_destroy(nilfs_inode_cachep);
183}
184
185static void nilfs_clear_inode(struct inode *inode)
186{
187 struct nilfs_inode_info *ii = NILFS_I(inode);
188
189#ifdef CONFIG_NILFS_POSIX_ACL
190 if (ii->i_acl && ii->i_acl != NILFS_ACL_NOT_CACHED) {
191 posix_acl_release(ii->i_acl);
192 ii->i_acl = NILFS_ACL_NOT_CACHED;
193 }
194 if (ii->i_default_acl && ii->i_default_acl != NILFS_ACL_NOT_CACHED) {
195 posix_acl_release(ii->i_default_acl);
196 ii->i_default_acl = NILFS_ACL_NOT_CACHED;
197 }
198#endif
199 /*
200 * Free resources allocated in nilfs_read_inode(), here.
201 */
202 BUG_ON(!list_empty(&ii->i_dirty));
203 brelse(ii->i_bh);
204 ii->i_bh = NULL;
205
206 if (test_bit(NILFS_I_BMAP, &ii->i_state))
207 nilfs_bmap_clear(ii->i_bmap);
208
209 nilfs_btnode_cache_clear(&ii->i_btnode_cache);
210}
211
212static int nilfs_sync_super(struct nilfs_sb_info *sbi, int dupsb)
213{
214 struct the_nilfs *nilfs = sbi->s_nilfs;
215 int err;
216 int barrier_done = 0;
217
218 if (nilfs_test_opt(sbi, BARRIER)) {
219 set_buffer_ordered(nilfs->ns_sbh[0]);
220 barrier_done = 1;
221 }
222 retry:
223 set_buffer_dirty(nilfs->ns_sbh[0]);
224 err = sync_dirty_buffer(nilfs->ns_sbh[0]);
225 if (err == -EOPNOTSUPP && barrier_done) {
226 nilfs_warning(sbi->s_super, __func__,
227 "barrier-based sync failed. "
228 "disabling barriers\n");
229 nilfs_clear_opt(sbi, BARRIER);
230 barrier_done = 0;
231 clear_buffer_ordered(nilfs->ns_sbh[0]);
232 goto retry;
233 }
234 if (unlikely(err)) {
235 printk(KERN_ERR
236 "NILFS: unable to write superblock (err=%d)\n", err);
237 if (err == -EIO && nilfs->ns_sbh[1]) {
238 nilfs_fall_back_super_block(nilfs);
239 goto retry;
240 }
241 } else {
242 struct nilfs_super_block *sbp = nilfs->ns_sbp[0];
243
244 /*
245 * The latest segment becomes traceable from the position
246 * recorded in the superblock.
247 */
248 clear_nilfs_discontinued(nilfs);
249
250 /* update GC protection for recent segments */
251 if (nilfs->ns_sbh[1]) {
252 sbp = NULL;
253 if (dupsb) {
254 set_buffer_dirty(nilfs->ns_sbh[1]);
255 if (!sync_dirty_buffer(nilfs->ns_sbh[1]))
256 sbp = nilfs->ns_sbp[1];
257 }
258 }
259 if (sbp) {
260 spin_lock(&nilfs->ns_last_segment_lock);
261 nilfs->ns_prot_seq = le64_to_cpu(sbp->s_last_seq);
262 spin_unlock(&nilfs->ns_last_segment_lock);
263 }
264 }
265
266 return err;
267}
268
269int nilfs_commit_super(struct nilfs_sb_info *sbi, int dupsb)
270{
271 struct the_nilfs *nilfs = sbi->s_nilfs;
272 struct nilfs_super_block **sbp = nilfs->ns_sbp;
273 sector_t nfreeblocks;
274 time_t t;
275 int err;
276
277 /* nilfs->sem must be locked by the caller. */
278 if (sbp[0]->s_magic != NILFS_SUPER_MAGIC) {
279 if (sbp[1] && sbp[1]->s_magic == NILFS_SUPER_MAGIC)
280 nilfs_swap_super_block(nilfs);
281 else {
282 printk(KERN_CRIT "NILFS: superblock broke on dev %s\n",
283 sbi->s_super->s_id);
284 return -EIO;
285 }
286 }
287 err = nilfs_count_free_blocks(nilfs, &nfreeblocks);
288 if (unlikely(err)) {
289 printk(KERN_ERR "NILFS: failed to count free blocks\n");
290 return err;
291 }
292 spin_lock(&nilfs->ns_last_segment_lock);
293 sbp[0]->s_last_seq = cpu_to_le64(nilfs->ns_last_seq);
294 sbp[0]->s_last_pseg = cpu_to_le64(nilfs->ns_last_pseg);
295 sbp[0]->s_last_cno = cpu_to_le64(nilfs->ns_last_cno);
296 spin_unlock(&nilfs->ns_last_segment_lock);
297
298 t = get_seconds();
299 nilfs->ns_sbwtime[0] = t;
300 sbp[0]->s_free_blocks_count = cpu_to_le64(nfreeblocks);
301 sbp[0]->s_wtime = cpu_to_le64(t);
302 sbp[0]->s_sum = 0;
303 sbp[0]->s_sum = cpu_to_le32(crc32_le(nilfs->ns_crc_seed,
304 (unsigned char *)sbp[0],
305 nilfs->ns_sbsize));
306 if (dupsb && sbp[1]) {
307 memcpy(sbp[1], sbp[0], nilfs->ns_sbsize);
308 nilfs->ns_sbwtime[1] = t;
309 }
310 sbi->s_super->s_dirt = 0;
311 return nilfs_sync_super(sbi, dupsb);
312}
313
314static void nilfs_put_super(struct super_block *sb)
315{
316 struct nilfs_sb_info *sbi = NILFS_SB(sb);
317 struct the_nilfs *nilfs = sbi->s_nilfs;
318
319 nilfs_detach_segment_constructor(sbi);
320
321 if (!(sb->s_flags & MS_RDONLY)) {
322 down_write(&nilfs->ns_sem);
323 nilfs->ns_sbp[0]->s_state = cpu_to_le16(nilfs->ns_mount_state);
324 nilfs_commit_super(sbi, 1);
325 up_write(&nilfs->ns_sem);
326 }
327
328 nilfs_detach_checkpoint(sbi);
329 put_nilfs(sbi->s_nilfs);
330 sbi->s_super = NULL;
331 sb->s_fs_info = NULL;
332 kfree(sbi);
333}
334
335/**
336 * nilfs_write_super - write super block(s) of NILFS
337 * @sb: super_block
338 *
339 * nilfs_write_super() gets a fs-dependent lock, writes super block(s), and
340 * clears s_dirt. This function is called in the section protected by
341 * lock_super().
342 *
343 * The s_dirt flag is managed by each filesystem and we protect it by ns_sem
344 * of the struct the_nilfs. Lock order must be as follows:
345 *
346 * 1. lock_super()
347 * 2. down_write(&nilfs->ns_sem)
348 *
349 * Inside NILFS, locking ns_sem is enough to protect s_dirt and the buffer
350 * of the super block (nilfs->ns_sbp[]).
351 *
352 * In most cases, VFS functions call lock_super() before calling these
353 * methods, so we must be careful not to introduce deadlocks when using
354 * lock_super(); see generic_shutdown_super(), write_super(), and so on.
355 *
356 * Note that the order of lock_kernel() and lock_super() depends on the
357 * VFS context. Also note that lock_kernel() may be nested inside its own
358 * protected section; only the outermost acquisition takes effect.
359 */
360static void nilfs_write_super(struct super_block *sb)
361{
362 struct nilfs_sb_info *sbi = NILFS_SB(sb);
363 struct the_nilfs *nilfs = sbi->s_nilfs;
364
365 down_write(&nilfs->ns_sem);
366 if (!(sb->s_flags & MS_RDONLY)) {
367 struct nilfs_super_block **sbp = nilfs->ns_sbp;
368 u64 t = get_seconds();
369 int dupsb;
370
371 if (!nilfs_discontinued(nilfs) && t >= nilfs->ns_sbwtime[0] &&
372 t < nilfs->ns_sbwtime[0] + NILFS_SB_FREQ) {
373 up_write(&nilfs->ns_sem);
374 return;
375 }
376 dupsb = sbp[1] && t > nilfs->ns_sbwtime[1] + NILFS_ALTSB_FREQ;
377 nilfs_commit_super(sbi, dupsb);
378 }
379 sb->s_dirt = 0;
380 up_write(&nilfs->ns_sem);
381}
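A minimal sketch of the lock nesting prescribed above, for a hypothetical out-of-band superblock writer, assuming the lock_super()/unlock_super() interface of this kernel generation:

	/* sketch: 1. VFS superblock lock, then 2. ns_sem; never the reverse */
	lock_super(sb);
	down_write(&nilfs->ns_sem);
	if (sb->s_dirt)
		nilfs_commit_super(sbi, 0);	/* dupsb=0: primary sb only */
	up_write(&nilfs->ns_sem);
	unlock_super(sb);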
382
383static int nilfs_sync_fs(struct super_block *sb, int wait)
384{
385 int err = 0;
386
387 /* This function is called when super block should be written back */
388 if (wait)
389 err = nilfs_construct_segment(sb);
390 return err;
391}
392
393int nilfs_attach_checkpoint(struct nilfs_sb_info *sbi, __u64 cno)
394{
395 struct the_nilfs *nilfs = sbi->s_nilfs;
396 struct nilfs_checkpoint *raw_cp;
397 struct buffer_head *bh_cp;
398 int err;
399
400 down_write(&nilfs->ns_sem);
401 list_add(&sbi->s_list, &nilfs->ns_supers);
402 up_write(&nilfs->ns_sem);
403
404 sbi->s_ifile = nilfs_mdt_new(
405 nilfs, sbi->s_super, NILFS_IFILE_INO, NILFS_IFILE_GFP);
406 if (!sbi->s_ifile)
407 return -ENOMEM;
408
409 err = nilfs_palloc_init_blockgroup(sbi->s_ifile, nilfs->ns_inode_size);
410 if (unlikely(err))
411 goto failed;
412
413 err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, cno, 0, &raw_cp,
414 &bh_cp);
415 if (unlikely(err)) {
416 if (err == -ENOENT || err == -EINVAL) {
417 printk(KERN_ERR
418 "NILFS: Invalid checkpoint "
419 "(checkpoint number=%llu)\n",
420 (unsigned long long)cno);
421 err = -EINVAL;
422 }
423 goto failed;
424 }
425 err = nilfs_read_inode_common(sbi->s_ifile, &raw_cp->cp_ifile_inode);
426 if (unlikely(err))
427 goto failed_bh;
428 atomic_set(&sbi->s_inodes_count, le64_to_cpu(raw_cp->cp_inodes_count));
429 atomic_set(&sbi->s_blocks_count, le64_to_cpu(raw_cp->cp_blocks_count));
430
431 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
432 return 0;
433
434 failed_bh:
435 nilfs_cpfile_put_checkpoint(nilfs->ns_cpfile, cno, bh_cp);
436 failed:
437 nilfs_mdt_destroy(sbi->s_ifile);
438 sbi->s_ifile = NULL;
439
440 down_write(&nilfs->ns_sem);
441 list_del_init(&sbi->s_list);
442 up_write(&nilfs->ns_sem);
443
444 return err;
445}
446
447void nilfs_detach_checkpoint(struct nilfs_sb_info *sbi)
448{
449 struct the_nilfs *nilfs = sbi->s_nilfs;
450
451 nilfs_mdt_clear(sbi->s_ifile);
452 nilfs_mdt_destroy(sbi->s_ifile);
453 sbi->s_ifile = NULL;
454 down_write(&nilfs->ns_sem);
455 list_del_init(&sbi->s_list);
456 up_write(&nilfs->ns_sem);
457}
458
459static int nilfs_mark_recovery_complete(struct nilfs_sb_info *sbi)
460{
461 struct the_nilfs *nilfs = sbi->s_nilfs;
462 int err = 0;
463
464 down_write(&nilfs->ns_sem);
465 if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) {
466 nilfs->ns_mount_state |= NILFS_VALID_FS;
467 err = nilfs_commit_super(sbi, 1);
468 if (likely(!err))
469 printk(KERN_INFO "NILFS: recovery complete.\n");
470 }
471 up_write(&nilfs->ns_sem);
472 return err;
473}
474
475static int nilfs_statfs(struct dentry *dentry, struct kstatfs *buf)
476{
477 struct super_block *sb = dentry->d_sb;
478 struct nilfs_sb_info *sbi = NILFS_SB(sb);
479 unsigned long long blocks;
480 unsigned long overhead;
481 unsigned long nrsvblocks;
482 sector_t nfreeblocks;
483 struct the_nilfs *nilfs = sbi->s_nilfs;
484 int err;
485
486 /*
487 * Compute all of the segment blocks
488 *
489 * The blocks before first segment and after last segment
490 * are excluded.
491 */
492 blocks = nilfs->ns_blocks_per_segment * nilfs->ns_nsegments
493 - nilfs->ns_first_data_block;
494 nrsvblocks = nilfs->ns_nrsvsegs * nilfs->ns_blocks_per_segment;
495
496 /*
497 * Compute the overhead
498 *
499 * When metadata blocks are distributed outside the segment
500 * structure, we must count them as overhead.
501 */
502 overhead = 0;
503
504 err = nilfs_count_free_blocks(nilfs, &nfreeblocks);
505 if (unlikely(err))
506 return err;
507
508 buf->f_type = NILFS_SUPER_MAGIC;
509 buf->f_bsize = sb->s_blocksize;
510 buf->f_blocks = blocks - overhead;
511 buf->f_bfree = nfreeblocks;
512 buf->f_bavail = (buf->f_bfree >= nrsvblocks) ?
513 (buf->f_bfree - nrsvblocks) : 0;
514 buf->f_files = atomic_read(&sbi->s_inodes_count);
515 buf->f_ffree = 0; /* nilfs_count_free_inodes(sb); */
516 buf->f_namelen = NILFS_NAME_LEN;
517 return 0;
518}
519
520static struct super_operations nilfs_sops = {
521 .alloc_inode = nilfs_alloc_inode,
522 .destroy_inode = nilfs_destroy_inode,
523 .dirty_inode = nilfs_dirty_inode,
524 /* .write_inode = nilfs_write_inode, */
525 /* .put_inode = nilfs_put_inode, */
526 /* .drop_inode = nilfs_drop_inode, */
527 .delete_inode = nilfs_delete_inode,
528 .put_super = nilfs_put_super,
529 .write_super = nilfs_write_super,
530 .sync_fs = nilfs_sync_fs,
531 /* .write_super_lockfs */
532 /* .unlockfs */
533 .statfs = nilfs_statfs,
534 .remount_fs = nilfs_remount,
535 .clear_inode = nilfs_clear_inode,
536 /* .umount_begin */
537 /* .show_options */
538};
539
540static struct inode *
541nilfs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation)
542{
543 struct inode *inode;
544
545 if (ino < NILFS_FIRST_INO(sb) && ino != NILFS_ROOT_INO &&
546 ino != NILFS_SKETCH_INO)
547 return ERR_PTR(-ESTALE);
548
549 inode = nilfs_iget(sb, ino);
550 if (IS_ERR(inode))
551 return ERR_CAST(inode);
552 if (generation && inode->i_generation != generation) {
553 iput(inode);
554 return ERR_PTR(-ESTALE);
555 }
556
557 return inode;
558}
559
560static struct dentry *
561nilfs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len,
562 int fh_type)
563{
564 return generic_fh_to_dentry(sb, fid, fh_len, fh_type,
565 nilfs_nfs_get_inode);
566}
567
568static struct dentry *
569nilfs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len,
570 int fh_type)
571{
572 return generic_fh_to_parent(sb, fid, fh_len, fh_type,
573 nilfs_nfs_get_inode);
574}
575
576static struct export_operations nilfs_export_ops = {
577 .fh_to_dentry = nilfs_fh_to_dentry,
578 .fh_to_parent = nilfs_fh_to_parent,
579 .get_parent = nilfs_get_parent,
580};
581
582enum {
583 Opt_err_cont, Opt_err_panic, Opt_err_ro,
584 Opt_barrier, Opt_snapshot, Opt_order,
585 Opt_err,
586};
587
588static match_table_t tokens = {
589 {Opt_err_cont, "errors=continue"},
590 {Opt_err_panic, "errors=panic"},
591 {Opt_err_ro, "errors=remount-ro"},
592 {Opt_barrier, "barrier=%s"},
593 {Opt_snapshot, "cp=%u"},
594 {Opt_order, "order=%s"},
595 {Opt_err, NULL}
596};
597
598static int match_bool(substring_t *s, int *result)
599{
600 int len = s->to - s->from;
601
602 if (strncmp(s->from, "on", len) == 0)
603 *result = 1;
604 else if (strncmp(s->from, "off", len) == 0)
605 *result = 0;
606 else
607 return 1;
608 return 0;
609}
610
611static int parse_options(char *options, struct super_block *sb)
612{
613 struct nilfs_sb_info *sbi = NILFS_SB(sb);
614 char *p;
615 substring_t args[MAX_OPT_ARGS];
616 int option;
617
618 if (!options)
619 return 1;
620
621 while ((p = strsep(&options, ",")) != NULL) {
622 int token;
623 if (!*p)
624 continue;
625
626 token = match_token(p, tokens, args);
627 switch (token) {
628 case Opt_barrier:
629 if (match_bool(&args[0], &option))
630 return 0;
631 if (option)
632 nilfs_set_opt(sbi, BARRIER);
633 else
634 nilfs_clear_opt(sbi, BARRIER);
635 break;
636 case Opt_order:
637 if (strcmp(args[0].from, "relaxed") == 0)
638 /* Ordered data semantics */
639 nilfs_clear_opt(sbi, STRICT_ORDER);
640 else if (strcmp(args[0].from, "strict") == 0)
641 /* Strict in-order semantics */
642 nilfs_set_opt(sbi, STRICT_ORDER);
643 else
644 return 0;
645 break;
646 case Opt_err_panic:
647 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_PANIC);
648 break;
649 case Opt_err_ro:
650 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_RO);
651 break;
652 case Opt_err_cont:
653 nilfs_write_opt(sbi, ERROR_MODE, ERRORS_CONT);
654 break;
655 case Opt_snapshot:
656 if (match_int(&args[0], &option) || option <= 0)
657 return 0;
658 if (!(sb->s_flags & MS_RDONLY))
659 return 0;
660 sbi->s_snapshot_cno = option;
661 nilfs_set_opt(sbi, SNAPSHOT);
662 break;
663 default:
664 printk(KERN_ERR
665 "NILFS: Unrecognized mount option \"%s\"\n", p);
666 return 0;
667 }
668 }
669 return 1;
670}
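Illustrative option strings the parser above accepts (example values, not defaults):

	/*
	 * "errors=remount-ro"  -> nilfs_write_opt(sbi, ERROR_MODE, ERRORS_RO)
	 * "barrier=off"        -> nilfs_clear_opt(sbi, BARRIER)
	 * "order=strict"       -> nilfs_set_opt(sbi, STRICT_ORDER)
	 * "cp=17"              -> snapshot mount of checkpoint 17
	 *                         (rejected unless the mount is read-only)
	 */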
671
672static inline void
673nilfs_set_default_options(struct nilfs_sb_info *sbi,
674 struct nilfs_super_block *sbp)
675{
676 sbi->s_mount_opt =
677 NILFS_MOUNT_ERRORS_CONT | NILFS_MOUNT_BARRIER;
678}
679
680static int nilfs_setup_super(struct nilfs_sb_info *sbi)
681{
682 struct the_nilfs *nilfs = sbi->s_nilfs;
683 struct nilfs_super_block *sbp = nilfs->ns_sbp[0];
684 int max_mnt_count = le16_to_cpu(sbp->s_max_mnt_count);
685 int mnt_count = le16_to_cpu(sbp->s_mnt_count);
686
687 /* nilfs->sem must be locked by the caller. */
688 if (!(nilfs->ns_mount_state & NILFS_VALID_FS)) {
689 printk(KERN_WARNING "NILFS warning: mounting unchecked fs\n");
690 } else if (nilfs->ns_mount_state & NILFS_ERROR_FS) {
691 printk(KERN_WARNING
692 "NILFS warning: mounting fs with errors\n");
693#if 0
694 } else if (max_mnt_count >= 0 && mnt_count >= max_mnt_count) {
695 printk(KERN_WARNING
696 "NILFS warning: maximal mount count reached\n");
697#endif
698 }
699 if (!max_mnt_count)
700 sbp->s_max_mnt_count = cpu_to_le16(NILFS_DFL_MAX_MNT_COUNT);
701
702 sbp->s_mnt_count = cpu_to_le16(mnt_count + 1);
703 sbp->s_state = cpu_to_le16(le16_to_cpu(sbp->s_state) & ~NILFS_VALID_FS);
704 sbp->s_mtime = cpu_to_le64(get_seconds());
705 return nilfs_commit_super(sbi, 1);
706}
707
708struct nilfs_super_block *nilfs_read_super_block(struct super_block *sb,
709 u64 pos, int blocksize,
710 struct buffer_head **pbh)
711{
712 unsigned long long sb_index = pos;
713 unsigned long offset;
714
715 offset = do_div(sb_index, blocksize);
716 *pbh = sb_bread(sb, sb_index);
717 if (!*pbh)
718 return NULL;
719 return (struct nilfs_super_block *)((char *)(*pbh)->b_data + offset);
720}
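A worked example of the do_div() split above, assuming the primary superblock offset NILFS_SB_OFFSET_BYTES is 1024:

	/*
	 * pos = 1024, blocksize = 4096 -> sb_index = 0, offset = 1024
	 *	(the superblock sits inside the first device block)
	 * pos = 1024, blocksize = 1024 -> sb_index = 1, offset = 0
	 *	(the superblock occupies the second 1 KiB block)
	 */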
721
722int nilfs_store_magic_and_option(struct super_block *sb,
723 struct nilfs_super_block *sbp,
724 char *data)
725{
726 struct nilfs_sb_info *sbi = NILFS_SB(sb);
727
728 sb->s_magic = le16_to_cpu(sbp->s_magic);
729
730 /* FS independent flags */
731#ifdef NILFS_ATIME_DISABLE
732 sb->s_flags |= MS_NOATIME;
733#endif
734
735 nilfs_set_default_options(sbi, sbp);
736
737 sbi->s_resuid = le16_to_cpu(sbp->s_def_resuid);
738 sbi->s_resgid = le16_to_cpu(sbp->s_def_resgid);
739 sbi->s_interval = le32_to_cpu(sbp->s_c_interval);
740 sbi->s_watermark = le32_to_cpu(sbp->s_c_block_max);
741
742 return !parse_options(data, sb) ? -EINVAL : 0 ;
743}
744
745/**
746 * nilfs_fill_super() - initialize a super block instance
747 * @sb: super_block
748 * @data: mount options
749 * @silent: silent mode flag
750 * @nilfs: the_nilfs struct
751 *
752 * This function is called while holding bd_mount_sem exclusively,
753 * so the recovery process is protected from other simultaneous mounts.
754 */
755static int
756nilfs_fill_super(struct super_block *sb, void *data, int silent,
757 struct the_nilfs *nilfs)
758{
759 struct nilfs_sb_info *sbi;
760 struct inode *root;
761 __u64 cno;
762 int err;
763
764 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
765 if (!sbi)
766 return -ENOMEM;
767
768 sb->s_fs_info = sbi;
769
770 get_nilfs(nilfs);
771 sbi->s_nilfs = nilfs;
772 sbi->s_super = sb;
773
774 err = init_nilfs(nilfs, sbi, (char *)data);
775 if (err)
776 goto failed_sbi;
777
778 spin_lock_init(&sbi->s_inode_lock);
779 INIT_LIST_HEAD(&sbi->s_dirty_files);
780 INIT_LIST_HEAD(&sbi->s_list);
781
782 /*
783 * The following initialization is redundant because the
784 * nilfs_sb_info structure was zeroed at allocation. We keep it
785 * explicit for readability and to stay ready for future
786 * changes.
787 */
788 get_random_bytes(&sbi->s_next_generation,
789 sizeof(sbi->s_next_generation));
790 spin_lock_init(&sbi->s_next_gen_lock);
791
792 sb->s_op = &nilfs_sops;
793 sb->s_export_op = &nilfs_export_ops;
794 sb->s_root = NULL;
795 sb->s_time_gran = 1;
796
797 if (!nilfs_loaded(nilfs)) {
798 err = load_nilfs(nilfs, sbi);
799 if (err)
800 goto failed_sbi;
801 }
802 cno = nilfs_last_cno(nilfs);
803
804 if (sb->s_flags & MS_RDONLY) {
805 if (nilfs_test_opt(sbi, SNAPSHOT)) {
806 err = nilfs_cpfile_is_snapshot(nilfs->ns_cpfile,
807 sbi->s_snapshot_cno);
808 if (err < 0)
809 goto failed_sbi;
810 if (!err) {
811 printk(KERN_ERR
812 "NILFS: The specified checkpoint is "
813 "not a snapshot "
814 "(checkpoint number=%llu).\n",
815 (unsigned long long)sbi->s_snapshot_cno);
816 err = -EINVAL;
817 goto failed_sbi;
818 }
819 cno = sbi->s_snapshot_cno;
820 } else
821 /* Read-only mount */
822 sbi->s_snapshot_cno = cno;
823 }
824
825 err = nilfs_attach_checkpoint(sbi, cno);
826 if (err) {
827 printk(KERN_ERR "NILFS: error loading a checkpoint"
828 " (checkpoint number=%llu).\n", (unsigned long long)cno);
829 goto failed_sbi;
830 }
831
832 if (!(sb->s_flags & MS_RDONLY)) {
833 err = nilfs_attach_segment_constructor(sbi);
834 if (err)
835 goto failed_checkpoint;
836 }
837
838 root = nilfs_iget(sb, NILFS_ROOT_INO);
839 if (IS_ERR(root)) {
840 printk(KERN_ERR "NILFS: get root inode failed\n");
841 err = PTR_ERR(root);
842 goto failed_segctor;
843 }
844 if (!S_ISDIR(root->i_mode) || !root->i_blocks || !root->i_size) {
845 iput(root);
846 printk(KERN_ERR "NILFS: corrupt root inode.\n");
847 err = -EINVAL;
848 goto failed_segctor;
849 }
850 sb->s_root = d_alloc_root(root);
851 if (!sb->s_root) {
852 iput(root);
853 printk(KERN_ERR "NILFS: get root dentry failed\n");
854 err = -ENOMEM;
855 goto failed_segctor;
856 }
857
858 if (!(sb->s_flags & MS_RDONLY)) {
859 down_write(&nilfs->ns_sem);
860 nilfs_setup_super(sbi);
861 up_write(&nilfs->ns_sem);
862 }
863
864 err = nilfs_mark_recovery_complete(sbi);
865 if (unlikely(err)) {
866 printk(KERN_ERR "NILFS: recovery failed.\n");
867 goto failed_root;
868 }
869
870 return 0;
871
872 failed_root:
873 dput(sb->s_root);
874 sb->s_root = NULL;
875
876 failed_segctor:
877 nilfs_detach_segment_constructor(sbi);
878
879 failed_checkpoint:
880 nilfs_detach_checkpoint(sbi);
881
882 failed_sbi:
883 put_nilfs(nilfs);
884 sb->s_fs_info = NULL;
885 kfree(sbi);
886 return err;
887}
888
889static int nilfs_remount(struct super_block *sb, int *flags, char *data)
890{
891 struct nilfs_sb_info *sbi = NILFS_SB(sb);
892 struct nilfs_super_block *sbp;
893 struct the_nilfs *nilfs = sbi->s_nilfs;
894 unsigned long old_sb_flags;
895 struct nilfs_mount_options old_opts;
896 int err;
897
898 old_sb_flags = sb->s_flags;
899 old_opts.mount_opt = sbi->s_mount_opt;
900 old_opts.snapshot_cno = sbi->s_snapshot_cno;
901
902 if (!parse_options(data, sb)) {
903 err = -EINVAL;
904 goto restore_opts;
905 }
906 sb->s_flags = (sb->s_flags & ~MS_POSIXACL);
907
908 if ((*flags & MS_RDONLY) &&
909 sbi->s_snapshot_cno != old_opts.snapshot_cno) {
910 printk(KERN_WARNING "NILFS (device %s): couldn't "
911 "remount to a different snapshot. \n",
912 sb->s_id);
913 err = -EINVAL;
914 goto restore_opts;
915 }
916
917 if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY))
918 goto out;
919 if (*flags & MS_RDONLY) {
920 /* Shutting down the segment constructor */
921 nilfs_detach_segment_constructor(sbi);
922 sb->s_flags |= MS_RDONLY;
923
924 sbi->s_snapshot_cno = nilfs_last_cno(nilfs);
925 /* nilfs_set_opt(sbi, SNAPSHOT); */
926
927 /*
928 * Remounting a valid RW partition RDONLY, so set
929 * the RDONLY flag and then mark the partition as valid again.
930 */
931 down_write(&nilfs->ns_sem);
932 sbp = nilfs->ns_sbp[0];
933 if (!(sbp->s_state & cpu_to_le16(NILFS_VALID_FS)) &&
934 (nilfs->ns_mount_state & NILFS_VALID_FS))
935 sbp->s_state = cpu_to_le16(nilfs->ns_mount_state);
936 sbp->s_mtime = cpu_to_le64(get_seconds());
937 nilfs_commit_super(sbi, 1);
938 up_write(&nilfs->ns_sem);
939 } else {
940 /*
941 * Mounting a RDONLY partition read-write, so reread and
942 * store the current valid flag. (It may have been changed
943 * by fsck since we originally mounted the partition.)
944 */
945 down(&sb->s_bdev->bd_mount_sem);
946 /* Check existing RW-mount */
947 if (test_exclusive_mount(sb->s_type, sb->s_bdev, 0)) {
948 printk(KERN_WARNING "NILFS (device %s): couldn't "
949 "remount because a RW-mount exists.\n",
950 sb->s_id);
951 err = -EBUSY;
952 goto rw_remount_failed;
953 }
954 if (sbi->s_snapshot_cno != nilfs_last_cno(nilfs)) {
955 printk(KERN_WARNING "NILFS (device %s): couldn't "
956 "remount because the current RO-mount is not "
957 "the latest one.\n",
958 sb->s_id);
959 err = -EINVAL;
960 goto rw_remount_failed;
961 }
962 sb->s_flags &= ~MS_RDONLY;
963 nilfs_clear_opt(sbi, SNAPSHOT);
964 sbi->s_snapshot_cno = 0;
965
966 err = nilfs_attach_segment_constructor(sbi);
967 if (err)
968 goto rw_remount_failed;
969
970 down_write(&nilfs->ns_sem);
971 nilfs_setup_super(sbi);
972 up_write(&nilfs->ns_sem);
973
974 up(&sb->s_bdev->bd_mount_sem);
975 }
976 out:
977 return 0;
978
979 rw_remount_failed:
980 up(&sb->s_bdev->bd_mount_sem);
981 restore_opts:
982 sb->s_flags = old_sb_flags;
983 sbi->s_mount_opt = old_opts.mount_opt;
984 sbi->s_snapshot_cno = old_opts.snapshot_cno;
985 return err;
986}
987
988struct nilfs_super_data {
989 struct block_device *bdev;
990 __u64 cno;
991 int flags;
992};
993
994/**
995 * nilfs_identify - pre-read mount options needed to identify mount instance
996 * @data: mount options
997 * @sd: nilfs_super_data
998 */
999static int nilfs_identify(char *data, struct nilfs_super_data *sd)
1000{
1001 char *p, *options = data;
1002 substring_t args[MAX_OPT_ARGS];
1003 int option, token;
1004 int ret = 0;
1005
1006 do {
1007 p = strsep(&options, ",");
1008 if (p != NULL && *p) {
1009 token = match_token(p, tokens, args);
1010 if (token == Opt_snapshot) {
1011 if (!(sd->flags & MS_RDONLY))
1012 ret++;
1013 else {
1014 ret = match_int(&args[0], &option);
1015 if (!ret) {
1016 if (option > 0)
1017 sd->cno = option;
1018 else
1019 ret++;
1020 }
1021 }
1022 }
1023 if (ret)
1024 printk(KERN_ERR
1025 "NILFS: invalid mount option: %s\n", p);
1026 }
1027 if (!options)
1028 break;
1029 BUG_ON(options == data);
1030 *(options - 1) = ',';
1031 } while (!ret);
1032 return ret;
1033}
1034
1035static int nilfs_set_bdev_super(struct super_block *s, void *data)
1036{
1037 struct nilfs_super_data *sd = data;
1038
1039 s->s_bdev = sd->bdev;
1040 s->s_dev = s->s_bdev->bd_dev;
1041 return 0;
1042}
1043
1044static int nilfs_test_bdev_super(struct super_block *s, void *data)
1045{
1046 struct nilfs_super_data *sd = data;
1047
1048 return s->s_bdev == sd->bdev;
1049}
1050
1051static int nilfs_test_bdev_super2(struct super_block *s, void *data)
1052{
1053 struct nilfs_super_data *sd = data;
1054 int ret;
1055
1056 if (s->s_bdev != sd->bdev)
1057 return 0;
1058
1059 if (!((s->s_flags | sd->flags) & MS_RDONLY))
1060 return 1; /* Reuse an old R/W-mode super_block */
1061
1062 if (s->s_flags & sd->flags & MS_RDONLY) {
1063 if (down_read_trylock(&s->s_umount)) {
1064 ret = s->s_root &&
1065 (sd->cno == NILFS_SB(s)->s_snapshot_cno);
1066 up_read(&s->s_umount);
1067 /*
1068 * This path is locked with sb_lock by sget().
1069 * So, drop_super() causes deadlock.
1070 */
1071 return ret;
1072 }
1073 }
1074 return 0;
1075}
1076
1077static int
1078nilfs_get_sb(struct file_system_type *fs_type, int flags,
1079 const char *dev_name, void *data, struct vfsmount *mnt)
1080{
1081 struct nilfs_super_data sd;
1082 struct super_block *s, *s2;
1083 struct the_nilfs *nilfs = NULL;
1084 int err, need_to_close = 1;
1085
1086 sd.bdev = open_bdev_exclusive(dev_name, flags, fs_type);
1087 if (IS_ERR(sd.bdev))
1088 return PTR_ERR(sd.bdev);
1089
1090 /*
1091 * To get mount instance using sget() vfs-routine, NILFS needs
1092 * much more information than normal filesystems to identify mount
1093 * instance. For snapshot mounts, not only a mount type (ro-mount
1094 * or rw-mount) but also a checkpoint number is required.
1095 * The results are passed in sget() using nilfs_super_data.
1096 */
1097 sd.cno = 0;
1098 sd.flags = flags;
1099 if (nilfs_identify((char *)data, &sd)) {
1100 err = -EINVAL;
1101 goto failed;
1102 }
1103
1104 /*
1105 * once the super is inserted into the list by sget, s_umount
1106 * will protect the lockfs code from trying to start a snapshot
1107 * while we are mounting
1108 */
1109 down(&sd.bdev->bd_mount_sem);
1110 if (!sd.cno &&
1111 (err = test_exclusive_mount(fs_type, sd.bdev, flags ^ MS_RDONLY))) {
1112 err = (err < 0) ? : -EBUSY;
1113 goto failed_unlock;
1114 }
1115
1116 /*
1117 * Phase-1: search any existent instance and get the_nilfs
1118 */
1119 s = sget(fs_type, nilfs_test_bdev_super, nilfs_set_bdev_super, &sd);
1120 if (IS_ERR(s))
1121 goto error_s;
1122
1123 if (!s->s_root) {
1124 err = -ENOMEM;
1125 nilfs = alloc_nilfs(sd.bdev);
1126 if (!nilfs)
1127 goto cancel_new;
1128 } else {
1129 struct nilfs_sb_info *sbi = NILFS_SB(s);
1130
1131 /*
1132 * s_umount protects super_block from unmount process;
1133 * It covers pointers of nilfs_sb_info and the_nilfs.
1134 */
1135 nilfs = sbi->s_nilfs;
1136 get_nilfs(nilfs);
1137 up_write(&s->s_umount);
1138
1139 /*
1140 * Phase-2: search specified snapshot or R/W mode super_block
1141 */
1142 if (!sd.cno)
1143 /* trying to get the latest checkpoint. */
1144 sd.cno = nilfs_last_cno(nilfs);
1145
1146 s2 = sget(fs_type, nilfs_test_bdev_super2,
1147 nilfs_set_bdev_super, &sd);
1148 deactivate_super(s);
1149 /*
1150 * deactivate_super() invokes close_bdev_exclusive() via
1151 * kill_block_super(), but s here is an existent mount, so we
1152 * need one more close_bdev_exclusive() call.
1153 */
1154 s = s2;
1155 if (IS_ERR(s))
1156 goto error_s;
1157 }
1158
1159 if (!s->s_root) {
1160 char b[BDEVNAME_SIZE];
1161
1162 s->s_flags = flags;
1163 strlcpy(s->s_id, bdevname(sd.bdev, b), sizeof(s->s_id));
1164 sb_set_blocksize(s, block_size(sd.bdev));
1165
1166 err = nilfs_fill_super(s, data, flags & MS_VERBOSE, nilfs);
1167 if (err)
1168 goto cancel_new;
1169
1170 s->s_flags |= MS_ACTIVE;
1171 need_to_close = 0;
1172 } else if (!(s->s_flags & MS_RDONLY)) {
1173 err = -EBUSY;
1174 }
1175
1176 up(&sd.bdev->bd_mount_sem);
1177 put_nilfs(nilfs);
1178 if (need_to_close)
1179 close_bdev_exclusive(sd.bdev, flags);
1180 simple_set_mnt(mnt, s);
1181 return 0;
1182
1183 error_s:
1184 up(&sd.bdev->bd_mount_sem);
1185 if (nilfs)
1186 put_nilfs(nilfs);
1187 close_bdev_exclusive(sd.bdev, flags);
1188 return PTR_ERR(s);
1189
1190 failed_unlock:
1191 up(&sd.bdev->bd_mount_sem);
1192 failed:
1193 close_bdev_exclusive(sd.bdev, flags);
1194
1195 return err;
1196
1197 cancel_new:
1198 /* Abandoning the newly allocated superblock */
1199 up(&sd.bdev->bd_mount_sem);
1200 if (nilfs)
1201 put_nilfs(nilfs);
1202 up_write(&s->s_umount);
1203 deactivate_super(s);
1204 /*
1205 * deactivate_super() invokes close_bdev_exclusive().
1206 * We must finish all post-cleaning before this call;
1207 * put_nilfs() and unlocking bd_mount_sem need the block device.
1208 */
1209 return err;
1210}
1211
1212static int nilfs_test_bdev_super3(struct super_block *s, void *data)
1213{
1214 struct nilfs_super_data *sd = data;
1215 int ret;
1216
1217 if (s->s_bdev != sd->bdev)
1218 return 0;
1219 if (down_read_trylock(&s->s_umount)) {
1220 ret = (s->s_flags & MS_RDONLY) && s->s_root &&
1221 nilfs_test_opt(NILFS_SB(s), SNAPSHOT);
1222 up_read(&s->s_umount);
1223 if (ret)
1224 return 0; /* ignore snapshot mounts */
1225 }
1226 return !((sd->flags ^ s->s_flags) & MS_RDONLY);
1227}
1228
1229static int __false_bdev_super(struct super_block *s, void *data)
1230{
1231#if 0 /* XXX: workaround for lock debug. This is not good idea */
1232 up_write(&s->s_umount);
1233#endif
1234 return -EFAULT;
1235}
1236
1237/**
1238 * test_exclusive_mount - check whether an exclusive RW/RO mount exists or not.
1239 * @fs_type: filesystem type
1240 * @bdev: block device
1241 * @flags: 0 (check rw-mount) or MS_RDONLY (check ro-mount)
1242 *
1243 * This function must be called within a section protected by
1244 * bd_mount_sem.
1245 */
1246static int test_exclusive_mount(struct file_system_type *fs_type,
1247 struct block_device *bdev, int flags)
1248{
1249 struct super_block *s;
1250 struct nilfs_super_data sd = { .flags = flags, .bdev = bdev };
1251
1252 s = sget(fs_type, nilfs_test_bdev_super3, __false_bdev_super, &sd);
1253 if (IS_ERR(s)) {
1254 if (PTR_ERR(s) != -EFAULT)
1255 return PTR_ERR(s);
1256 return 0; /* Not found */
1257 }
1258 up_write(&s->s_umount);
1259 deactivate_super(s);
1260 return 1; /* Found */
1261}
1262
1263struct file_system_type nilfs_fs_type = {
1264 .owner = THIS_MODULE,
1265 .name = "nilfs2",
1266 .get_sb = nilfs_get_sb,
1267 .kill_sb = kill_block_super,
1268 .fs_flags = FS_REQUIRES_DEV,
1269};
1270
1271static int __init init_nilfs_fs(void)
1272{
1273 int err;
1274
1275 err = nilfs_init_inode_cache();
1276 if (err)
1277 goto failed;
1278
1279 err = nilfs_init_transaction_cache();
1280 if (err)
1281 goto failed_inode_cache;
1282
1283 err = nilfs_init_segbuf_cache();
1284 if (err)
1285 goto failed_transaction_cache;
1286
1287 err = nilfs_btree_path_cache_init();
1288 if (err)
1289 goto failed_segbuf_cache;
1290
1291 err = register_filesystem(&nilfs_fs_type);
1292 if (err)
1293 goto failed_btree_path_cache;
1294
1295 return 0;
1296
1297 failed_btree_path_cache:
1298 nilfs_btree_path_cache_destroy();
1299
1300 failed_segbuf_cache:
1301 nilfs_destroy_segbuf_cache();
1302
1303 failed_transaction_cache:
1304 nilfs_destroy_transaction_cache();
1305
1306 failed_inode_cache:
1307 nilfs_destroy_inode_cache();
1308
1309 failed:
1310 return err;
1311}
1312
1313static void __exit exit_nilfs_fs(void)
1314{
1315 nilfs_destroy_segbuf_cache();
1316 nilfs_destroy_transaction_cache();
1317 nilfs_destroy_inode_cache();
1318 nilfs_btree_path_cache_destroy();
1319 unregister_filesystem(&nilfs_fs_type);
1320}
1321
1322module_init(init_nilfs_fs)
1323module_exit(exit_nilfs_fs)
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
new file mode 100644
index 000000000000..33400cf0bbe2
--- /dev/null
+++ b/fs/nilfs2/the_nilfs.c
@@ -0,0 +1,637 @@
1/*
2 * the_nilfs.c - the_nilfs shared structure.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#include <linux/buffer_head.h>
25#include <linux/slab.h>
26#include <linux/blkdev.h>
27#include <linux/backing-dev.h>
28#include <linux/crc32.h>
29#include "nilfs.h"
30#include "segment.h"
31#include "alloc.h"
32#include "cpfile.h"
33#include "sufile.h"
34#include "dat.h"
35#include "seglist.h"
36#include "segbuf.h"
37
38void nilfs_set_last_segment(struct the_nilfs *nilfs,
39 sector_t start_blocknr, u64 seq, __u64 cno)
40{
41 spin_lock(&nilfs->ns_last_segment_lock);
42 nilfs->ns_last_pseg = start_blocknr;
43 nilfs->ns_last_seq = seq;
44 nilfs->ns_last_cno = cno;
45 spin_unlock(&nilfs->ns_last_segment_lock);
46}
47
48/**
49 * alloc_nilfs - allocate the_nilfs structure
50 * @bdev: block device to which the_nilfs is related
51 *
52 * alloc_nilfs() allocates memory for the_nilfs and
53 * initializes its reference count and locks.
54 *
55 * Return Value: On success, pointer to the_nilfs is returned.
56 * On error, NULL is returned.
57 */
58struct the_nilfs *alloc_nilfs(struct block_device *bdev)
59{
60 struct the_nilfs *nilfs;
61
62 nilfs = kzalloc(sizeof(*nilfs), GFP_KERNEL);
63 if (!nilfs)
64 return NULL;
65
66 nilfs->ns_bdev = bdev;
67 atomic_set(&nilfs->ns_count, 1);
68 atomic_set(&nilfs->ns_writer_refcount, -1);
69 atomic_set(&nilfs->ns_ndirtyblks, 0);
70 init_rwsem(&nilfs->ns_sem);
71 mutex_init(&nilfs->ns_writer_mutex);
72 INIT_LIST_HEAD(&nilfs->ns_supers);
73 spin_lock_init(&nilfs->ns_last_segment_lock);
74 nilfs->ns_gc_inodes_h = NULL;
75 init_rwsem(&nilfs->ns_segctor_sem);
76
77 return nilfs;
78}
79
80/**
81 * put_nilfs - release a reference to the_nilfs
82 * @nilfs: the_nilfs structure to be released
83 *
84 * put_nilfs() decrements a reference counter of the_nilfs.
85 * If the reference count reaches zero, the_nilfs is freed.
86 */
87void put_nilfs(struct the_nilfs *nilfs)
88{
89 if (!atomic_dec_and_test(&nilfs->ns_count))
90 return;
91 /*
92 * Increment of ns_count never occurs below because a caller
93 * of get_nilfs() must already hold a reference to the_nilfs.
94 * Thus no exclusion control is required here.
95 */
96 might_sleep();
97 if (nilfs_loaded(nilfs)) {
98 nilfs_mdt_clear(nilfs->ns_sufile);
99 nilfs_mdt_destroy(nilfs->ns_sufile);
100 nilfs_mdt_clear(nilfs->ns_cpfile);
101 nilfs_mdt_destroy(nilfs->ns_cpfile);
102 nilfs_mdt_clear(nilfs->ns_dat);
103 nilfs_mdt_destroy(nilfs->ns_dat);
104 /* XXX: how and when to clear nilfs->ns_gc_dat? */
105 nilfs_mdt_destroy(nilfs->ns_gc_dat);
106 }
107 if (nilfs_init(nilfs)) {
108 nilfs_destroy_gccache(nilfs);
109 brelse(nilfs->ns_sbh[0]);
110 brelse(nilfs->ns_sbh[1]);
111 }
112 kfree(nilfs);
113}
114
115static int nilfs_load_super_root(struct the_nilfs *nilfs,
116 struct nilfs_sb_info *sbi, sector_t sr_block)
117{
118 struct buffer_head *bh_sr;
119 struct nilfs_super_root *raw_sr;
120 struct nilfs_super_block **sbp = nilfs->ns_sbp;
121 unsigned dat_entry_size, segment_usage_size, checkpoint_size;
122 unsigned inode_size;
123 int err;
124
125 err = nilfs_read_super_root_block(sbi->s_super, sr_block, &bh_sr, 1);
126 if (unlikely(err))
127 return err;
128
129 down_read(&nilfs->ns_sem);
130 dat_entry_size = le16_to_cpu(sbp[0]->s_dat_entry_size);
131 checkpoint_size = le16_to_cpu(sbp[0]->s_checkpoint_size);
132 segment_usage_size = le16_to_cpu(sbp[0]->s_segment_usage_size);
133 up_read(&nilfs->ns_sem);
134
135 inode_size = nilfs->ns_inode_size;
136
137 err = -ENOMEM;
138 nilfs->ns_dat = nilfs_mdt_new(
139 nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
140 if (unlikely(!nilfs->ns_dat))
141 goto failed;
142
143 nilfs->ns_gc_dat = nilfs_mdt_new(
144 nilfs, NULL, NILFS_DAT_INO, NILFS_DAT_GFP);
145 if (unlikely(!nilfs->ns_gc_dat))
146 goto failed_dat;
147
148 nilfs->ns_cpfile = nilfs_mdt_new(
149 nilfs, NULL, NILFS_CPFILE_INO, NILFS_CPFILE_GFP);
150 if (unlikely(!nilfs->ns_cpfile))
151 goto failed_gc_dat;
152
153 nilfs->ns_sufile = nilfs_mdt_new(
154 nilfs, NULL, NILFS_SUFILE_INO, NILFS_SUFILE_GFP);
155 if (unlikely(!nilfs->ns_sufile))
156 goto failed_cpfile;
157
158 err = nilfs_palloc_init_blockgroup(nilfs->ns_dat, dat_entry_size);
159 if (unlikely(err))
160 goto failed_sufile;
161
162 err = nilfs_palloc_init_blockgroup(nilfs->ns_gc_dat, dat_entry_size);
163 if (unlikely(err))
164 goto failed_sufile;
165
166 nilfs_mdt_set_shadow(nilfs->ns_dat, nilfs->ns_gc_dat);
167 nilfs_mdt_set_entry_size(nilfs->ns_cpfile, checkpoint_size,
168 sizeof(struct nilfs_cpfile_header));
169 nilfs_mdt_set_entry_size(nilfs->ns_sufile, segment_usage_size,
170 sizeof(struct nilfs_sufile_header));
171
172 err = nilfs_mdt_read_inode_direct(
173 nilfs->ns_dat, bh_sr, NILFS_SR_DAT_OFFSET(inode_size));
174 if (unlikely(err))
175 goto failed_sufile;
176
177 err = nilfs_mdt_read_inode_direct(
178 nilfs->ns_cpfile, bh_sr, NILFS_SR_CPFILE_OFFSET(inode_size));
179 if (unlikely(err))
180 goto failed_sufile;
181
182 err = nilfs_mdt_read_inode_direct(
183 nilfs->ns_sufile, bh_sr, NILFS_SR_SUFILE_OFFSET(inode_size));
184 if (unlikely(err))
185 goto failed_sufile;
186
187 raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
188 nilfs->ns_nongc_ctime = le64_to_cpu(raw_sr->sr_nongc_ctime);
189
190 failed:
191 brelse(bh_sr);
192 return err;
193
194 failed_sufile:
195 nilfs_mdt_destroy(nilfs->ns_sufile);
196
197 failed_cpfile:
198 nilfs_mdt_destroy(nilfs->ns_cpfile);
199
200 failed_gc_dat:
201 nilfs_mdt_destroy(nilfs->ns_gc_dat);
202
203 failed_dat:
204 nilfs_mdt_destroy(nilfs->ns_dat);
205 goto failed;
206}
207
208static void nilfs_init_recovery_info(struct nilfs_recovery_info *ri)
209{
210 memset(ri, 0, sizeof(*ri));
211 INIT_LIST_HEAD(&ri->ri_used_segments);
212}
213
214static void nilfs_clear_recovery_info(struct nilfs_recovery_info *ri)
215{
216 nilfs_dispose_segment_list(&ri->ri_used_segments);
217}
218
219/**
220 * load_nilfs - load and recover the nilfs
221 * @nilfs: the_nilfs structure to be loaded
222 * @sbi: nilfs_sb_info used to recover past segment
223 *
224 * load_nilfs() searches for and loads the latest super root,
225 * attaches the last segment, and does recovery if needed.
226 * The caller must call this exclusively for simultaneous mounts.
227 */
228int load_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
229{
230 struct nilfs_recovery_info ri;
231 unsigned int s_flags = sbi->s_super->s_flags;
232 int really_read_only = bdev_read_only(nilfs->ns_bdev);
233 unsigned valid_fs;
234 int err = 0;
235
236 nilfs_init_recovery_info(&ri);
237
238 down_write(&nilfs->ns_sem);
239 valid_fs = (nilfs->ns_mount_state & NILFS_VALID_FS);
240 up_write(&nilfs->ns_sem);
241
242 if (!valid_fs && (s_flags & MS_RDONLY)) {
243 printk(KERN_INFO "NILFS: recovery "
244 "required for readonly filesystem.\n");
245 if (really_read_only) {
246 printk(KERN_ERR "NILFS: write access "
247 "unavailable, cannot proceed.\n");
248 err = -EROFS;
249 goto failed;
250 }
251 printk(KERN_INFO "NILFS: write access will "
252 "be enabled during recovery.\n");
253 sbi->s_super->s_flags &= ~MS_RDONLY;
254 }
255
256 err = nilfs_search_super_root(nilfs, sbi, &ri);
257 if (unlikely(err)) {
258 printk(KERN_ERR "NILFS: error searching super root.\n");
259 goto failed;
260 }
261
262 err = nilfs_load_super_root(nilfs, sbi, ri.ri_super_root);
263 if (unlikely(err)) {
264 printk(KERN_ERR "NILFS: error loading super root.\n");
265 goto failed;
266 }
267
268 if (!valid_fs) {
269 err = nilfs_recover_logical_segments(nilfs, sbi, &ri);
270 if (unlikely(err)) {
271 nilfs_mdt_destroy(nilfs->ns_cpfile);
272 nilfs_mdt_destroy(nilfs->ns_sufile);
273 nilfs_mdt_destroy(nilfs->ns_dat);
274 goto failed;
275 }
276 if (ri.ri_need_recovery == NILFS_RECOVERY_SR_UPDATED)
277 sbi->s_super->s_dirt = 1;
278 }
279
280 set_nilfs_loaded(nilfs);
281
282 failed:
283 nilfs_clear_recovery_info(&ri);
284 sbi->s_super->s_flags = s_flags;
285 return err;
286}
287
288static unsigned long long nilfs_max_size(unsigned int blkbits)
289{
290 unsigned int max_bits;
291 unsigned long long res = MAX_LFS_FILESIZE; /* page cache limit */
292
293 max_bits = blkbits + NILFS_BMAP_KEY_BIT; /* bmap size limit */
294 if (max_bits < 64)
295 res = min_t(unsigned long long, res, (1ULL << max_bits) - 1);
296 return res;
297}
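A worked example, assuming NILFS_BMAP_KEY_BIT is 64 (bmap keys are __u64):

	/*
	 * blkbits = 12 (4 KiB blocks) -> max_bits = 12 + 64 = 76 >= 64,
	 * so the bmap limit never binds and res stays at MAX_LFS_FILESIZE
	 * (the page cache limit); the clamp matters only for key widths
	 * below 64 bits.
	 */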
298
299static int nilfs_store_disk_layout(struct the_nilfs *nilfs,
300 struct nilfs_super_block *sbp)
301{
302 if (le32_to_cpu(sbp->s_rev_level) != NILFS_CURRENT_REV) {
303 printk(KERN_ERR "NILFS: revision mismatch "
304 "(superblock rev.=%d.%d, current rev.=%d.%d). "
305 "Please check the version of mkfs.nilfs.\n",
306 le32_to_cpu(sbp->s_rev_level),
307 le16_to_cpu(sbp->s_minor_rev_level),
308 NILFS_CURRENT_REV, NILFS_MINOR_REV);
309 return -EINVAL;
310 }
311 nilfs->ns_sbsize = le16_to_cpu(sbp->s_bytes);
312 if (nilfs->ns_sbsize > BLOCK_SIZE)
313 return -EINVAL;
314
315 nilfs->ns_inode_size = le16_to_cpu(sbp->s_inode_size);
316 nilfs->ns_first_ino = le32_to_cpu(sbp->s_first_ino);
317
318 nilfs->ns_blocks_per_segment = le32_to_cpu(sbp->s_blocks_per_segment);
319 if (nilfs->ns_blocks_per_segment < NILFS_SEG_MIN_BLOCKS) {
320 printk(KERN_ERR "NILFS: too short segment.\n");
321 return -EINVAL;
322 }
323
324 nilfs->ns_first_data_block = le64_to_cpu(sbp->s_first_data_block);
325 nilfs->ns_nsegments = le64_to_cpu(sbp->s_nsegments);
326 nilfs->ns_r_segments_percentage =
327 le32_to_cpu(sbp->s_r_segments_percentage);
328 nilfs->ns_nrsvsegs =
329 max_t(unsigned long, NILFS_MIN_NRSVSEGS,
330 DIV_ROUND_UP(nilfs->ns_nsegments *
331 nilfs->ns_r_segments_percentage, 100));
332 nilfs->ns_crc_seed = le32_to_cpu(sbp->s_crc_seed);
333 return 0;
334}
335
336static int nilfs_valid_sb(struct nilfs_super_block *sbp)
337{
338 static unsigned char sum[4];
339 const int sumoff = offsetof(struct nilfs_super_block, s_sum);
340 size_t bytes;
341 u32 crc;
342
343 if (!sbp || le16_to_cpu(sbp->s_magic) != NILFS_SUPER_MAGIC)
344 return 0;
345 bytes = le16_to_cpu(sbp->s_bytes);
346 if (bytes > BLOCK_SIZE)
347 return 0;
348 crc = crc32_le(le32_to_cpu(sbp->s_crc_seed), (unsigned char *)sbp,
349 sumoff);
350 crc = crc32_le(crc, sum, 4);
351 crc = crc32_le(crc, (unsigned char *)sbp + sumoff + 4,
352 bytes - sumoff - 4);
353 return crc == le32_to_cpu(sbp->s_sum);
354}
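The writer/verifier symmetry, sketched: nilfs_commit_super() in super.c computes s_sum over the block with the field zeroed, which nilfs_valid_sb() reproduces by feeding four zero bytes in its place:

	/*
	 * writer:   sbp->s_sum = 0;
	 *           sbp->s_sum = cpu_to_le32(crc32_le(seed, sbp, sbsize));
	 * verifier: crc32_le over [0, sumoff) + four zero bytes
	 *           + [sumoff + 4, bytes); matches s_sum iff intact.
	 */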
355
356static int nilfs_sb2_bad_offset(struct nilfs_super_block *sbp, u64 offset)
357{
358 return offset < ((le64_to_cpu(sbp->s_nsegments) *
359 le32_to_cpu(sbp->s_blocks_per_segment)) <<
360 (le32_to_cpu(sbp->s_log_block_size) + 10));
361}
362
363static void nilfs_release_super_block(struct the_nilfs *nilfs)
364{
365 int i;
366
367 for (i = 0; i < 2; i++) {
368 if (nilfs->ns_sbp[i]) {
369 brelse(nilfs->ns_sbh[i]);
370 nilfs->ns_sbh[i] = NULL;
371 nilfs->ns_sbp[i] = NULL;
372 }
373 }
374}
375
376void nilfs_fall_back_super_block(struct the_nilfs *nilfs)
377{
378 brelse(nilfs->ns_sbh[0]);
379 nilfs->ns_sbh[0] = nilfs->ns_sbh[1];
380 nilfs->ns_sbp[0] = nilfs->ns_sbp[1];
381 nilfs->ns_sbh[1] = NULL;
382 nilfs->ns_sbp[1] = NULL;
383}
384
385void nilfs_swap_super_block(struct the_nilfs *nilfs)
386{
387 struct buffer_head *tsbh = nilfs->ns_sbh[0];
388 struct nilfs_super_block *tsbp = nilfs->ns_sbp[0];
389
390 nilfs->ns_sbh[0] = nilfs->ns_sbh[1];
391 nilfs->ns_sbp[0] = nilfs->ns_sbp[1];
392 nilfs->ns_sbh[1] = tsbh;
393 nilfs->ns_sbp[1] = tsbp;
394}
395
396static int nilfs_load_super_block(struct the_nilfs *nilfs,
397 struct super_block *sb, int blocksize,
398 struct nilfs_super_block **sbpp)
399{
400 struct nilfs_super_block **sbp = nilfs->ns_sbp;
401 struct buffer_head **sbh = nilfs->ns_sbh;
402 u64 sb2off = NILFS_SB2_OFFSET_BYTES(nilfs->ns_bdev->bd_inode->i_size);
403 int valid[2], swp = 0;
404
405 sbp[0] = nilfs_read_super_block(sb, NILFS_SB_OFFSET_BYTES, blocksize,
406 &sbh[0]);
407 sbp[1] = nilfs_read_super_block(sb, sb2off, blocksize, &sbh[1]);
408
409 if (!sbp[0]) {
410 if (!sbp[1]) {
411 printk(KERN_ERR "NILFS: unable to read superblock\n");
412 return -EIO;
413 }
414 printk(KERN_WARNING
415 "NILFS warning: unable to read primary superblock\n");
416 } else if (!sbp[1])
417 printk(KERN_WARNING
418 "NILFS warning: unable to read secondary superblock\n");
419
420 valid[0] = nilfs_valid_sb(sbp[0]);
421 valid[1] = nilfs_valid_sb(sbp[1]);
422 swp = valid[1] &&
423 (!valid[0] ||
424 le64_to_cpu(sbp[1]->s_wtime) > le64_to_cpu(sbp[0]->s_wtime));
425
426 if (valid[swp] && nilfs_sb2_bad_offset(sbp[swp], sb2off)) {
427 brelse(sbh[1]);
428 sbh[1] = NULL;
429 sbp[1] = NULL;
430 swp = 0;
431 }
432 if (!valid[swp]) {
433 nilfs_release_super_block(nilfs);
434 printk(KERN_ERR "NILFS: Can't find nilfs on dev %s.\n",
435 sb->s_id);
436 return -EINVAL;
437 }
438
439 if (swp) {
440 printk(KERN_WARNING "NILFS warning: broken superblock. "
441 "using spare superblock.\n");
442 nilfs_swap_super_block(nilfs);
443 }
444
445 nilfs->ns_sbwtime[0] = le64_to_cpu(sbp[0]->s_wtime);
446 nilfs->ns_sbwtime[1] = valid[!swp] ? le64_to_cpu(sbp[1]->s_wtime) : 0;
447 nilfs->ns_prot_seq = le64_to_cpu(sbp[valid[1] & !swp]->s_last_seq);
448 *sbpp = sbp[0];
449 return 0;
450}
451
452/**
453 * init_nilfs - initialize a NILFS instance.
454 * @nilfs: the_nilfs structure
455 * @sbi: nilfs_sb_info
456 * @sb: super block
457 * @data: mount options
458 *
459 * init_nilfs() performs common initialization per block device (e.g.
460 * reading the super block, getting disk layout information, initializing
461 * shared fields in the_nilfs). It takes over part of the jobs
462 * typically done by a fill_super() routine. This division arises
463 * because multiple NILFS instances may be mounted simultaneously
464 * on the same device.
465 * For multiple mounts on the same device, only the first mount
466 * invokes these tasks.
467 *
468 * Return Value: On success, 0 is returned. On error, a negative error
469 * code is returned.
470 */
471int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data)
472{
473 struct super_block *sb = sbi->s_super;
474 struct nilfs_super_block *sbp;
475 struct backing_dev_info *bdi;
476 int blocksize;
477 int err;
478
479 down_write(&nilfs->ns_sem);
480 if (nilfs_init(nilfs)) {
481 /* Load values from existing the_nilfs */
482 sbp = nilfs->ns_sbp[0];
483 err = nilfs_store_magic_and_option(sb, sbp, data);
484 if (err)
485 goto out;
486
487 blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
488 if (sb->s_blocksize != blocksize &&
489 !sb_set_blocksize(sb, blocksize)) {
490 printk(KERN_ERR "NILFS: blocksize %d not supported on device\n",
491 blocksize);
492 err = -EINVAL;
493 }
494 sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits);
495 goto out;
496 }
497
498 blocksize = sb_min_blocksize(sb, BLOCK_SIZE);
499 if (!blocksize) {
500 printk(KERN_ERR "NILFS: unable to set blocksize\n");
501 err = -EINVAL;
502 goto out;
503 }
504 err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp);
505 if (err)
506 goto out;
507
508 err = nilfs_store_magic_and_option(sb, sbp, data);
509 if (err)
510 goto failed_sbh;
511
512 blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size);
513 if (sb->s_blocksize != blocksize) {
514 int hw_blocksize = bdev_hardsect_size(sb->s_bdev);
515
516 if (blocksize < hw_blocksize) {
517 printk(KERN_ERR
518 "NILFS: blocksize %d too small for device "
519 "(sector-size = %d).\n",
520 blocksize, hw_blocksize);
521 err = -EINVAL;
522 goto failed_sbh;
523 }
524 nilfs_release_super_block(nilfs);
525 sb_set_blocksize(sb, blocksize);
526
527 err = nilfs_load_super_block(nilfs, sb, blocksize, &sbp);
528 if (err)
529 goto out;
530 /* not failed_sbh; sbh is released automatically
531 when reloading fails. */
532 }
533 nilfs->ns_blocksize_bits = sb->s_blocksize_bits;
534
535 err = nilfs_store_disk_layout(nilfs, sbp);
536 if (err)
537 goto failed_sbh;
538
539 sb->s_maxbytes = nilfs_max_size(sb->s_blocksize_bits);
540
541 nilfs->ns_mount_state = le16_to_cpu(sbp->s_state);
542
543 bdi = nilfs->ns_bdev->bd_inode_backing_dev_info;
544 if (!bdi)
545 bdi = nilfs->ns_bdev->bd_inode->i_mapping->backing_dev_info;
546 nilfs->ns_bdi = bdi ? : &default_backing_dev_info;
547
548 /* Finding last segment */
549 nilfs->ns_last_pseg = le64_to_cpu(sbp->s_last_pseg);
550 nilfs->ns_last_cno = le64_to_cpu(sbp->s_last_cno);
551 nilfs->ns_last_seq = le64_to_cpu(sbp->s_last_seq);
552
553 nilfs->ns_seg_seq = nilfs->ns_last_seq;
554 nilfs->ns_segnum =
555 nilfs_get_segnum_of_block(nilfs, nilfs->ns_last_pseg);
556 nilfs->ns_cno = nilfs->ns_last_cno + 1;
557 if (nilfs->ns_segnum >= nilfs->ns_nsegments) {
 558		printk(KERN_ERR "NILFS: invalid last segment number.\n");
559 err = -EINVAL;
560 goto failed_sbh;
561 }
562 /* Dummy values */
563 nilfs->ns_free_segments_count =
564 nilfs->ns_nsegments - (nilfs->ns_segnum + 1);
565
566 /* Initialize gcinode cache */
567 err = nilfs_init_gccache(nilfs);
568 if (err)
569 goto failed_sbh;
570
571 set_nilfs_init(nilfs);
572 err = 0;
573 out:
574 up_write(&nilfs->ns_sem);
575 return err;
576
577 failed_sbh:
578 nilfs_release_super_block(nilfs);
579 goto out;
580}
581
582int nilfs_count_free_blocks(struct the_nilfs *nilfs, sector_t *nblocks)
583{
584 struct inode *dat = nilfs_dat_inode(nilfs);
585 unsigned long ncleansegs;
586 int err;
587
588 down_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
589 err = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile, &ncleansegs);
590 up_read(&NILFS_MDT(dat)->mi_sem); /* XXX */
591 if (likely(!err))
592 *nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment;
593 return err;
594}
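/*
 * A quick worked example of the conversion above (geometry values are
 * hypothetical): with 4 KiB blocks and ns_blocks_per_segment = 2048, a
 * sufile reporting ncleansegs = 100 yields
 *
 *	*nblocks = 100 * 2048 = 204800 blocks (about 800 MiB)
 *
 * available for new log writes.
 */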
595
596int nilfs_near_disk_full(struct the_nilfs *nilfs)
597{
598 struct inode *sufile = nilfs->ns_sufile;
599 unsigned long ncleansegs, nincsegs;
600 int ret;
601
602 ret = nilfs_sufile_get_ncleansegs(sufile, &ncleansegs);
603 if (likely(!ret)) {
604 nincsegs = atomic_read(&nilfs->ns_ndirtyblks) /
605 nilfs->ns_blocks_per_segment + 1;
606 if (ncleansegs <= nilfs->ns_nrsvsegs + nincsegs)
607 ret++;
608 }
609 return ret;
610}
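/*
 * Illustration of the heuristic above (numbers are hypothetical): with
 * ns_blocks_per_segment = 2048, ns_nrsvsegs = 8 and 10000 dirty blocks
 * pending, nincsegs = 10000 / 2048 + 1 = 5, so the filesystem is
 * reported near-full once 13 or fewer clean segments remain.
 */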
611
612int nilfs_checkpoint_is_mounted(struct the_nilfs *nilfs, __u64 cno,
613 int snapshot_mount)
614{
615 struct nilfs_sb_info *sbi;
616 int ret = 0;
617
618 down_read(&nilfs->ns_sem);
619 if (cno == 0 || cno > nilfs->ns_cno)
620 goto out_unlock;
621
622 list_for_each_entry(sbi, &nilfs->ns_supers, s_list) {
623 if (sbi->s_snapshot_cno == cno &&
624 (!snapshot_mount || nilfs_test_opt(sbi, SNAPSHOT))) {
625 /* exclude read-only mounts */
626 ret++;
627 break;
628 }
629 }
630 /* for protecting recent checkpoints */
631 if (cno >= nilfs_last_cno(nilfs))
632 ret++;
633
634 out_unlock:
635 up_read(&nilfs->ns_sem);
636 return ret;
637}
diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h
new file mode 100644
index 000000000000..30fe58778d05
--- /dev/null
+++ b/fs/nilfs2/the_nilfs.h
@@ -0,0 +1,298 @@
1/*
2 * the_nilfs.h - the_nilfs shared structure.
3 *
4 * Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
19 *
20 * Written by Ryusuke Konishi <ryusuke@osrg.net>
21 *
22 */
23
24#ifndef _THE_NILFS_H
25#define _THE_NILFS_H
26
27#include <linux/types.h>
28#include <linux/buffer_head.h>
29#include <linux/fs.h>
30#include <linux/blkdev.h>
31#include <linux/backing-dev.h>
32#include "sb.h"
33
34/* the_nilfs struct */
35enum {
36 THE_NILFS_INIT = 0, /* Information from super_block is set */
  37	THE_NILFS_LOADED,	/* Roll-back/roll-forward has been done and
  38				   the latest checkpoint was loaded */
  39	THE_NILFS_DISCONTINUED,	/* 'next' pointer chain is broken */
40};
41
42/**
43 * struct the_nilfs - struct to supervise multiple nilfs mount points
44 * @ns_flags: flags
45 * @ns_count: reference count
46 * @ns_bdev: block device
47 * @ns_bdi: backing dev info
48 * @ns_writer: back pointer to writable nilfs_sb_info
49 * @ns_sem: semaphore for shared states
50 * @ns_writer_mutex: mutex protecting ns_writer attach/detach
51 * @ns_writer_refcount: number of referrers on ns_writer
52 * @ns_sbh: buffer heads of on-disk super blocks
53 * @ns_sbp: pointers to super block data
54 * @ns_sbwtime: previous write time of super blocks
55 * @ns_sbsize: size of valid data in super block
56 * @ns_supers: list of nilfs super block structs
57 * @ns_seg_seq: segment sequence counter
  58 * @ns_segnum: index number of the latest full segment
59 * @ns_nextnum: index number of the full segment index to be used next
60 * @ns_pseg_offset: offset of next partial segment in the current full segment
61 * @ns_cno: next checkpoint number
62 * @ns_ctime: write time of the last segment
63 * @ns_nongc_ctime: write time of the last segment not for cleaner operation
64 * @ns_ndirtyblks: Number of dirty data blocks
65 * @ns_last_segment_lock: lock protecting fields for the latest segment
66 * @ns_last_pseg: start block number of the latest segment
67 * @ns_last_seq: sequence value of the latest segment
68 * @ns_last_cno: checkpoint number of the latest segment
69 * @ns_prot_seq: least sequence number of segments which must not be reclaimed
70 * @ns_free_segments_count: counter of free segments
71 * @ns_segctor_sem: segment constructor semaphore
72 * @ns_dat: DAT file inode
73 * @ns_cpfile: checkpoint file inode
74 * @ns_sufile: segusage file inode
75 * @ns_gc_dat: shadow inode of the DAT file inode for GC
76 * @ns_gc_inodes: dummy inodes to keep live blocks
77 * @ns_gc_inodes_h: hash list to keep dummy inode holding live blocks
78 * @ns_blocksize_bits: bit length of block size
79 * @ns_nsegments: number of segments in filesystem
80 * @ns_blocks_per_segment: number of blocks per segment
81 * @ns_r_segments_percentage: reserved segments percentage
82 * @ns_nrsvsegs: number of reserved segments
83 * @ns_first_data_block: block number of first data block
84 * @ns_inode_size: size of on-disk inode
85 * @ns_first_ino: first not-special inode number
86 * @ns_crc_seed: seed value of CRC32 calculation
87 */
88struct the_nilfs {
89 unsigned long ns_flags;
90 atomic_t ns_count;
91
92 struct block_device *ns_bdev;
93 struct backing_dev_info *ns_bdi;
94 struct nilfs_sb_info *ns_writer;
95 struct rw_semaphore ns_sem;
96 struct mutex ns_writer_mutex;
97 atomic_t ns_writer_refcount;
98
99 /*
100 * used for
101 * - loading the latest checkpoint exclusively.
102 * - allocating a new full segment.
103 * - protecting s_dirt in the super_block struct
104 * (see nilfs_write_super) and the following fields.
105 */
106 struct buffer_head *ns_sbh[2];
107 struct nilfs_super_block *ns_sbp[2];
108 time_t ns_sbwtime[2];
109 unsigned ns_sbsize;
110 unsigned ns_mount_state;
111 struct list_head ns_supers;
112
113 /*
 114	 * The following fields are dedicated to a writable FS instance.
 115	 * Except while searching for a checkpoint, code outside the segment
 116	 * constructor must hold the segment semaphore while accessing these
 117	 * fields.
 118	 * Only one writable FS instance exists during the lifetime of the_nilfs.
119 */
120 u64 ns_seg_seq;
121 __u64 ns_segnum;
122 __u64 ns_nextnum;
123 unsigned long ns_pseg_offset;
124 __u64 ns_cno;
125 time_t ns_ctime;
126 time_t ns_nongc_ctime;
127 atomic_t ns_ndirtyblks;
128
129 /*
130 * The following fields hold information on the latest partial segment
131 * written to disk with a super root. These fields are protected by
132 * ns_last_segment_lock.
133 */
134 spinlock_t ns_last_segment_lock;
135 sector_t ns_last_pseg;
136 u64 ns_last_seq;
137 __u64 ns_last_cno;
138 u64 ns_prot_seq;
139 unsigned long ns_free_segments_count;
140
141 struct rw_semaphore ns_segctor_sem;
142
143 /*
 144	 * The following fields are lock-free except for the period before
 145	 * the_nilfs is initialized.
146 */
147 struct inode *ns_dat;
148 struct inode *ns_cpfile;
149 struct inode *ns_sufile;
150 struct inode *ns_gc_dat;
151
152 /* GC inode list and hash table head */
153 struct list_head ns_gc_inodes;
154 struct hlist_head *ns_gc_inodes_h;
155
156 /* Disk layout information (static) */
157 unsigned int ns_blocksize_bits;
158 unsigned long ns_nsegments;
159 unsigned long ns_blocks_per_segment;
160 unsigned long ns_r_segments_percentage;
161 unsigned long ns_nrsvsegs;
162 unsigned long ns_first_data_block;
163 int ns_inode_size;
164 int ns_first_ino;
165 u32 ns_crc_seed;
166};
167
168#define NILFS_GCINODE_HASH_BITS 8
169#define NILFS_GCINODE_HASH_SIZE (1<<NILFS_GCINODE_HASH_BITS)
170
171#define THE_NILFS_FNS(bit, name) \
172static inline void set_nilfs_##name(struct the_nilfs *nilfs) \
173{ \
174 set_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \
175} \
176static inline void clear_nilfs_##name(struct the_nilfs *nilfs) \
177{ \
178 clear_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \
179} \
180static inline int nilfs_##name(struct the_nilfs *nilfs) \
181{ \
182 return test_bit(THE_NILFS_##bit, &(nilfs)->ns_flags); \
183}
184
185THE_NILFS_FNS(INIT, init)
186THE_NILFS_FNS(LOADED, loaded)
187THE_NILFS_FNS(DISCONTINUED, discontinued)
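/*
 * As an example of the generator above, THE_NILFS_FNS(INIT, init)
 * expands to set_nilfs_init(), clear_nilfs_init() and nilfs_init(),
 * which set, clear and test the THE_NILFS_INIT bit of ns_flags.
 * init_nilfs() relies on exactly this pattern:
 *
 *	if (nilfs_init(nilfs))
 *		...reuse values from the existing the_nilfs...
 *	else
 *		...first mount: read super block, store disk layout,
 *		   then set_nilfs_init(nilfs)...
 */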
188
 189/* Minimum interval of periodic superblock updates (in seconds) */
190#define NILFS_SB_FREQ 10
191#define NILFS_ALTSB_FREQ 60 /* spare superblock */
192
193void nilfs_set_last_segment(struct the_nilfs *, sector_t, u64, __u64);
194struct the_nilfs *alloc_nilfs(struct block_device *);
195void put_nilfs(struct the_nilfs *);
196int init_nilfs(struct the_nilfs *, struct nilfs_sb_info *, char *);
197int load_nilfs(struct the_nilfs *, struct nilfs_sb_info *);
198int nilfs_count_free_blocks(struct the_nilfs *, sector_t *);
199int nilfs_checkpoint_is_mounted(struct the_nilfs *, __u64, int);
200int nilfs_near_disk_full(struct the_nilfs *);
201void nilfs_fall_back_super_block(struct the_nilfs *);
202void nilfs_swap_super_block(struct the_nilfs *);
203
204
205static inline void get_nilfs(struct the_nilfs *nilfs)
206{
207 /* Caller must have at least one reference of the_nilfs. */
208 atomic_inc(&nilfs->ns_count);
209}
210
211static inline struct nilfs_sb_info *nilfs_get_writer(struct the_nilfs *nilfs)
212{
213 if (atomic_inc_and_test(&nilfs->ns_writer_refcount))
214 mutex_lock(&nilfs->ns_writer_mutex);
215 return nilfs->ns_writer;
216}
217
218static inline void nilfs_put_writer(struct the_nilfs *nilfs)
219{
220 if (atomic_add_negative(-1, &nilfs->ns_writer_refcount))
221 mutex_unlock(&nilfs->ns_writer_mutex);
222}
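/*
 * Note on the pairing above: ns_writer_refcount is assumed to start at
 * -1, so atomic_inc_and_test() succeeds only for the first referrer,
 * which then takes ns_writer_mutex, and atomic_add_negative() succeeds
 * only when the last referrer drops the count back below zero, which
 * releases the mutex.  Callers therefore bracket accesses as:
 *
 *	sbi = nilfs_get_writer(nilfs);
 *	if (sbi)
 *		...use the writable sb info...
 *	nilfs_put_writer(nilfs);
 */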
223
224static inline void
225nilfs_attach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
226{
227 mutex_lock(&nilfs->ns_writer_mutex);
228 nilfs->ns_writer = sbi;
229 mutex_unlock(&nilfs->ns_writer_mutex);
230}
231
232static inline void
233nilfs_detach_writer(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi)
234{
235 mutex_lock(&nilfs->ns_writer_mutex);
236 if (sbi == nilfs->ns_writer)
237 nilfs->ns_writer = NULL;
238 mutex_unlock(&nilfs->ns_writer_mutex);
239}
240
241static inline void
242nilfs_get_segment_range(struct the_nilfs *nilfs, __u64 segnum,
243 sector_t *seg_start, sector_t *seg_end)
244{
245 *seg_start = (sector_t)nilfs->ns_blocks_per_segment * segnum;
246 *seg_end = *seg_start + nilfs->ns_blocks_per_segment - 1;
247 if (segnum == 0)
248 *seg_start = nilfs->ns_first_data_block;
249}
250
251static inline sector_t
252nilfs_get_segment_start_blocknr(struct the_nilfs *nilfs, __u64 segnum)
253{
254 return (segnum == 0) ? nilfs->ns_first_data_block :
255 (sector_t)nilfs->ns_blocks_per_segment * segnum;
256}
257
258static inline __u64
259nilfs_get_segnum_of_block(struct the_nilfs *nilfs, sector_t blocknr)
260{
261 sector_t segnum = blocknr;
262
263 sector_div(segnum, nilfs->ns_blocks_per_segment);
264 return segnum;
265}
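/*
 * Example of the segment/block mapping above (hypothetical geometry):
 * with ns_blocks_per_segment = 2048, block 5000 belongs to segment
 * 5000 / 2048 = 2, whose range is blocks 4096..6143.  Segment 0 is the
 * only special case: its range starts at ns_first_data_block instead
 * of block 0.
 */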
266
267static inline void
268nilfs_terminate_segment(struct the_nilfs *nilfs, sector_t seg_start,
269 sector_t seg_end)
270{
271 /* terminate the current full segment (used in case of I/O-error) */
272 nilfs->ns_pseg_offset = seg_end - seg_start + 1;
273}
274
275static inline void nilfs_shift_to_next_segment(struct the_nilfs *nilfs)
276{
277 /* move forward with a full segment */
278 nilfs->ns_segnum = nilfs->ns_nextnum;
279 nilfs->ns_pseg_offset = 0;
280 nilfs->ns_seg_seq++;
281}
282
283static inline __u64 nilfs_last_cno(struct the_nilfs *nilfs)
284{
285 __u64 cno;
286
287 spin_lock(&nilfs->ns_last_segment_lock);
288 cno = nilfs->ns_last_cno;
289 spin_unlock(&nilfs->ns_last_segment_lock);
290 return cno;
291}
292
293static inline int nilfs_segment_is_active(struct the_nilfs *nilfs, __u64 n)
294{
295 return n == nilfs->ns_segnum || n == nilfs->ns_nextnum;
296}
297
298#endif /* _THE_NILFS_H */
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index a5887df2cd8a..8672b9536039 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -1926,7 +1926,7 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1926 out->f_path.dentry->d_name.len, 1926 out->f_path.dentry->d_name.len,
1927 out->f_path.dentry->d_name.name); 1927 out->f_path.dentry->d_name.name);
1928 1928
1929 inode_double_lock(inode, pipe->inode); 1929 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
1930 1930
1931 ret = ocfs2_rw_lock(inode, 1); 1931 ret = ocfs2_rw_lock(inode, 1);
1932 if (ret < 0) { 1932 if (ret < 0) {
@@ -1941,12 +1941,16 @@ static ssize_t ocfs2_file_splice_write(struct pipe_inode_info *pipe,
1941 goto out_unlock; 1941 goto out_unlock;
1942 } 1942 }
1943 1943
1944 if (pipe->inode)
1945 mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD);
1944 ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags); 1946 ret = generic_file_splice_write_nolock(pipe, out, ppos, len, flags);
1947 if (pipe->inode)
1948 mutex_unlock(&pipe->inode->i_mutex);
1945 1949
1946out_unlock: 1950out_unlock:
1947 ocfs2_rw_unlock(inode, 1); 1951 ocfs2_rw_unlock(inode, 1);
1948out: 1952out:
1949 inode_double_unlock(inode, pipe->inode); 1953 mutex_unlock(&inode->i_mutex);
1950 1954
1951 mlog_exit(ret); 1955 mlog_exit(ret);
1952 return ret; 1956 return ret;
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index b0ae0be4801f..39e4ad4f59f4 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -204,6 +204,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
204 struct file *file = vma->vm_file; 204 struct file *file = vma->vm_file;
205 int flags = vma->vm_flags; 205 int flags = vma->vm_flags;
206 unsigned long ino = 0; 206 unsigned long ino = 0;
207 unsigned long long pgoff = 0;
207 dev_t dev = 0; 208 dev_t dev = 0;
208 int len; 209 int len;
209 210
@@ -211,6 +212,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
211 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 212 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
212 dev = inode->i_sb->s_dev; 213 dev = inode->i_sb->s_dev;
213 ino = inode->i_ino; 214 ino = inode->i_ino;
215 pgoff = ((loff_t)vma->vm_pgoff) << PAGE_SHIFT;
214 } 216 }
215 217
216 seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n", 218 seq_printf(m, "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
@@ -220,7 +222,7 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
220 flags & VM_WRITE ? 'w' : '-', 222 flags & VM_WRITE ? 'w' : '-',
221 flags & VM_EXEC ? 'x' : '-', 223 flags & VM_EXEC ? 'x' : '-',
222 flags & VM_MAYSHARE ? 's' : 'p', 224 flags & VM_MAYSHARE ? 's' : 'p',
223 ((loff_t)vma->vm_pgoff) << PAGE_SHIFT, 225 pgoff,
224 MAJOR(dev), MINOR(dev), ino, &len); 226 MAJOR(dev), MINOR(dev), ino, &len);
225 227
226 /* 228 /*
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 863464d5519c..64a72e2e7650 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -126,6 +126,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
126 struct file *file; 126 struct file *file;
127 dev_t dev = 0; 127 dev_t dev = 0;
128 int flags, len; 128 int flags, len;
129 unsigned long long pgoff = 0;
129 130
130 flags = vma->vm_flags; 131 flags = vma->vm_flags;
131 file = vma->vm_file; 132 file = vma->vm_file;
@@ -134,6 +135,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
134 struct inode *inode = vma->vm_file->f_path.dentry->d_inode; 135 struct inode *inode = vma->vm_file->f_path.dentry->d_inode;
135 dev = inode->i_sb->s_dev; 136 dev = inode->i_sb->s_dev;
136 ino = inode->i_ino; 137 ino = inode->i_ino;
138 pgoff = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
137 } 139 }
138 140
139 seq_printf(m, 141 seq_printf(m,
@@ -144,7 +146,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
144 flags & VM_WRITE ? 'w' : '-', 146 flags & VM_WRITE ? 'w' : '-',
145 flags & VM_EXEC ? 'x' : '-', 147 flags & VM_EXEC ? 'x' : '-',
146 flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p', 148 flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
147 (unsigned long long) vma->vm_pgoff << PAGE_SHIFT, 149 pgoff,
148 MAJOR(dev), MINOR(dev), ino, &len); 150 MAJOR(dev), MINOR(dev), ino, &len);
149 151
150 if (file) { 152 if (file) {
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index a404fb88e456..3a6b193d8444 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -221,22 +221,23 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
221 save_mount_options(sb, data); 221 save_mount_options(sb, data);
222 222
223 fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL); 223 fsi = kzalloc(sizeof(struct ramfs_fs_info), GFP_KERNEL);
224 sb->s_fs_info = fsi;
224 if (!fsi) { 225 if (!fsi) {
225 err = -ENOMEM; 226 err = -ENOMEM;
226 goto fail; 227 goto fail;
227 } 228 }
228 sb->s_fs_info = fsi;
229 229
230 err = ramfs_parse_options(data, &fsi->mount_opts); 230 err = ramfs_parse_options(data, &fsi->mount_opts);
231 if (err) 231 if (err)
232 goto fail; 232 goto fail;
233 233
234 sb->s_maxbytes = MAX_LFS_FILESIZE; 234 sb->s_maxbytes = MAX_LFS_FILESIZE;
235 sb->s_blocksize = PAGE_CACHE_SIZE; 235 sb->s_blocksize = PAGE_CACHE_SIZE;
236 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 236 sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
237 sb->s_magic = RAMFS_MAGIC; 237 sb->s_magic = RAMFS_MAGIC;
238 sb->s_op = &ramfs_ops; 238 sb->s_op = &ramfs_ops;
239 sb->s_time_gran = 1; 239 sb->s_time_gran = 1;
240
240 inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0); 241 inode = ramfs_get_inode(sb, S_IFDIR | fsi->mount_opts.mode, 0);
241 if (!inode) { 242 if (!inode) {
242 err = -ENOMEM; 243 err = -ENOMEM;
@@ -244,14 +245,16 @@ static int ramfs_fill_super(struct super_block * sb, void * data, int silent)
244 } 245 }
245 246
246 root = d_alloc_root(inode); 247 root = d_alloc_root(inode);
248 sb->s_root = root;
247 if (!root) { 249 if (!root) {
248 err = -ENOMEM; 250 err = -ENOMEM;
249 goto fail; 251 goto fail;
250 } 252 }
251 sb->s_root = root; 253
252 return 0; 254 return 0;
253fail: 255fail:
254 kfree(fsi); 256 kfree(fsi);
257 sb->s_fs_info = NULL;
255 iput(inode); 258 iput(inode);
256 return err; 259 return err;
257} 260}
diff --git a/fs/romfs/Kconfig b/fs/romfs/Kconfig
index 1a17020f9faf..ce2d6bcc6266 100644
--- a/fs/romfs/Kconfig
+++ b/fs/romfs/Kconfig
@@ -1,6 +1,6 @@
1config ROMFS_FS 1config ROMFS_FS
2 tristate "ROM file system support" 2 tristate "ROM file system support"
3 depends on BLOCK 3 depends on BLOCK || MTD
4 ---help--- 4 ---help---
5 This is a very small read-only file system mainly intended for 5 This is a very small read-only file system mainly intended for
6 initial ram disks of installation disks, but it could be used for 6 initial ram disks of installation disks, but it could be used for
@@ -14,3 +14,49 @@ config ROMFS_FS
14 14
15 If you don't know whether you need it, then you don't need it: 15 If you don't know whether you need it, then you don't need it:
16 answer N. 16 answer N.
17
18#
19# Select the backing stores to be supported
20#
21choice
22 prompt "RomFS backing stores"
23 depends on ROMFS_FS
24 default ROMFS_BACKED_BY_BLOCK
25 help
26 Select the backing stores to be supported.
27
28config ROMFS_BACKED_BY_BLOCK
29 bool "Block device-backed ROM file system support"
30 depends on BLOCK
31 help
32 This permits ROMFS to use block devices buffered through the page
33 cache as the medium from which to retrieve data. It does not allow
34 direct mapping of the medium.
35
36 If unsure, answer Y.
37
38config ROMFS_BACKED_BY_MTD
39 bool "MTD-backed ROM file system support"
40 depends on MTD=y || (ROMFS_FS=m && MTD)
41 help
42 This permits ROMFS to use MTD based devices directly, without the
43 intercession of the block layer (which may have been disabled). It
44 also allows direct mapping of MTD devices through romfs files under
45 NOMMU conditions if the underlying device is directly addressable by
46 the CPU.
47
48 If unsure, answer Y.
49
50config ROMFS_BACKED_BY_BOTH
51 bool "Both the above"
52 depends on BLOCK && (MTD=y || (ROMFS_FS=m && MTD))
53endchoice
54
55
56config ROMFS_ON_BLOCK
57 bool
58 default y if ROMFS_BACKED_BY_BLOCK || ROMFS_BACKED_BY_BOTH
59
60config ROMFS_ON_MTD
61 bool
62 default y if ROMFS_BACKED_BY_MTD || ROMFS_BACKED_BY_BOTH
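# As an illustration of how the choice above resolves, an MTD-only
# NOMMU build would typically end up with:
#
#   CONFIG_ROMFS_FS=y
#   CONFIG_ROMFS_BACKED_BY_MTD=y
#   CONFIG_ROMFS_ON_MTD=y
#
# while CONFIG_ROMFS_ON_BLOCK remains unset.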
diff --git a/fs/romfs/Makefile b/fs/romfs/Makefile
index c95b21cf49a3..420beb7d495c 100644
--- a/fs/romfs/Makefile
+++ b/fs/romfs/Makefile
@@ -1,7 +1,12 @@
1# 1#
2# Makefile for the linux romfs filesystem routines. 2# Makefile for the linux RomFS filesystem routines.
3# 3#
4 4
5obj-$(CONFIG_ROMFS_FS) += romfs.o 5obj-$(CONFIG_ROMFS_FS) += romfs.o
6 6
7romfs-objs := inode.o 7romfs-y := storage.o super.o
8
9ifneq ($(CONFIG_MMU),y)
10romfs-$(CONFIG_ROMFS_ON_MTD) += mmap-nommu.o
11endif
12
diff --git a/fs/romfs/inode.c b/fs/romfs/inode.c
deleted file mode 100644
index 98a232f7196b..000000000000
--- a/fs/romfs/inode.c
+++ /dev/null
@@ -1,665 +0,0 @@
1/*
2 * ROMFS file system, Linux implementation
3 *
4 * Copyright (C) 1997-1999 Janos Farkas <chexum@shadow.banki.hu>
5 *
6 * Using parts of the minix filesystem
7 * Copyright (C) 1991, 1992 Linus Torvalds
8 *
9 * and parts of the affs filesystem additionally
10 * Copyright (C) 1993 Ray Burr
11 * Copyright (C) 1996 Hans-Joachim Widmaier
12 *
13 * This program is free software; you can redistribute it and/or
14 * modify it under the terms of the GNU General Public License
15 * as published by the Free Software Foundation; either version
16 * 2 of the License, or (at your option) any later version.
17 *
18 * Changes
19 * Changed for 2.1.19 modules
20 * Jan 1997 Initial release
21 * Jun 1997 2.1.43+ changes
22 * Proper page locking in readpage
23 * Changed to work with 2.1.45+ fs
24 * Jul 1997 Fixed follow_link
25 * 2.1.47
26 * lookup shouldn't return -ENOENT
27 * from Horst von Brand:
28 * fail on wrong checksum
29 * double unlock_super was possible
30 * correct namelen for statfs
31 * spotted by Bill Hawes:
32 * readlink shouldn't iput()
33 * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir()
34 * exposed a problem in readdir
35 * 2.1.107 code-freeze spellchecker run
36 * Aug 1998 2.1.118+ VFS changes
37 * Sep 1998 2.1.122 another VFS change (follow_link)
38 * Apr 1999 2.2.7 no more EBADF checking in
39 * lookup/readdir, use ERR_PTR
40 * Jun 1999 2.3.6 d_alloc_root use changed
41 * 2.3.9 clean up usage of ENOENT/negative
42 * dentries in lookup
43 * clean up page flags setting
44 * (error, uptodate, locking) in
45 * in readpage
46 * use init_special_inode for
47 * fifos/sockets (and streamline) in
48 * read_inode, fix _ops table order
49 * Aug 1999 2.3.16 __initfunc() => __init change
50 * Oct 1999 2.3.24 page->owner hack obsoleted
51 * Nov 1999 2.3.27 2.3.25+ page->offset => index change
52 */
53
54/* todo:
55 * - see Documentation/filesystems/romfs.txt
56 * - use allocated, not stack memory for file names?
57 * - considering write access...
58 * - network (tftp) files?
59 * - merge back some _op tables
60 */
61
62/*
63 * Sorry about some optimizations and for some goto's. I just wanted
64 * to squeeze some more bytes out of this code.. :)
65 */
66
67#include <linux/module.h>
68#include <linux/types.h>
69#include <linux/errno.h>
70#include <linux/slab.h>
71#include <linux/romfs_fs.h>
72#include <linux/fs.h>
73#include <linux/init.h>
74#include <linux/pagemap.h>
75#include <linux/smp_lock.h>
76#include <linux/buffer_head.h>
77#include <linux/vfs.h>
78
79#include <asm/uaccess.h>
80
81struct romfs_inode_info {
82 unsigned long i_metasize; /* size of non-data area */
83 unsigned long i_dataoffset; /* from the start of fs */
84 struct inode vfs_inode;
85};
86
87static struct inode *romfs_iget(struct super_block *, unsigned long);
88
89/* instead of private superblock data */
90static inline unsigned long romfs_maxsize(struct super_block *sb)
91{
92 return (unsigned long)sb->s_fs_info;
93}
94
95static inline struct romfs_inode_info *ROMFS_I(struct inode *inode)
96{
97 return container_of(inode, struct romfs_inode_info, vfs_inode);
98}
99
100static __u32
101romfs_checksum(void *data, int size)
102{
103 __u32 sum;
104 __be32 *ptr;
105
106 sum = 0; ptr = data;
107 size>>=2;
108 while (size>0) {
109 sum += be32_to_cpu(*ptr++);
110 size--;
111 }
112 return sum;
113}
114
115static const struct super_operations romfs_ops;
116
117static int romfs_fill_super(struct super_block *s, void *data, int silent)
118{
119 struct buffer_head *bh;
120 struct romfs_super_block *rsb;
121 struct inode *root;
122 int sz, ret = -EINVAL;
123
124 /* I would parse the options here, but there are none.. :) */
125
126 sb_set_blocksize(s, ROMBSIZE);
127 s->s_maxbytes = 0xFFFFFFFF;
128
129 bh = sb_bread(s, 0);
130 if (!bh) {
131 /* XXX merge with other printk? */
132 printk ("romfs: unable to read superblock\n");
133 goto outnobh;
134 }
135
136 rsb = (struct romfs_super_block *)bh->b_data;
137 sz = be32_to_cpu(rsb->size);
138 if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1
139 || sz < ROMFH_SIZE) {
140 if (!silent)
141 printk ("VFS: Can't find a romfs filesystem on dev "
142 "%s.\n", s->s_id);
143 goto out;
144 }
145 if (romfs_checksum(rsb, min_t(int, sz, 512))) {
146 printk ("romfs: bad initial checksum on dev "
147 "%s.\n", s->s_id);
148 goto out;
149 }
150
151 s->s_magic = ROMFS_MAGIC;
152 s->s_fs_info = (void *)(long)sz;
153
154 s->s_flags |= MS_RDONLY;
155
156 /* Find the start of the fs */
157 sz = (ROMFH_SIZE +
158 strnlen(rsb->name, ROMFS_MAXFN) + 1 + ROMFH_PAD)
159 & ROMFH_MASK;
160
161 s->s_op = &romfs_ops;
162 root = romfs_iget(s, sz);
163 if (IS_ERR(root)) {
164 ret = PTR_ERR(root);
165 goto out;
166 }
167
168 ret = -ENOMEM;
169 s->s_root = d_alloc_root(root);
170 if (!s->s_root)
171 goto outiput;
172
173 brelse(bh);
174 return 0;
175
176outiput:
177 iput(root);
178out:
179 brelse(bh);
180outnobh:
181 return ret;
182}
183
184/* That's simple too. */
185
186static int
187romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
188{
189 buf->f_type = ROMFS_MAGIC;
190 buf->f_bsize = ROMBSIZE;
191 buf->f_bfree = buf->f_bavail = buf->f_ffree;
192 buf->f_blocks = (romfs_maxsize(dentry->d_sb)+ROMBSIZE-1)>>ROMBSBITS;
193 buf->f_namelen = ROMFS_MAXFN;
194 return 0;
195}
196
197/* some helper routines */
198
199static int
200romfs_strnlen(struct inode *i, unsigned long offset, unsigned long count)
201{
202 struct buffer_head *bh;
203 unsigned long avail, maxsize, res;
204
205 maxsize = romfs_maxsize(i->i_sb);
206 if (offset >= maxsize)
207 return -1;
208
209 /* strnlen is almost always valid */
210 if (count > maxsize || offset+count > maxsize)
211 count = maxsize-offset;
212
213 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
214 if (!bh)
215 return -1; /* error */
216
217 avail = ROMBSIZE - (offset & ROMBMASK);
218 maxsize = min_t(unsigned long, count, avail);
219 res = strnlen(((char *)bh->b_data)+(offset&ROMBMASK), maxsize);
220 brelse(bh);
221
222 if (res < maxsize)
223 return res; /* found all of it */
224
225 while (res < count) {
226 offset += maxsize;
227
228 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
229 if (!bh)
230 return -1;
231 maxsize = min_t(unsigned long, count - res, ROMBSIZE);
232 avail = strnlen(bh->b_data, maxsize);
233 res += avail;
234 brelse(bh);
235 if (avail < maxsize)
236 return res;
237 }
238 return res;
239}
240
241static int
242romfs_copyfrom(struct inode *i, void *dest, unsigned long offset, unsigned long count)
243{
244 struct buffer_head *bh;
245 unsigned long avail, maxsize, res;
246
247 maxsize = romfs_maxsize(i->i_sb);
248 if (offset >= maxsize || count > maxsize || offset+count>maxsize)
249 return -1;
250
251 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
252 if (!bh)
253 return -1; /* error */
254
255 avail = ROMBSIZE - (offset & ROMBMASK);
256 maxsize = min_t(unsigned long, count, avail);
257 memcpy(dest, ((char *)bh->b_data) + (offset & ROMBMASK), maxsize);
258 brelse(bh);
259
260 res = maxsize; /* all of it */
261
262 while (res < count) {
263 offset += maxsize;
264 dest += maxsize;
265
266 bh = sb_bread(i->i_sb, offset>>ROMBSBITS);
267 if (!bh)
268 return -1;
269 maxsize = min_t(unsigned long, count - res, ROMBSIZE);
270 memcpy(dest, bh->b_data, maxsize);
271 brelse(bh);
272 res += maxsize;
273 }
274 return res;
275}
276
277static unsigned char romfs_dtype_table[] = {
278 DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO
279};
280
281static int
282romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
283{
284 struct inode *i = filp->f_path.dentry->d_inode;
285 struct romfs_inode ri;
286 unsigned long offset, maxoff;
287 int j, ino, nextfh;
288 int stored = 0;
289 char fsname[ROMFS_MAXFN]; /* XXX dynamic? */
290
291 lock_kernel();
292
293 maxoff = romfs_maxsize(i->i_sb);
294
295 offset = filp->f_pos;
296 if (!offset) {
297 offset = i->i_ino & ROMFH_MASK;
298 if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0)
299 goto out;
300 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
301 }
302
303 /* Not really failsafe, but we are read-only... */
304 for(;;) {
305 if (!offset || offset >= maxoff) {
306 offset = maxoff;
307 filp->f_pos = offset;
308 goto out;
309 }
310 filp->f_pos = offset;
311
312 /* Fetch inode info */
313 if (romfs_copyfrom(i, &ri, offset, ROMFH_SIZE) <= 0)
314 goto out;
315
316 j = romfs_strnlen(i, offset+ROMFH_SIZE, sizeof(fsname)-1);
317 if (j < 0)
318 goto out;
319
320 fsname[j]=0;
321 romfs_copyfrom(i, fsname, offset+ROMFH_SIZE, j);
322
323 ino = offset;
324 nextfh = be32_to_cpu(ri.next);
325 if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
326 ino = be32_to_cpu(ri.spec);
327 if (filldir(dirent, fsname, j, offset, ino,
328 romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0) {
329 goto out;
330 }
331 stored++;
332 offset = nextfh & ROMFH_MASK;
333 }
334out:
335 unlock_kernel();
336 return stored;
337}
338
339static struct dentry *
340romfs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
341{
342 unsigned long offset, maxoff;
343 long res;
344 int fslen;
345 struct inode *inode = NULL;
346 char fsname[ROMFS_MAXFN]; /* XXX dynamic? */
347 struct romfs_inode ri;
348 const char *name; /* got from dentry */
349 int len;
350
351 res = -EACCES; /* placeholder for "no data here" */
352 offset = dir->i_ino & ROMFH_MASK;
353 lock_kernel();
354 if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0)
355 goto error;
356
357 maxoff = romfs_maxsize(dir->i_sb);
358 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
359
360 /* OK, now find the file whose name is in "dentry" in the
361 * directory specified by "dir". */
362
363 name = dentry->d_name.name;
364 len = dentry->d_name.len;
365
366 for(;;) {
367 if (!offset || offset >= maxoff)
368 goto success; /* negative success */
369 if (romfs_copyfrom(dir, &ri, offset, ROMFH_SIZE) <= 0)
370 goto error;
371
372 /* try to match the first 16 bytes of name */
373 fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, ROMFH_SIZE);
374 if (len < ROMFH_SIZE) {
375 if (len == fslen) {
376 /* both are shorter, and same size */
377 romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1);
378 if (strncmp (name, fsname, len) == 0)
379 break;
380 }
381 } else if (fslen >= ROMFH_SIZE) {
382 /* both are longer; XXX optimize max size */
383 fslen = romfs_strnlen(dir, offset+ROMFH_SIZE, sizeof(fsname)-1);
384 if (len == fslen) {
385 romfs_copyfrom(dir, fsname, offset+ROMFH_SIZE, len+1);
386 if (strncmp(name, fsname, len) == 0)
387 break;
388 }
389 }
390 /* next entry */
391 offset = be32_to_cpu(ri.next) & ROMFH_MASK;
392 }
393
394 /* Hard link handling */
395 if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
396 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
397
398 inode = romfs_iget(dir->i_sb, offset);
399 if (IS_ERR(inode)) {
400 res = PTR_ERR(inode);
401 goto error;
402 }
403
404success:
405 d_add(dentry, inode);
406 res = 0;
407error:
408 unlock_kernel();
409 return ERR_PTR(res);
410}
411
412/*
413 * Ok, we do readpage, to be able to execute programs. Unfortunately,
414 * we can't use bmap, since we may have looser alignments.
415 */
416
417static int
418romfs_readpage(struct file *file, struct page * page)
419{
420 struct inode *inode = page->mapping->host;
421 loff_t offset, size;
422 unsigned long filled;
423 void *buf;
424 int result = -EIO;
425
426 page_cache_get(page);
427 lock_kernel();
428 buf = kmap(page);
429 if (!buf)
430 goto err_out;
431
432 /* 32 bit warning -- but not for us :) */
433 offset = page_offset(page);
434 size = i_size_read(inode);
435 filled = 0;
436 result = 0;
437 if (offset < size) {
438 unsigned long readlen;
439
440 size -= offset;
441 readlen = size > PAGE_SIZE ? PAGE_SIZE : size;
442
443 filled = romfs_copyfrom(inode, buf, ROMFS_I(inode)->i_dataoffset+offset, readlen);
444
445 if (filled != readlen) {
446 SetPageError(page);
447 filled = 0;
448 result = -EIO;
449 }
450 }
451
452 if (filled < PAGE_SIZE)
453 memset(buf + filled, 0, PAGE_SIZE-filled);
454
455 if (!result)
456 SetPageUptodate(page);
457 flush_dcache_page(page);
458
459 unlock_page(page);
460
461 kunmap(page);
462err_out:
463 page_cache_release(page);
464 unlock_kernel();
465
466 return result;
467}
468
469/* Mapping from our types to the kernel */
470
471static const struct address_space_operations romfs_aops = {
472 .readpage = romfs_readpage
473};
474
475static const struct file_operations romfs_dir_operations = {
476 .read = generic_read_dir,
477 .readdir = romfs_readdir,
478};
479
480static const struct inode_operations romfs_dir_inode_operations = {
481 .lookup = romfs_lookup,
482};
483
484static mode_t romfs_modemap[] =
485{
486 0, S_IFDIR+0644, S_IFREG+0644, S_IFLNK+0777,
487 S_IFBLK+0600, S_IFCHR+0600, S_IFSOCK+0644, S_IFIFO+0644
488};
489
490static struct inode *
491romfs_iget(struct super_block *sb, unsigned long ino)
492{
493 int nextfh, ret;
494 struct romfs_inode ri;
495 struct inode *i;
496
497 ino &= ROMFH_MASK;
498 i = iget_locked(sb, ino);
499 if (!i)
500 return ERR_PTR(-ENOMEM);
501 if (!(i->i_state & I_NEW))
502 return i;
503
504 i->i_mode = 0;
505
506 /* Loop for finding the real hard link */
507 for(;;) {
508 if (romfs_copyfrom(i, &ri, ino, ROMFH_SIZE) <= 0) {
509 printk(KERN_ERR "romfs: read error for inode 0x%lx\n",
510 ino);
511 iget_failed(i);
512 return ERR_PTR(-EIO);
513 }
514 /* XXX: do romfs_checksum here too (with name) */
515
516 nextfh = be32_to_cpu(ri.next);
517 if ((nextfh & ROMFH_TYPE) != ROMFH_HRD)
518 break;
519
520 ino = be32_to_cpu(ri.spec) & ROMFH_MASK;
521 }
522
523 i->i_nlink = 1; /* Hard to decide.. */
524 i->i_size = be32_to_cpu(ri.size);
525 i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
526 i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
527
528 /* Precalculate the data offset */
529 ret = romfs_strnlen(i, ino + ROMFH_SIZE, ROMFS_MAXFN);
530 if (ret >= 0)
531 ino = (ROMFH_SIZE + ret + 1 + ROMFH_PAD) & ROMFH_MASK;
532 else
533 ino = 0;
534
535 ROMFS_I(i)->i_metasize = ino;
536 ROMFS_I(i)->i_dataoffset = ino+(i->i_ino&ROMFH_MASK);
537
538 /* Compute permissions */
539 ino = romfs_modemap[nextfh & ROMFH_TYPE];
540 /* only "normal" files have ops */
541 switch (nextfh & ROMFH_TYPE) {
542 case 1:
543 i->i_size = ROMFS_I(i)->i_metasize;
544 i->i_op = &romfs_dir_inode_operations;
545 i->i_fop = &romfs_dir_operations;
546 if (nextfh & ROMFH_EXEC)
547 ino |= S_IXUGO;
548 i->i_mode = ino;
549 break;
550 case 2:
551 i->i_fop = &generic_ro_fops;
552 i->i_data.a_ops = &romfs_aops;
553 if (nextfh & ROMFH_EXEC)
554 ino |= S_IXUGO;
555 i->i_mode = ino;
556 break;
557 case 3:
558 i->i_op = &page_symlink_inode_operations;
559 i->i_data.a_ops = &romfs_aops;
560 i->i_mode = ino | S_IRWXUGO;
561 break;
562 default:
563 /* depending on MBZ for sock/fifos */
564 nextfh = be32_to_cpu(ri.spec);
565 init_special_inode(i, ino,
566 MKDEV(nextfh>>16,nextfh&0xffff));
567 }
568 unlock_new_inode(i);
569 return i;
570}
571
572static struct kmem_cache * romfs_inode_cachep;
573
574static struct inode *romfs_alloc_inode(struct super_block *sb)
575{
576 struct romfs_inode_info *ei;
577 ei = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
578 if (!ei)
579 return NULL;
580 return &ei->vfs_inode;
581}
582
583static void romfs_destroy_inode(struct inode *inode)
584{
585 kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
586}
587
588static void init_once(void *foo)
589{
590 struct romfs_inode_info *ei = foo;
591
592 inode_init_once(&ei->vfs_inode);
593}
594
595static int init_inodecache(void)
596{
597 romfs_inode_cachep = kmem_cache_create("romfs_inode_cache",
598 sizeof(struct romfs_inode_info),
599 0, (SLAB_RECLAIM_ACCOUNT|
600 SLAB_MEM_SPREAD),
601 init_once);
602 if (romfs_inode_cachep == NULL)
603 return -ENOMEM;
604 return 0;
605}
606
607static void destroy_inodecache(void)
608{
609 kmem_cache_destroy(romfs_inode_cachep);
610}
611
612static int romfs_remount(struct super_block *sb, int *flags, char *data)
613{
614 *flags |= MS_RDONLY;
615 return 0;
616}
617
618static const struct super_operations romfs_ops = {
619 .alloc_inode = romfs_alloc_inode,
620 .destroy_inode = romfs_destroy_inode,
621 .statfs = romfs_statfs,
622 .remount_fs = romfs_remount,
623};
624
625static int romfs_get_sb(struct file_system_type *fs_type,
626 int flags, const char *dev_name, void *data, struct vfsmount *mnt)
627{
628 return get_sb_bdev(fs_type, flags, dev_name, data, romfs_fill_super,
629 mnt);
630}
631
632static struct file_system_type romfs_fs_type = {
633 .owner = THIS_MODULE,
634 .name = "romfs",
635 .get_sb = romfs_get_sb,
636 .kill_sb = kill_block_super,
637 .fs_flags = FS_REQUIRES_DEV,
638};
639
640static int __init init_romfs_fs(void)
641{
642 int err = init_inodecache();
643 if (err)
644 goto out1;
645 err = register_filesystem(&romfs_fs_type);
646 if (err)
647 goto out;
648 return 0;
649out:
650 destroy_inodecache();
651out1:
652 return err;
653}
654
655static void __exit exit_romfs_fs(void)
656{
657 unregister_filesystem(&romfs_fs_type);
658 destroy_inodecache();
659}
660
661/* Yes, works even as a module... :) */
662
663module_init(init_romfs_fs)
664module_exit(exit_romfs_fs)
665MODULE_LICENSE("GPL");
diff --git a/fs/romfs/internal.h b/fs/romfs/internal.h
new file mode 100644
index 000000000000..06044a9dc62d
--- /dev/null
+++ b/fs/romfs/internal.h
@@ -0,0 +1,47 @@
1/* RomFS internal definitions
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/romfs_fs.h>
13
14struct romfs_inode_info {
15 struct inode vfs_inode;
16 unsigned long i_metasize; /* size of non-data area */
17 unsigned long i_dataoffset; /* from the start of fs */
18};
19
20static inline size_t romfs_maxsize(struct super_block *sb)
21{
22 return (size_t) (unsigned long) sb->s_fs_info;
23}
24
25static inline struct romfs_inode_info *ROMFS_I(struct inode *inode)
26{
27 return container_of(inode, struct romfs_inode_info, vfs_inode);
28}
29
30/*
31 * mmap-nommu.c
32 */
33#if !defined(CONFIG_MMU) && defined(CONFIG_ROMFS_ON_MTD)
34extern const struct file_operations romfs_ro_fops;
35#else
36#define romfs_ro_fops generic_ro_fops
37#endif
38
39/*
40 * storage.c
41 */
42extern int romfs_dev_read(struct super_block *sb, unsigned long pos,
43 void *buf, size_t buflen);
44extern ssize_t romfs_dev_strnlen(struct super_block *sb,
45 unsigned long pos, size_t maxlen);
46extern int romfs_dev_strncmp(struct super_block *sb, unsigned long pos,
47 const char *str, size_t size);
diff --git a/fs/romfs/mmap-nommu.c b/fs/romfs/mmap-nommu.c
new file mode 100644
index 000000000000..f0511e816967
--- /dev/null
+++ b/fs/romfs/mmap-nommu.c
@@ -0,0 +1,75 @@
1/* NOMMU mmap support for RomFS on MTD devices
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/mm.h>
13#include <linux/mtd/super.h>
14#include "internal.h"
15
16/*
17 * try to determine where a shared mapping can be made
  18 * - only supported for NOMMU at the moment (MMU doesn't copy private
  19 *   mappings)
20 * - attempts to map through to the underlying MTD device
21 */
22static unsigned long romfs_get_unmapped_area(struct file *file,
23 unsigned long addr,
24 unsigned long len,
25 unsigned long pgoff,
26 unsigned long flags)
27{
28 struct inode *inode = file->f_mapping->host;
29 struct mtd_info *mtd = inode->i_sb->s_mtd;
30 unsigned long isize, offset;
31
32 if (!mtd)
33 goto cant_map_directly;
34
35 isize = i_size_read(inode);
36 offset = pgoff << PAGE_SHIFT;
37 if (offset > isize || len > isize || offset > isize - len)
38 return (unsigned long) -EINVAL;
39
40 /* we need to call down to the MTD layer to do the actual mapping */
41 if (mtd->get_unmapped_area) {
42 if (addr != 0)
43 return (unsigned long) -EINVAL;
44
45 if (len > mtd->size || pgoff >= (mtd->size >> PAGE_SHIFT))
46 return (unsigned long) -EINVAL;
47
48 offset += ROMFS_I(inode)->i_dataoffset;
49 if (offset > mtd->size - len)
50 return (unsigned long) -EINVAL;
51
52 return mtd->get_unmapped_area(mtd, len, offset, flags);
53 }
54
55cant_map_directly:
56 return (unsigned long) -ENOSYS;
57}
58
59/*
60 * permit a R/O mapping to be made directly through onto an MTD device if
61 * possible
62 */
63static int romfs_mmap(struct file *file, struct vm_area_struct *vma)
64{
65 return vma->vm_flags & (VM_SHARED | VM_MAYSHARE) ? 0 : -ENOSYS;
66}
67
68const struct file_operations romfs_ro_fops = {
69 .llseek = generic_file_llseek,
70 .read = do_sync_read,
71 .aio_read = generic_file_aio_read,
72 .splice_read = generic_file_splice_read,
73 .mmap = romfs_mmap,
74 .get_unmapped_area = romfs_get_unmapped_area,
75};
diff --git a/fs/romfs/storage.c b/fs/romfs/storage.c
new file mode 100644
index 000000000000..7e3e1e12a081
--- /dev/null
+++ b/fs/romfs/storage.c
@@ -0,0 +1,261 @@
1/* RomFS storage access routines
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public License
8 * as published by the Free Software Foundation; either version
9 * 2 of the License, or (at your option) any later version.
10 */
11
12#include <linux/fs.h>
13#include <linux/mtd/super.h>
14#include <linux/buffer_head.h>
15#include "internal.h"
16
17#if !defined(CONFIG_ROMFS_ON_MTD) && !defined(CONFIG_ROMFS_ON_BLOCK)
18#error no ROMFS backing store interface configured
19#endif
20
21#ifdef CONFIG_ROMFS_ON_MTD
22#define ROMFS_MTD_READ(sb, ...) ((sb)->s_mtd->read((sb)->s_mtd, ##__VA_ARGS__))
23
24/*
  25 * read data from a romfs image on an MTD device
26 */
27static int romfs_mtd_read(struct super_block *sb, unsigned long pos,
28 void *buf, size_t buflen)
29{
30 size_t rlen;
31 int ret;
32
33 ret = ROMFS_MTD_READ(sb, pos, buflen, &rlen, buf);
34 return (ret < 0 || rlen != buflen) ? -EIO : 0;
35}
36
37/*
38 * determine the length of a string in a romfs image on an MTD device
39 */
40static ssize_t romfs_mtd_strnlen(struct super_block *sb,
41 unsigned long pos, size_t maxlen)
42{
43 ssize_t n = 0;
44 size_t segment;
45 u_char buf[16], *p;
46 size_t len;
47 int ret;
48
49 /* scan the string up to 16 bytes at a time */
50 while (maxlen > 0) {
51 segment = min_t(size_t, maxlen, 16);
52 ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf);
53 if (ret < 0)
54 return ret;
55 p = memchr(buf, 0, len);
56 if (p)
57 return n + (p - buf);
58 maxlen -= len;
59 pos += len;
60 n += len;
61 }
62
63 return n;
64}
65
66/*
67 * compare a string to one in a romfs image on MTD
68 * - return 1 if matched, 0 if differ, -ve if error
69 */
70static int romfs_mtd_strncmp(struct super_block *sb, unsigned long pos,
71 const char *str, size_t size)
72{
73 u_char buf[16];
74 size_t len, segment;
75 int ret;
76
77 /* scan the string up to 16 bytes at a time */
78 while (size > 0) {
79 segment = min_t(size_t, size, 16);
80 ret = ROMFS_MTD_READ(sb, pos, segment, &len, buf);
81 if (ret < 0)
82 return ret;
83 if (memcmp(buf, str, len) != 0)
84 return 0;
85 size -= len;
86 pos += len;
87 str += len;
88 }
89
90 return 1;
91}
92#endif /* CONFIG_ROMFS_ON_MTD */
93
94#ifdef CONFIG_ROMFS_ON_BLOCK
95/*
  96 * read data from a romfs image on a block device
97 */
98static int romfs_blk_read(struct super_block *sb, unsigned long pos,
99 void *buf, size_t buflen)
100{
101 struct buffer_head *bh;
102 unsigned long offset;
103 size_t segment;
104
 105	/* copy the data up to blocksize bytes at a time */
106 while (buflen > 0) {
107 offset = pos & (ROMBSIZE - 1);
108 segment = min_t(size_t, buflen, ROMBSIZE - offset);
109 bh = sb_bread(sb, pos >> ROMBSBITS);
110 if (!bh)
111 return -EIO;
112 memcpy(buf, bh->b_data + offset, segment);
113 brelse(bh);
114 buflen -= segment;
115 pos += segment;
116 }
117
118 return 0;
119}
120
121/*
122 * determine the length of a string in romfs on a block device
123 */
124static ssize_t romfs_blk_strnlen(struct super_block *sb,
125 unsigned long pos, size_t limit)
126{
127 struct buffer_head *bh;
128 unsigned long offset;
129 ssize_t n = 0;
130 size_t segment;
131 u_char *buf, *p;
132
133 /* scan the string up to blocksize bytes at a time */
134 while (limit > 0) {
135 offset = pos & (ROMBSIZE - 1);
136 segment = min_t(size_t, limit, ROMBSIZE - offset);
137 bh = sb_bread(sb, pos >> ROMBSBITS);
138 if (!bh)
139 return -EIO;
140 buf = bh->b_data + offset;
141 p = memchr(buf, 0, segment);
142 brelse(bh);
143 if (p)
144 return n + (p - buf);
145 limit -= segment;
146 pos += segment;
147 n += segment;
148 }
149
150 return n;
151}
152
153/*
154 * compare a string to one in a romfs image on a block device
155 * - return 1 if matched, 0 if differ, -ve if error
156 */
157static int romfs_blk_strncmp(struct super_block *sb, unsigned long pos,
158 const char *str, size_t size)
159{
160 struct buffer_head *bh;
161 unsigned long offset;
162 size_t segment;
163 bool x;
164
 165	/* scan the string up to blocksize bytes at a time */
166 while (size > 0) {
167 offset = pos & (ROMBSIZE - 1);
168 segment = min_t(size_t, size, ROMBSIZE - offset);
169 bh = sb_bread(sb, pos >> ROMBSBITS);
170 if (!bh)
171 return -EIO;
172 x = (memcmp(bh->b_data + offset, str, segment) != 0);
173 brelse(bh);
174 if (x)
175 return 0;
176 size -= segment;
177 pos += segment;
178 str += segment;
179 }
180
181 return 1;
182}
183#endif /* CONFIG_ROMFS_ON_BLOCK */
184
185/*
186 * read data from the romfs image
187 */
188int romfs_dev_read(struct super_block *sb, unsigned long pos,
189 void *buf, size_t buflen)
190{
191 size_t limit;
192
193 limit = romfs_maxsize(sb);
194 if (pos >= limit)
195 return -EIO;
196 if (buflen > limit - pos)
197 buflen = limit - pos;
198
199#ifdef CONFIG_ROMFS_ON_MTD
200 if (sb->s_mtd)
201 return romfs_mtd_read(sb, pos, buf, buflen);
202#endif
203#ifdef CONFIG_ROMFS_ON_BLOCK
204 if (sb->s_bdev)
205 return romfs_blk_read(sb, pos, buf, buflen);
206#endif
207 return -EIO;
208}
209
210/*
211 * determine the length of a string in romfs
212 */
213ssize_t romfs_dev_strnlen(struct super_block *sb,
214 unsigned long pos, size_t maxlen)
215{
216 size_t limit;
217
218 limit = romfs_maxsize(sb);
219 if (pos >= limit)
220 return -EIO;
221 if (maxlen > limit - pos)
222 maxlen = limit - pos;
223
224#ifdef CONFIG_ROMFS_ON_MTD
225 if (sb->s_mtd)
 226		return romfs_mtd_strnlen(sb, pos, maxlen);
227#endif
228#ifdef CONFIG_ROMFS_ON_BLOCK
229 if (sb->s_bdev)
 230		return romfs_blk_strnlen(sb, pos, maxlen);
231#endif
232 return -EIO;
233}
234
235/*
236 * compare a string to one in romfs
237 * - return 1 if matched, 0 if differ, -ve if error
238 */
239int romfs_dev_strncmp(struct super_block *sb, unsigned long pos,
240 const char *str, size_t size)
241{
242 size_t limit;
243
244 limit = romfs_maxsize(sb);
245 if (pos >= limit)
246 return -EIO;
247 if (size > ROMFS_MAXFN)
248 return -ENAMETOOLONG;
249 if (size > limit - pos)
250 return -EIO;
251
252#ifdef CONFIG_ROMFS_ON_MTD
253 if (sb->s_mtd)
254 return romfs_mtd_strncmp(sb, pos, str, size);
255#endif
256#ifdef CONFIG_ROMFS_ON_BLOCK
257 if (sb->s_bdev)
258 return romfs_blk_strncmp(sb, pos, str, size);
259#endif
260 return -EIO;
261}
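/*
 * A typical caller treats the three accessors above as a small
 * protocol; for instance, directory lookup (see romfs_lookup() in
 * super.c) walks the file headers with:
 *
 *	ret = romfs_dev_strncmp(dir->i_sb, offset + ROMFH_SIZE, name, len);
 *	if (ret < 0)
 *		goto error;	// I/O error or name too long
 *	if (ret == 1)
 *		break;		// name matched this entry
 *	// ret == 0: no match, advance to the next entry
 */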
diff --git a/fs/romfs/super.c b/fs/romfs/super.c
new file mode 100644
index 000000000000..10ca7d984a8b
--- /dev/null
+++ b/fs/romfs/super.c
@@ -0,0 +1,653 @@
1/* Block- or MTD-based romfs
2 *
3 * Copyright © 2007 Red Hat, Inc. All Rights Reserved.
4 * Written by David Howells (dhowells@redhat.com)
5 *
6 * Derived from: ROMFS file system, Linux implementation
7 *
8 * Copyright © 1997-1999 Janos Farkas <chexum@shadow.banki.hu>
9 *
10 * Using parts of the minix filesystem
11 * Copyright © 1991, 1992 Linus Torvalds
12 *
13 * and parts of the affs filesystem additionally
14 * Copyright © 1993 Ray Burr
15 * Copyright © 1996 Hans-Joachim Widmaier
16 *
17 * Changes
18 * Changed for 2.1.19 modules
19 * Jan 1997 Initial release
20 * Jun 1997 2.1.43+ changes
21 * Proper page locking in readpage
22 * Changed to work with 2.1.45+ fs
23 * Jul 1997 Fixed follow_link
24 * 2.1.47
25 * lookup shouldn't return -ENOENT
26 * from Horst von Brand:
27 * fail on wrong checksum
28 * double unlock_super was possible
29 * correct namelen for statfs
30 * spotted by Bill Hawes:
31 * readlink shouldn't iput()
32 * Jun 1998 2.1.106 from Avery Pennarun: glibc scandir()
33 * exposed a problem in readdir
34 * 2.1.107 code-freeze spellchecker run
35 * Aug 1998 2.1.118+ VFS changes
36 * Sep 1998 2.1.122 another VFS change (follow_link)
37 * Apr 1999 2.2.7 no more EBADF checking in
38 * lookup/readdir, use ERR_PTR
39 * Jun 1999 2.3.6 d_alloc_root use changed
40 * 2.3.9 clean up usage of ENOENT/negative
41 * dentries in lookup
42 * clean up page flags setting
  43 *					(error, uptodate, locking)
  44 *					in readpage
45 * use init_special_inode for
46 * fifos/sockets (and streamline) in
47 * read_inode, fix _ops table order
48 * Aug 1999 2.3.16 __initfunc() => __init change
49 * Oct 1999 2.3.24 page->owner hack obsoleted
50 * Nov 1999 2.3.27 2.3.25+ page->offset => index change
51 *
52 *
53 * This program is free software; you can redistribute it and/or
54 * modify it under the terms of the GNU General Public Licence
55 * as published by the Free Software Foundation; either version
56 * 2 of the Licence, or (at your option) any later version.
57 */
58
59#include <linux/module.h>
60#include <linux/string.h>
61#include <linux/fs.h>
62#include <linux/time.h>
63#include <linux/slab.h>
64#include <linux/init.h>
65#include <linux/blkdev.h>
66#include <linux/parser.h>
67#include <linux/mount.h>
68#include <linux/namei.h>
69#include <linux/statfs.h>
70#include <linux/mtd/super.h>
71#include <linux/ctype.h>
72#include <linux/highmem.h>
73#include <linux/pagemap.h>
74#include <linux/uaccess.h>
75#include "internal.h"
76
77static struct kmem_cache *romfs_inode_cachep;
78
79static const umode_t romfs_modemap[8] = {
80 0, /* hard link */
81 S_IFDIR | 0644, /* directory */
82 S_IFREG | 0644, /* regular file */
83 S_IFLNK | 0777, /* symlink */
84 S_IFBLK | 0600, /* blockdev */
85 S_IFCHR | 0600, /* chardev */
86 S_IFSOCK | 0644, /* socket */
87 S_IFIFO | 0644 /* FIFO */
88};
89
90static const unsigned char romfs_dtype_table[] = {
91 DT_UNKNOWN, DT_DIR, DT_REG, DT_LNK, DT_BLK, DT_CHR, DT_SOCK, DT_FIFO
92};
93
94static struct inode *romfs_iget(struct super_block *sb, unsigned long pos);
95
96/*
97 * read a page worth of data from the image
98 */
99static int romfs_readpage(struct file *file, struct page *page)
100{
101 struct inode *inode = page->mapping->host;
102 loff_t offset, size;
103 unsigned long fillsize, pos;
104 void *buf;
105 int ret;
106
107 buf = kmap(page);
108 if (!buf)
109 return -ENOMEM;
110
111 /* 32 bit warning -- but not for us :) */
112 offset = page_offset(page);
113 size = i_size_read(inode);
114 fillsize = 0;
115 ret = 0;
116 if (offset < size) {
117 size -= offset;
118 fillsize = size > PAGE_SIZE ? PAGE_SIZE : size;
119
120 pos = ROMFS_I(inode)->i_dataoffset + offset;
121
122 ret = romfs_dev_read(inode->i_sb, pos, buf, fillsize);
123 if (ret < 0) {
124 SetPageError(page);
125 fillsize = 0;
126 ret = -EIO;
127 }
128 }
129
130 if (fillsize < PAGE_SIZE)
131 memset(buf + fillsize, 0, PAGE_SIZE - fillsize);
132 if (ret == 0)
133 SetPageUptodate(page);
134
135 flush_dcache_page(page);
136 kunmap(page);
137 unlock_page(page);
138 return ret;
139}
140
141static const struct address_space_operations romfs_aops = {
142 .readpage = romfs_readpage
143};
144
145/*
146 * read the entries from a directory
147 */
148static int romfs_readdir(struct file *filp, void *dirent, filldir_t filldir)
149{
150 struct inode *i = filp->f_dentry->d_inode;
151 struct romfs_inode ri;
152 unsigned long offset, maxoff;
153 int j, ino, nextfh;
154 int stored = 0;
155 char fsname[ROMFS_MAXFN]; /* XXX dynamic? */
156 int ret;
157
158 maxoff = romfs_maxsize(i->i_sb);
159
160 offset = filp->f_pos;
161 if (!offset) {
162 offset = i->i_ino & ROMFH_MASK;
163 ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
164 if (ret < 0)
165 goto out;
166 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
167 }
168
169 /* Not really failsafe, but we are read-only... */
170 for (;;) {
171 if (!offset || offset >= maxoff) {
172 offset = maxoff;
173 filp->f_pos = offset;
174 goto out;
175 }
176 filp->f_pos = offset;
177
178 /* Fetch inode info */
179 ret = romfs_dev_read(i->i_sb, offset, &ri, ROMFH_SIZE);
180 if (ret < 0)
181 goto out;
182
183 j = romfs_dev_strnlen(i->i_sb, offset + ROMFH_SIZE,
184 sizeof(fsname) - 1);
185 if (j < 0)
186 goto out;
187
188 ret = romfs_dev_read(i->i_sb, offset + ROMFH_SIZE, fsname, j);
189 if (ret < 0)
190 goto out;
191 fsname[j] = '\0';
192
193 ino = offset;
194 nextfh = be32_to_cpu(ri.next);
195 if ((nextfh & ROMFH_TYPE) == ROMFH_HRD)
196 ino = be32_to_cpu(ri.spec);
197 if (filldir(dirent, fsname, j, offset, ino,
198 romfs_dtype_table[nextfh & ROMFH_TYPE]) < 0)
199 goto out;
200
201 stored++;
202 offset = nextfh & ROMFH_MASK;
203 }
204
205out:
206 return stored;
207}
208
209/*
210 * look up an entry in a directory
211 */
212static struct dentry *romfs_lookup(struct inode *dir, struct dentry *dentry,
213 struct nameidata *nd)
214{
215 unsigned long offset, maxoff;
216 struct inode *inode;
217 struct romfs_inode ri;
218 const char *name; /* got from dentry */
219 int len, ret;
220
221 offset = dir->i_ino & ROMFH_MASK;
222 ret = romfs_dev_read(dir->i_sb, offset, &ri, ROMFH_SIZE);
223 if (ret < 0)
224 goto error;
225
226 /* search all the file entries in the list starting from the one
227 * pointed to by the directory's special data */
228 maxoff = romfs_maxsize(dir->i_sb);
229 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
230
231 name = dentry->d_name.name;
232 len = dentry->d_name.len;
233
234 for (;;) {
235 if (!offset || offset >= maxoff)
236 goto out0;
237
238 ret = romfs_dev_read(dir->i_sb, offset, &ri, sizeof(ri));
239 if (ret < 0)
240 goto error;
241
242 /* try to match the first 16 bytes of name */
243 ret = romfs_dev_strncmp(dir->i_sb, offset + ROMFH_SIZE, name,
244 len);
245 if (ret < 0)
246 goto error;
247 if (ret == 1)
248 break;
249
250 /* next entry */
251 offset = be32_to_cpu(ri.next) & ROMFH_MASK;
252 }
253
254 /* Hard link handling */
255 if ((be32_to_cpu(ri.next) & ROMFH_TYPE) == ROMFH_HRD)
256 offset = be32_to_cpu(ri.spec) & ROMFH_MASK;
257
258 inode = romfs_iget(dir->i_sb, offset);
259 if (IS_ERR(inode)) {
260 ret = PTR_ERR(inode);
261 goto error;
262 }
263 goto outi;
264
265 /*
266 * it's a bit funky, _lookup needs to return an error code
267 * (negative) or a NULL, both as a dentry. ENOENT should not
268 * be returned, instead we need to create a negative dentry by
269 * d_add(dentry, NULL); and return 0 as no error.
 270 * (Although as far as I can see, it only matters on writable file
271 * systems).
272 */
273out0:
274 inode = NULL;
275outi:
276 d_add(dentry, inode);
277 ret = 0;
278error:
279 return ERR_PTR(ret);
280}
281
282static const struct file_operations romfs_dir_operations = {
283 .read = generic_read_dir,
284 .readdir = romfs_readdir,
285};
286
287static const struct inode_operations romfs_dir_inode_operations = {
288 .lookup = romfs_lookup,
289};
290
291/*
292 * get a romfs inode based on its position in the image (which doubles as the
293 * inode number)
294 */
295static struct inode *romfs_iget(struct super_block *sb, unsigned long pos)
296{
297 struct romfs_inode_info *inode;
298 struct romfs_inode ri;
299 struct inode *i;
300 unsigned long nlen;
 301 unsigned nextfh; int ret; /* ret must be signed: romfs_dev_read() returns -errno */
302 umode_t mode;
303
304 /* we might have to traverse a chain of "hard link" file entries to get
305 * to the actual file */
306 for (;;) {
307 ret = romfs_dev_read(sb, pos, &ri, sizeof(ri));
308 if (ret < 0)
309 goto error;
310
311 /* XXX: do romfs_checksum here too (with name) */
312
313 nextfh = be32_to_cpu(ri.next);
314 if ((nextfh & ROMFH_TYPE) != ROMFH_HRD)
315 break;
316
317 pos = be32_to_cpu(ri.spec) & ROMFH_MASK;
318 }
319
320 /* determine the length of the filename */
321 nlen = romfs_dev_strnlen(sb, pos + ROMFH_SIZE, ROMFS_MAXFN);
322 if (IS_ERR_VALUE(nlen))
323 goto eio;
324
325 /* get an inode for this image position */
326 i = iget_locked(sb, pos);
327 if (!i)
328 return ERR_PTR(-ENOMEM);
329
330 if (!(i->i_state & I_NEW))
331 return i;
332
333 /* precalculate the data offset */
334 inode = ROMFS_I(i);
335 inode->i_metasize = (ROMFH_SIZE + nlen + 1 + ROMFH_PAD) & ROMFH_MASK;
336 inode->i_dataoffset = pos + inode->i_metasize;
337
338 i->i_nlink = 1; /* Hard to decide.. */
339 i->i_size = be32_to_cpu(ri.size);
340 i->i_mtime.tv_sec = i->i_atime.tv_sec = i->i_ctime.tv_sec = 0;
341 i->i_mtime.tv_nsec = i->i_atime.tv_nsec = i->i_ctime.tv_nsec = 0;
342
343 /* set up mode and ops */
344 mode = romfs_modemap[nextfh & ROMFH_TYPE];
345
346 switch (nextfh & ROMFH_TYPE) {
347 case ROMFH_DIR:
348 i->i_size = ROMFS_I(i)->i_metasize;
349 i->i_op = &romfs_dir_inode_operations;
350 i->i_fop = &romfs_dir_operations;
351 if (nextfh & ROMFH_EXEC)
352 mode |= S_IXUGO;
353 break;
354 case ROMFH_REG:
355 i->i_fop = &romfs_ro_fops;
356 i->i_data.a_ops = &romfs_aops;
357 if (i->i_sb->s_mtd)
358 i->i_data.backing_dev_info =
359 i->i_sb->s_mtd->backing_dev_info;
360 if (nextfh & ROMFH_EXEC)
361 mode |= S_IXUGO;
362 break;
363 case ROMFH_SYM:
364 i->i_op = &page_symlink_inode_operations;
365 i->i_data.a_ops = &romfs_aops;
366 mode |= S_IRWXUGO;
367 break;
368 default:
369 /* depending on MBZ for sock/fifos */
370 nextfh = be32_to_cpu(ri.spec);
371 init_special_inode(i, mode, MKDEV(nextfh >> 16,
372 nextfh & 0xffff));
373 break;
374 }
375
376 i->i_mode = mode;
377
378 unlock_new_inode(i);
379 return i;
380
381eio:
382 ret = -EIO;
383error:
384 printk(KERN_ERR "ROMFS: read error for inode 0x%lx\n", pos);
385 return ERR_PTR(ret);
386}
387
388/*
389 * allocate a new inode
390 */
391static struct inode *romfs_alloc_inode(struct super_block *sb)
392{
393 struct romfs_inode_info *inode;
394 inode = kmem_cache_alloc(romfs_inode_cachep, GFP_KERNEL);
395 return inode ? &inode->vfs_inode : NULL;
396}
397
398/*
399 * return a spent inode to the slab cache
400 */
401static void romfs_destroy_inode(struct inode *inode)
402{
403 kmem_cache_free(romfs_inode_cachep, ROMFS_I(inode));
404}
405
406/*
407 * get filesystem statistics
408 */
409static int romfs_statfs(struct dentry *dentry, struct kstatfs *buf)
410{
411 struct super_block *sb = dentry->d_sb;
 412 u64 id = sb->s_bdev ? huge_encode_dev(sb->s_bdev->bd_dev) : 0; /* no bdev on MTD */
413
414 buf->f_type = ROMFS_MAGIC;
415 buf->f_namelen = ROMFS_MAXFN;
416 buf->f_bsize = ROMBSIZE;
 417 buf->f_bfree = buf->f_bavail = buf->f_ffree = 0; /* read-only */
418 buf->f_blocks =
419 (romfs_maxsize(dentry->d_sb) + ROMBSIZE - 1) >> ROMBSBITS;
420 buf->f_fsid.val[0] = (u32)id;
421 buf->f_fsid.val[1] = (u32)(id >> 32);
422 return 0;
423}
424
425/*
426 * remounting must involve read-only
427 */
428static int romfs_remount(struct super_block *sb, int *flags, char *data)
429{
430 *flags |= MS_RDONLY;
431 return 0;
432}
433
434static const struct super_operations romfs_super_ops = {
435 .alloc_inode = romfs_alloc_inode,
436 .destroy_inode = romfs_destroy_inode,
437 .statfs = romfs_statfs,
438 .remount_fs = romfs_remount,
439};
440
441/*
442 * checksum check on part of a romfs filesystem
443 */
444static __u32 romfs_checksum(const void *data, int size)
445{
446 const __be32 *ptr = data;
447 __u32 sum;
448
449 sum = 0;
450 size >>= 2;
451 while (size > 0) {
452 sum += be32_to_cpu(*ptr++);
453 size--;
454 }
455 return sum;
456}
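
The mount-time check in romfs_fill_super() below depends on a property of the
image: summing the first min(image size, 512) bytes as big-endian 32-bit words
must give zero, because the image generator stores the negated word sum in the
superblock's checksum field (the fourth word). A minimal user-space sketch of
that generator-side step, with illustrative helper names that are not part of
the kernel source:

#include <stdint.h>
#include <stddef.h>
#include <arpa/inet.h>	/* ntohl()/htonl() */

/* same big-endian word sum as the kernel's romfs_checksum() */
static uint32_t romfs_wordsum(const void *data, size_t size)
{
	const uint32_t *ptr = data;
	uint32_t sum = 0;

	for (size >>= 2; size > 0; size--)
		sum += ntohl(*ptr++);
	return sum;
}

/* store the negated sum so that the kernel's check comes out to zero */
static void romfs_fix_checksum(uint8_t *image, size_t img_size)
{
	uint32_t *csum = (uint32_t *)(image + 12);	/* word 3: checksum */
	size_t len = img_size < 512 ? img_size : 512;

	*csum = 0;
	*csum = htonl(-romfs_wordsum(image, len));
}
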
457
458/*
459 * fill in the superblock
460 */
461static int romfs_fill_super(struct super_block *sb, void *data, int silent)
462{
463 struct romfs_super_block *rsb;
464 struct inode *root;
465 unsigned long pos, img_size;
466 const char *storage;
467 size_t len;
468 int ret;
469
470#ifdef CONFIG_BLOCK
471 if (!sb->s_mtd) {
472 sb_set_blocksize(sb, ROMBSIZE);
473 } else {
474 sb->s_blocksize = ROMBSIZE;
475 sb->s_blocksize_bits = blksize_bits(ROMBSIZE);
476 }
477#endif
478
479 sb->s_maxbytes = 0xFFFFFFFF;
480 sb->s_magic = ROMFS_MAGIC;
481 sb->s_flags |= MS_RDONLY | MS_NOATIME;
482 sb->s_op = &romfs_super_ops;
483
484 /* read the image superblock and check it */
485 rsb = kmalloc(512, GFP_KERNEL);
486 if (!rsb)
487 return -ENOMEM;
488
489 sb->s_fs_info = (void *) 512;
490 ret = romfs_dev_read(sb, 0, rsb, 512);
491 if (ret < 0)
492 goto error_rsb;
493
494 img_size = be32_to_cpu(rsb->size);
495
496 if (sb->s_mtd && img_size > sb->s_mtd->size)
497 goto error_rsb_inval;
498
499 sb->s_fs_info = (void *) img_size;
500
501 if (rsb->word0 != ROMSB_WORD0 || rsb->word1 != ROMSB_WORD1 ||
502 img_size < ROMFH_SIZE) {
503 if (!silent)
504 printk(KERN_WARNING "VFS:"
505 " Can't find a romfs filesystem on dev %s.\n",
506 sb->s_id);
507 goto error_rsb_inval;
508 }
509
510 if (romfs_checksum(rsb, min_t(size_t, img_size, 512))) {
511 printk(KERN_ERR "ROMFS: bad initial checksum on dev %s.\n",
512 sb->s_id);
513 goto error_rsb_inval;
514 }
515
516 storage = sb->s_mtd ? "MTD" : "the block layer";
517
518 len = strnlen(rsb->name, ROMFS_MAXFN);
519 if (!silent)
520 printk(KERN_NOTICE "ROMFS: Mounting image '%*.*s' through %s\n",
521 (unsigned) len, (unsigned) len, rsb->name, storage);
522
523 kfree(rsb);
524 rsb = NULL;
525
526 /* find the root directory */
527 pos = (ROMFH_SIZE + len + 1 + ROMFH_PAD) & ROMFH_MASK;
528
529 root = romfs_iget(sb, pos);
 530 if (IS_ERR(root))
531 goto error;
532
533 sb->s_root = d_alloc_root(root);
534 if (!sb->s_root)
535 goto error_i;
536
537 return 0;
538
539error_i:
540 iput(root);
541error:
542 return -EINVAL;
543error_rsb_inval:
544 ret = -EINVAL;
545error_rsb:
546 return ret;
547}
548
549/*
550 * get a superblock for mounting
551 */
552static int romfs_get_sb(struct file_system_type *fs_type,
553 int flags, const char *dev_name,
554 void *data, struct vfsmount *mnt)
555{
556 int ret = -EINVAL;
557
558#ifdef CONFIG_ROMFS_ON_MTD
559 ret = get_sb_mtd(fs_type, flags, dev_name, data, romfs_fill_super,
560 mnt);
561#endif
562#ifdef CONFIG_ROMFS_ON_BLOCK
563 if (ret == -EINVAL)
564 ret = get_sb_bdev(fs_type, flags, dev_name, data,
565 romfs_fill_super, mnt);
566#endif
567 return ret;
568}
569
570/*
571 * destroy a romfs superblock in the appropriate manner
572 */
573static void romfs_kill_sb(struct super_block *sb)
574{
575#ifdef CONFIG_ROMFS_ON_MTD
576 if (sb->s_mtd) {
577 kill_mtd_super(sb);
578 return;
579 }
580#endif
581#ifdef CONFIG_ROMFS_ON_BLOCK
582 if (sb->s_bdev) {
583 kill_block_super(sb);
584 return;
585 }
586#endif
587}
588
589static struct file_system_type romfs_fs_type = {
590 .owner = THIS_MODULE,
591 .name = "romfs",
592 .get_sb = romfs_get_sb,
593 .kill_sb = romfs_kill_sb,
594 .fs_flags = FS_REQUIRES_DEV,
595};
596
597/*
598 * inode storage initialiser
599 */
600static void romfs_i_init_once(void *_inode)
601{
602 struct romfs_inode_info *inode = _inode;
603
604 inode_init_once(&inode->vfs_inode);
605}
606
607/*
608 * romfs module initialisation
609 */
610static int __init init_romfs_fs(void)
611{
612 int ret;
613
614 printk(KERN_INFO "ROMFS MTD (C) 2007 Red Hat, Inc.\n");
615
616 romfs_inode_cachep =
617 kmem_cache_create("romfs_i",
618 sizeof(struct romfs_inode_info), 0,
619 SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD,
620 romfs_i_init_once);
621
622 if (!romfs_inode_cachep) {
623 printk(KERN_ERR
624 "ROMFS error: Failed to initialise inode cache\n");
625 return -ENOMEM;
626 }
627 ret = register_filesystem(&romfs_fs_type);
628 if (ret) {
629 printk(KERN_ERR "ROMFS error: Failed to register filesystem\n");
630 goto error_register;
631 }
632 return 0;
633
634error_register:
635 kmem_cache_destroy(romfs_inode_cachep);
636 return ret;
637}
638
639/*
640 * romfs module removal
641 */
642static void __exit exit_romfs_fs(void)
643{
644 unregister_filesystem(&romfs_fs_type);
645 kmem_cache_destroy(romfs_inode_cachep);
646}
647
648module_init(init_romfs_fs);
649module_exit(exit_romfs_fs);
650
651MODULE_DESCRIPTION("Direct-MTD Capable RomFS");
652MODULE_AUTHOR("Red Hat, Inc.");
653MODULE_LICENSE("GPL"); /* Actually dual-licensed, but it doesn't matter for */
diff --git a/fs/splice.c b/fs/splice.c
index dd727d43e5b7..c18aa7e03e2b 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -737,10 +737,19 @@ ssize_t splice_from_pipe(struct pipe_inode_info *pipe, struct file *out,
737 * ->write_end. Most of the time, these expect i_mutex to 737 * ->write_end. Most of the time, these expect i_mutex to
738 * be held. Since this may result in an ABBA deadlock with 738 * be held. Since this may result in an ABBA deadlock with
739 * pipe->inode, we have to order lock acquiry here. 739 * pipe->inode, we have to order lock acquiry here.
740 *
741 * Outer lock must be inode->i_mutex, as pipe_wait() will
742 * release and reacquire pipe->inode->i_mutex, AND inode must
743 * never be a pipe.
740 */ 744 */
741 inode_double_lock(inode, pipe->inode); 745 WARN_ON(S_ISFIFO(inode->i_mode));
746 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
747 if (pipe->inode)
748 mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD);
742 ret = __splice_from_pipe(pipe, &sd, actor); 749 ret = __splice_from_pipe(pipe, &sd, actor);
743 inode_double_unlock(inode, pipe->inode); 750 if (pipe->inode)
751 mutex_unlock(&pipe->inode->i_mutex);
752 mutex_unlock(&inode->i_mutex);
744 753
745 return ret; 754 return ret;
746} 755}
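
The comment above spells out the pattern that replaces inode_double_lock():
instead of locking two inodes in address order, the file's inode is always the
outer (parent) lock and the pipe's inode, when present, is always the inner
(child) lock, with lockdep annotations to match. A standalone sketch of the
same shape, assuming a hypothetical worker function:

static ssize_t do_locked(struct inode *inode, struct pipe_inode_info *pipe)
{
	ssize_t ret;

	/* the file's inode is always taken first ... */
	mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
	/* ... and the pipe's inode, if any, is always taken second */
	if (pipe->inode)
		mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD);

	ret = do_work(pipe);			/* hypothetical */

	/* release in reverse order */
	if (pipe->inode)
		mutex_unlock(&pipe->inode->i_mutex);
	mutex_unlock(&inode->i_mutex);
	return ret;
}
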
@@ -831,11 +840,17 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
831 }; 840 };
832 ssize_t ret; 841 ssize_t ret;
833 842
834 inode_double_lock(inode, pipe->inode); 843 WARN_ON(S_ISFIFO(inode->i_mode));
844 mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT);
835 ret = file_remove_suid(out); 845 ret = file_remove_suid(out);
836 if (likely(!ret)) 846 if (likely(!ret)) {
847 if (pipe->inode)
848 mutex_lock_nested(&pipe->inode->i_mutex, I_MUTEX_CHILD);
837 ret = __splice_from_pipe(pipe, &sd, pipe_to_file); 849 ret = __splice_from_pipe(pipe, &sd, pipe_to_file);
838 inode_double_unlock(inode, pipe->inode); 850 if (pipe->inode)
851 mutex_unlock(&pipe->inode->i_mutex);
852 }
853 mutex_unlock(&inode->i_mutex);
839 if (ret > 0) { 854 if (ret > 0) {
840 unsigned long nr_pages; 855 unsigned long nr_pages;
841 856
diff --git a/fs/squashfs/export.c b/fs/squashfs/export.c
index 69e971d5ddc1..2b1b8fe5e037 100644
--- a/fs/squashfs/export.c
+++ b/fs/squashfs/export.c
@@ -40,6 +40,7 @@
40#include <linux/dcache.h> 40#include <linux/dcache.h>
41#include <linux/exportfs.h> 41#include <linux/exportfs.h>
42#include <linux/zlib.h> 42#include <linux/zlib.h>
43#include <linux/slab.h>
43 44
44#include "squashfs_fs.h" 45#include "squashfs_fs.h"
45#include "squashfs_fs_sb.h" 46#include "squashfs_fs_sb.h"
diff --git a/fs/super.c b/fs/super.c
index 77cb4ec919b9..786fe7d72790 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -771,6 +771,46 @@ void kill_litter_super(struct super_block *sb)
771 771
772EXPORT_SYMBOL(kill_litter_super); 772EXPORT_SYMBOL(kill_litter_super);
773 773
774static int ns_test_super(struct super_block *sb, void *data)
775{
776 return sb->s_fs_info == data;
777}
778
779static int ns_set_super(struct super_block *sb, void *data)
780{
781 sb->s_fs_info = data;
782 return set_anon_super(sb, NULL);
783}
784
785int get_sb_ns(struct file_system_type *fs_type, int flags, void *data,
786 int (*fill_super)(struct super_block *, void *, int),
787 struct vfsmount *mnt)
788{
789 struct super_block *sb;
790
791 sb = sget(fs_type, ns_test_super, ns_set_super, data);
792 if (IS_ERR(sb))
793 return PTR_ERR(sb);
794
795 if (!sb->s_root) {
796 int err;
797 sb->s_flags = flags;
798 err = fill_super(sb, data, flags & MS_SILENT ? 1 : 0);
799 if (err) {
800 up_write(&sb->s_umount);
801 deactivate_super(sb);
802 return err;
803 }
804
805 sb->s_flags |= MS_ACTIVE;
806 }
807
808 simple_set_mnt(mnt, sb);
809 return 0;
810}
811
812EXPORT_SYMBOL(get_sb_ns);
813
774#ifdef CONFIG_BLOCK 814#ifdef CONFIG_BLOCK
775static int set_bdev_super(struct super_block *s, void *data) 815static int set_bdev_super(struct super_block *s, void *data)
776{ 816{
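
A hedged sketch of how a namespace-aware filesystem might use the new
get_sb_ns() helper: the namespace object is used as the sget() key, so each
namespace gets its own superblock instance. "myfs" and myfs_fill_super() are
hypothetical, and a per-network-namespace filesystem is assumed:

static int myfs_get_sb(struct file_system_type *fs_type, int flags,
		       const char *dev_name, void *data,
		       struct vfsmount *mnt)
{
	/* key the superblock on the caller's namespace object */
	return get_sb_ns(fs_type, flags, current->nsproxy->net_ns,
			 myfs_fill_super, mnt);
}
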
diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c
index f393620890ee..af1914462f02 100644
--- a/fs/ubifs/budget.c
+++ b/fs/ubifs/budget.c
@@ -194,29 +194,26 @@ static int make_free_space(struct ubifs_info *c)
194} 194}
195 195
196/** 196/**
197 * ubifs_calc_min_idx_lebs - calculate amount of eraseblocks for the index. 197 * ubifs_calc_min_idx_lebs - calculate amount of LEBs for the index.
198 * @c: UBIFS file-system description object 198 * @c: UBIFS file-system description object
199 * 199 *
200 * This function calculates and returns the number of eraseblocks which should 200 * This function calculates and returns the number of LEBs which should be kept
201 * be kept for index usage. 201 * for index usage.
202 */ 202 */
203int ubifs_calc_min_idx_lebs(struct ubifs_info *c) 203int ubifs_calc_min_idx_lebs(struct ubifs_info *c)
204{ 204{
205 int idx_lebs, eff_leb_size = c->leb_size - c->max_idx_node_sz; 205 int idx_lebs;
206 long long idx_size; 206 long long idx_size;
207 207
208 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx; 208 idx_size = c->old_idx_sz + c->budg_idx_growth + c->budg_uncommitted_idx;
209
210 /* And make sure we have thrice the index size of space reserved */ 209 /* And make sure we have thrice the index size of space reserved */
211 idx_size = idx_size + (idx_size << 1); 210 idx_size += idx_size << 1;
212
213 /* 211 /*
214 * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes' 212 * We do not maintain 'old_idx_size' as 'old_idx_lebs'/'old_idx_bytes'
215 * pair, nor similarly the two variables for the new index size, so we 213 * pair, nor similarly the two variables for the new index size, so we
216 * have to do this costly 64-bit division on fast-path. 214 * have to do this costly 64-bit division on fast-path.
217 */ 215 */
218 idx_size += eff_leb_size - 1; 216 idx_lebs = div_u64(idx_size + c->idx_leb_size - 1, c->idx_leb_size);
219 idx_lebs = div_u64(idx_size, eff_leb_size);
220 /* 217 /*
221 * The index head is not available for the in-the-gaps method, so add an 218 * The index head is not available for the in-the-gaps method, so add an
222 * extra LEB to compensate. 219 * extra LEB to compensate.
@@ -310,23 +307,23 @@ static int can_use_rp(struct ubifs_info *c)
310 * do_budget_space - reserve flash space for index and data growth. 307 * do_budget_space - reserve flash space for index and data growth.
311 * @c: UBIFS file-system description object 308 * @c: UBIFS file-system description object
312 * 309 *
313 * This function makes sure UBIFS has enough free eraseblocks for index growth 310 * This function makes sure UBIFS has enough free LEBs for index growth and
314 * and data. 311 * data.
315 * 312 *
316 * When budgeting index space, UBIFS reserves thrice as many LEBs as the index 313 * When budgeting index space, UBIFS reserves thrice as many LEBs as the index
317 * would take if it was consolidated and written to the flash. This guarantees 314 * would take if it was consolidated and written to the flash. This guarantees
318 * that the "in-the-gaps" commit method always succeeds and UBIFS will always 315 * that the "in-the-gaps" commit method always succeeds and UBIFS will always
319 * be able to commit dirty index. So this function basically adds amount of 316 * be able to commit dirty index. So this function basically adds amount of
320 * budgeted index space to the size of the current index, multiplies this by 3, 317 * budgeted index space to the size of the current index, multiplies this by 3,
321 * and makes sure this does not exceed the amount of free eraseblocks. 318 * and makes sure this does not exceed the amount of free LEBs.
322 * 319 *
323 * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables: 320 * Notes about @c->min_idx_lebs and @c->lst.idx_lebs variables:
324 * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might 321 * o @c->lst.idx_lebs is the number of LEBs the index currently uses. It might
325 * be large, because UBIFS does not do any index consolidation as long as 322 * be large, because UBIFS does not do any index consolidation as long as
326 * there is free space. IOW, the index may take a lot of LEBs, but the LEBs 323 * there is free space. IOW, the index may take a lot of LEBs, but the LEBs
327 * will contain a lot of dirt. 324 * will contain a lot of dirt.
328 * o @c->min_idx_lebs is the the index presumably takes. IOW, the index may be 325 * o @c->min_idx_lebs is the number of LEBS the index presumably takes. IOW,
329 * consolidated to take up to @c->min_idx_lebs LEBs. 326 * the index may be consolidated to take up to @c->min_idx_lebs LEBs.
330 * 327 *
331 * This function returns zero in case of success, and %-ENOSPC in case of 328 * This function returns zero in case of success, and %-ENOSPC in case of
332 * failure. 329 * failure.
@@ -695,12 +692,12 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free)
695 * This function calculates amount of free space to report to user-space. 692 * This function calculates amount of free space to report to user-space.
696 * 693 *
697 * Because UBIFS may introduce substantial overhead (the index, node headers, 694 * Because UBIFS may introduce substantial overhead (the index, node headers,
698 * alignment, wastage at the end of eraseblocks, etc), it cannot report real 695 * alignment, wastage at the end of LEBs, etc), it cannot report real amount of
699 * amount of free flash space it has (well, because not all dirty space is 696 * free flash space it has (well, because not all dirty space is reclaimable,
700 * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so, 697 * UBIFS does not actually know the real amount). If UBIFS did so, it would
 701 * it would break user expectations about what free space is. Users seem to 698 * break user expectations about what free space is. Users seem to be accustomed
 702 * be accustomed to assume that if the file-system reports N bytes of free space, 699 * to assume that if the file-system reports N bytes of free space, they would
703 * they would be able to fit a file of N bytes to the FS. This almost works for 700 * be able to fit a file of N bytes to the FS. This almost works for
704 * traditional file-systems, because they have way less overhead than UBIFS. 701 * traditional file-systems, because they have way less overhead than UBIFS.
705 * So, to keep users happy, UBIFS tries to take the overhead into account. 702 * So, to keep users happy, UBIFS tries to take the overhead into account.
706 */ 703 */
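
The new division is easy to check by hand. A worked example with assumed
geometry (128KiB LEBs, an 8KiB maximum index node, 6MiB of index, and
c->idx_leb_size taken to be the precalculated leb_size - max_idx_node_sz):

#include <stdio.h>

int main(void)
{
	long long idx_size = 6LL << 20;		/* old + budgeted + uncommitted */
	long long idx_leb_size = 131072 - 8192;	/* usable index bytes per LEB */
	long long idx_lebs;

	idx_size += idx_size << 1;		/* keep thrice the index size */
	idx_lebs = (idx_size + idx_leb_size - 1) / idx_leb_size;

	/* 18874368 / 122880 rounds up to 154, plus 1 for in-the-gaps */
	printf("min_idx_lebs = %lld\n", idx_lebs + 1);	/* prints 155 */
	return 0;
}
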
diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c
index e975bd82f38b..ce2cd8343618 100644
--- a/fs/ubifs/debug.c
+++ b/fs/ubifs/debug.c
@@ -479,9 +479,9 @@ void dbg_dump_node(const struct ubifs_info *c, const void *node)
479 "bad or corrupted node)"); 479 "bad or corrupted node)");
480 else { 480 else {
481 for (i = 0; i < nlen && dent->name[i]; i++) 481 for (i = 0; i < nlen && dent->name[i]; i++)
482 printk("%c", dent->name[i]); 482 printk(KERN_CONT "%c", dent->name[i]);
483 } 483 }
484 printk("\n"); 484 printk(KERN_CONT "\n");
485 485
486 break; 486 break;
487 } 487 }
@@ -1214,7 +1214,7 @@ static int dbg_check_znode(struct ubifs_info *c, struct ubifs_zbranch *zbr)
1214 1214
1215 /* 1215 /*
1216 * Make sure the last key in our znode is less or 1216 * Make sure the last key in our znode is less or
1217 * equivalent than the the key in zbranch which goes 1217 * equivalent than the key in the zbranch which goes
1218 * after our pointing zbranch. 1218 * after our pointing zbranch.
1219 */ 1219 */
1220 cmp = keys_cmp(c, max, 1220 cmp = keys_cmp(c, max,
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 0ff89fe71e51..6d34dc7e33e1 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -430,6 +430,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
430 struct ubifs_inode *ui = ubifs_inode(inode); 430 struct ubifs_inode *ui = ubifs_inode(inode);
431 pgoff_t index = pos >> PAGE_CACHE_SHIFT; 431 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
432 int uninitialized_var(err), appending = !!(pos + len > inode->i_size); 432 int uninitialized_var(err), appending = !!(pos + len > inode->i_size);
433 int skipped_read = 0;
433 struct page *page; 434 struct page *page;
434 435
435 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size); 436 ubifs_assert(ubifs_inode(inode)->ui_size == inode->i_size);
@@ -444,7 +445,7 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
444 445
445 if (!PageUptodate(page)) { 446 if (!PageUptodate(page)) {
446 /* The page is not loaded from the flash */ 447 /* The page is not loaded from the flash */
447 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) 448 if (!(pos & ~PAGE_CACHE_MASK) && len == PAGE_CACHE_SIZE) {
448 /* 449 /*
449 * We change whole page so no need to load it. But we 450 * We change whole page so no need to load it. But we
450 * have to set the @PG_checked flag to make the further 451 * have to set the @PG_checked flag to make the further
@@ -453,7 +454,8 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
453 * the media. 454 * the media.
454 */ 455 */
455 SetPageChecked(page); 456 SetPageChecked(page);
456 else { 457 skipped_read = 1;
458 } else {
457 err = do_readpage(page); 459 err = do_readpage(page);
458 if (err) { 460 if (err) {
459 unlock_page(page); 461 unlock_page(page);
@@ -470,6 +472,14 @@ static int ubifs_write_begin(struct file *file, struct address_space *mapping,
470 if (unlikely(err)) { 472 if (unlikely(err)) {
471 ubifs_assert(err == -ENOSPC); 473 ubifs_assert(err == -ENOSPC);
472 /* 474 /*
475 * If we skipped reading the page because we were going to
476 * write all of it, then it is not up to date.
477 */
478 if (skipped_read) {
479 ClearPageChecked(page);
480 ClearPageUptodate(page);
481 }
482 /*
473 * Budgeting failed which means it would have to force 483 * Budgeting failed which means it would have to force
474 * write-back but didn't, because we set the @fast flag in the 484 * write-back but didn't, because we set the @fast flag in the
475 * request. Write-back cannot be done now, while we have the 485 * request. Write-back cannot be done now, while we have the
@@ -949,7 +959,7 @@ static int do_writepage(struct page *page, int len)
 949 * whole index and correct all inode sizes, which is long and unacceptable. 959 * whole index and correct all inode sizes, which is long and unacceptable.
950 * 960 *
951 * To prevent situations like this, UBIFS writes pages back only if they are 961 * To prevent situations like this, UBIFS writes pages back only if they are
952 * within last synchronized inode size, i.e. the the size which has been 962 * within the last synchronized inode size, i.e. the size which has been
953 * written to the flash media last time. Otherwise, UBIFS forces inode 963 * written to the flash media last time. Otherwise, UBIFS forces inode
954 * write-back, thus making sure the on-flash inode contains current inode size, 964 * write-back, thus making sure the on-flash inode contains current inode size,
955 * and then keeps writing pages back. 965 * and then keeps writing pages back.
diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c
index 717d79c97c5e..1d54383d1269 100644
--- a/fs/ubifs/find.c
+++ b/fs/ubifs/find.c
@@ -478,7 +478,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
478 * ubifs_find_free_space - find a data LEB with free space. 478 * ubifs_find_free_space - find a data LEB with free space.
479 * @c: the UBIFS file-system description object 479 * @c: the UBIFS file-system description object
480 * @min_space: minimum amount of required free space 480 * @min_space: minimum amount of required free space
481 * @free: contains amount of free space in the LEB on exit 481 * @offs: contains offset of where free space starts on exit
482 * @squeeze: whether to try to find space in a non-empty LEB first 482 * @squeeze: whether to try to find space in a non-empty LEB first
483 * 483 *
484 * This function looks for an LEB with at least @min_space bytes of free space. 484 * This function looks for an LEB with at least @min_space bytes of free space.
@@ -490,7 +490,7 @@ const struct ubifs_lprops *do_find_free_space(struct ubifs_info *c,
 490 * failed to find a LEB with @min_space bytes of free space and other negative 490 * failed to find a LEB with @min_space bytes of free space and other negative
491 * error codes in case of failure. 491 * error codes in case of failure.
492 */ 492 */
493int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, 493int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
494 int squeeze) 494 int squeeze)
495{ 495{
496 const struct ubifs_lprops *lprops; 496 const struct ubifs_lprops *lprops;
@@ -558,10 +558,10 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
558 spin_unlock(&c->space_lock); 558 spin_unlock(&c->space_lock);
559 } 559 }
560 560
561 *free = lprops->free; 561 *offs = c->leb_size - lprops->free;
562 ubifs_release_lprops(c); 562 ubifs_release_lprops(c);
563 563
564 if (*free == c->leb_size) { 564 if (*offs == 0) {
565 /* 565 /*
566 * Ensure that empty LEBs have been unmapped. They may not have 566 * Ensure that empty LEBs have been unmapped. They may not have
567 * been, for example, because of an unclean unmount. Also 567 * been, for example, because of an unclean unmount. Also
@@ -573,8 +573,8 @@ int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free,
573 return err; 573 return err;
574 } 574 }
575 575
576 dbg_find("found LEB %d, free %d", lnum, *free); 576 dbg_find("found LEB %d, free %d", lnum, c->leb_size - *offs);
577 ubifs_assert(*free >= min_space); 577 ubifs_assert(*offs <= c->leb_size - min_space);
578 return lnum; 578 return lnum;
579 579
580out: 580out:
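
In short, the contract changes from "return how much of the LEB is free" to
"return the offset where writing may start". The reserve_space() hunk in
fs/ubifs/journal.c further down is the caller updated to match; schematically,
reusing that function's own variables:

	int lnum, offs;

	/* new convention: *offs is the first free byte in the returned LEB */
	lnum = ubifs_find_free_space(c, len, &offs, squeeze);
	if (lnum >= 0) {
		/* previously the caller derived: offs = c->leb_size - free; */
		err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
	}
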
diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c
index a711d33b3d3e..f0f5f15d384e 100644
--- a/fs/ubifs/gc.c
+++ b/fs/ubifs/gc.c
@@ -47,7 +47,7 @@
47 * have to waste large pieces of free space at the end of LEB B, because nodes 47 * have to waste large pieces of free space at the end of LEB B, because nodes
48 * from LEB A would not fit. And the worst situation is when all nodes are of 48 * from LEB A would not fit. And the worst situation is when all nodes are of
49 * maximum size. So dark watermark is the amount of free + dirty space in LEB 49 * maximum size. So dark watermark is the amount of free + dirty space in LEB
50 * which are guaranteed to be reclaimable. If LEB has less space, the GC migh 50 * which are guaranteed to be reclaimable. If LEB has less space, the GC might
51 * be unable to reclaim it. So, LEBs with free + dirty greater than dark 51 * be unable to reclaim it. So, LEBs with free + dirty greater than dark
 52 * watermark are "good" LEBs from GC's point of view. The other LEBs are not so 52 * watermark are "good" LEBs from GC's point of view. The other LEBs are not so
53 * good, and GC takes extra care when moving them. 53 * good, and GC takes extra care when moving them.
@@ -57,14 +57,6 @@
57#include "ubifs.h" 57#include "ubifs.h"
58 58
59/* 59/*
60 * GC tries to optimize the way it fit nodes to available space, and it sorts
61 * nodes a little. The below constants are watermarks which define "large",
62 * "medium", and "small" nodes.
63 */
64#define MEDIUM_NODE_WM (UBIFS_BLOCK_SIZE / 4)
65#define SMALL_NODE_WM UBIFS_MAX_DENT_NODE_SZ
66
67/*
68 * GC may need to move more than one LEB to make progress. The below constants 60 * GC may need to move more than one LEB to make progress. The below constants
69 * define "soft" and "hard" limits on the number of LEBs the garbage collector 61 * define "soft" and "hard" limits on the number of LEBs the garbage collector
70 * may move. 62 * may move.
@@ -116,83 +108,222 @@ static int switch_gc_head(struct ubifs_info *c)
116} 108}
117 109
118/** 110/**
119 * joinup - bring data nodes for an inode together. 111 * list_sort - sort a list.
120 * @c: UBIFS file-system description object 112 * @priv: private data, passed to @cmp
121 * @sleb: describes scanned LEB 113 * @head: the list to sort
122 * @inum: inode number 114 * @cmp: the elements comparison function
123 * @blk: block number
124 * @data: list to which to add data nodes
125 * 115 *
126 * This function looks at the first few nodes in the scanned LEB @sleb and adds 116 * This function has been implemented by Mark J Roberts <mjr@znex.org>. It
 127 * them to @data if they are data nodes from @inum and have a larger block 117 * implements "merge sort" which has O(n log n) complexity. The list is sorted
128 * number than @blk. This function returns %0 on success and a negative error 118 * in ascending order.
129 * code on failure. 119 *
 120 * The comparison function @cmp is supposed to return a negative value if @a is
 121 * less than @b, and a positive value if @a is greater than @b. If @a and @b are
122 * equivalent, then it does not matter what this function returns.
130 */ 123 */
131static int joinup(struct ubifs_info *c, struct ubifs_scan_leb *sleb, ino_t inum, 124static void list_sort(void *priv, struct list_head *head,
132 unsigned int blk, struct list_head *data) 125 int (*cmp)(void *priv, struct list_head *a,
126 struct list_head *b))
133{ 127{
134 int err, cnt = 6, lnum = sleb->lnum, offs; 128 struct list_head *p, *q, *e, *list, *tail, *oldhead;
135 struct ubifs_scan_node *snod, *tmp; 129 int insize, nmerges, psize, qsize, i;
136 union ubifs_key *key; 130
131 if (list_empty(head))
132 return;
133
134 list = head->next;
135 list_del(head);
136 insize = 1;
137 for (;;) {
138 p = oldhead = list;
139 list = tail = NULL;
140 nmerges = 0;
141
142 while (p) {
143 nmerges++;
144 q = p;
145 psize = 0;
146 for (i = 0; i < insize; i++) {
147 psize++;
148 q = q->next == oldhead ? NULL : q->next;
149 if (!q)
150 break;
151 }
137 152
138 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) { 153 qsize = insize;
139 key = &snod->key; 154 while (psize > 0 || (qsize > 0 && q)) {
140 if (key_inum(c, key) == inum && 155 if (!psize) {
141 key_type(c, key) == UBIFS_DATA_KEY && 156 e = q;
142 key_block(c, key) > blk) { 157 q = q->next;
143 offs = snod->offs; 158 qsize--;
144 err = ubifs_tnc_has_node(c, key, 0, lnum, offs, 0); 159 if (q == oldhead)
145 if (err < 0) 160 q = NULL;
146 return err; 161 } else if (!qsize || !q) {
147 list_del(&snod->list); 162 e = p;
148 if (err) { 163 p = p->next;
149 list_add_tail(&snod->list, data); 164 psize--;
150 blk = key_block(c, key); 165 if (p == oldhead)
151 } else 166 p = NULL;
152 kfree(snod); 167 } else if (cmp(priv, p, q) <= 0) {
153 cnt = 6; 168 e = p;
154 } else if (--cnt == 0) 169 p = p->next;
170 psize--;
171 if (p == oldhead)
172 p = NULL;
173 } else {
174 e = q;
175 q = q->next;
176 qsize--;
177 if (q == oldhead)
178 q = NULL;
179 }
180 if (tail)
181 tail->next = e;
182 else
183 list = e;
184 e->prev = tail;
185 tail = e;
186 }
187 p = q;
188 }
189
190 tail->next = list;
191 list->prev = tail;
192
193 if (nmerges <= 1)
155 break; 194 break;
195
196 insize *= 2;
156 } 197 }
157 return 0; 198
199 head->next = list;
200 head->prev = list->prev;
201 list->prev->next = head;
202 list->prev = head;
158} 203}
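
A hedged usage sketch for list_sort() with an assumed element type: the
comparison callback only sees bare list_head pointers and recovers its
containing structure with list_entry(), exactly as data_nodes_cmp() and
nondata_nodes_cmp() below do:

struct item {
	int key;
	struct list_head list;
};

static int item_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct item *ia = list_entry(a, struct item, list);
	struct item *ib = list_entry(b, struct item, list);

	/* ascending by key; equal keys may return either sign */
	return ia->key < ib->key ? -1 : 1;
}

/* then list_sort(NULL, &items, item_cmp); leaves @items in ascending order */
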
159 204
160/** 205/**
161 * move_nodes - move nodes. 206 * data_nodes_cmp - compare 2 data nodes.
207 * @priv: UBIFS file-system description object
208 * @a: first data node
 209 * @b: second data node
210 *
211 * This function compares data nodes @a and @b. Returns %1 if @a has greater
212 * inode or block number, and %-1 otherwise.
213 */
214int data_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
215{
216 ino_t inuma, inumb;
217 struct ubifs_info *c = priv;
218 struct ubifs_scan_node *sa, *sb;
219
220 cond_resched();
221 sa = list_entry(a, struct ubifs_scan_node, list);
222 sb = list_entry(b, struct ubifs_scan_node, list);
223 ubifs_assert(key_type(c, &sa->key) == UBIFS_DATA_KEY);
224 ubifs_assert(key_type(c, &sb->key) == UBIFS_DATA_KEY);
225
226 inuma = key_inum(c, &sa->key);
227 inumb = key_inum(c, &sb->key);
228
229 if (inuma == inumb) {
230 unsigned int blka = key_block(c, &sa->key);
231 unsigned int blkb = key_block(c, &sb->key);
232
233 if (blka <= blkb)
234 return -1;
235 } else if (inuma <= inumb)
236 return -1;
237
238 return 1;
239}
240
 241/**
242 * nondata_nodes_cmp - compare 2 non-data nodes.
243 * @priv: UBIFS file-system description object
244 * @a: first node
 245 * @b: second node
246 *
247 * This function compares nodes @a and @b. It makes sure that inode nodes go
 248 * first and are sorted by length in descending order. Directory entry nodes go
 249 * after inode nodes and are sorted in ascending hash value order.
250 */
251int nondata_nodes_cmp(void *priv, struct list_head *a, struct list_head *b)
252{
253 int typea, typeb;
254 ino_t inuma, inumb;
255 struct ubifs_info *c = priv;
256 struct ubifs_scan_node *sa, *sb;
257
258 cond_resched();
259 sa = list_entry(a, struct ubifs_scan_node, list);
260 sb = list_entry(b, struct ubifs_scan_node, list);
261 typea = key_type(c, &sa->key);
262 typeb = key_type(c, &sb->key);
263 ubifs_assert(typea != UBIFS_DATA_KEY && typeb != UBIFS_DATA_KEY);
264
265 /* Inodes go before directory entries */
266 if (typea == UBIFS_INO_KEY) {
267 if (typeb == UBIFS_INO_KEY)
268 return sb->len - sa->len;
269 return -1;
270 }
271 if (typeb == UBIFS_INO_KEY)
272 return 1;
273
274 ubifs_assert(typea == UBIFS_DENT_KEY && typeb == UBIFS_DENT_KEY);
275 inuma = key_inum(c, &sa->key);
276 inumb = key_inum(c, &sb->key);
277
278 if (inuma == inumb) {
279 uint32_t hasha = key_hash(c, &sa->key);
280 uint32_t hashb = key_hash(c, &sb->key);
281
282 if (hasha <= hashb)
283 return -1;
284 } else if (inuma <= inumb)
285 return -1;
286
287 return 1;
288}
289
290/**
291 * sort_nodes - sort nodes for GC.
162 * @c: UBIFS file-system description object 292 * @c: UBIFS file-system description object
163 * @sleb: describes nodes to move 293 * @sleb: describes nodes to sort and contains the result on exit
294 * @nondata: contains non-data nodes on exit
295 * @min: minimum node size is returned here
164 * 296 *
165 * This function moves valid nodes from data LEB described by @sleb to the GC 297 * This function sorts the list of inodes to garbage collect. First of all, it
 166 * journal head. The obsolete nodes are dropped. 298 * kills obsolete nodes and separates data and non-data nodes into the
 299 * @sleb->nodes and @nondata lists, respectively.
300 *
301 * Data nodes are then sorted in block number order - this is important for
302 * bulk-read; data nodes with lower inode number go before data nodes with
303 * higher inode number, and data nodes with lower block number go before data
 304 * nodes with higher block number.
167 * 305 *
168 * When moving nodes we have to deal with classical bin-packing problem: the 306 * Non-data nodes are sorted as follows.
169 * space in the current GC journal head LEB and in @c->gc_lnum are the "bins", 307 * o First go inode nodes - they are sorted in descending length order.
170 * where the nodes in the @sleb->nodes list are the elements which should be 308 * o Then go directory entry nodes - they are sorted in hash order, which
171 * fit optimally to the bins. This function uses the "first fit decreasing" 309 * should supposedly optimize 'readdir()'. Direntry nodes with lower parent
172 * strategy, although it does not really sort the nodes but just split them on 310 * inode number go before direntry nodes with higher parent inode number,
173 * 3 classes - large, medium, and small, so they are roughly sorted. 311 * and direntry nodes with lower name hash values go before direntry nodes
312 * with higher name hash values.
174 * 313 *
175 * This function returns zero in case of success, %-EAGAIN if commit is 314 * This function returns zero in case of success and a negative error code in
176 * required, and other negative error codes in case of other failures. 315 * case of failure.
177 */ 316 */
178static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb) 317static int sort_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
318 struct list_head *nondata, int *min)
179{ 319{
180 struct ubifs_scan_node *snod, *tmp; 320 struct ubifs_scan_node *snod, *tmp;
181 struct list_head data, large, medium, small;
182 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
183 int avail, err, min = INT_MAX;
184 unsigned int blk = 0;
185 ino_t inum = 0;
186 321
187 INIT_LIST_HEAD(&data); 322 *min = INT_MAX;
188 INIT_LIST_HEAD(&large);
189 INIT_LIST_HEAD(&medium);
190 INIT_LIST_HEAD(&small);
191 323
192 while (!list_empty(&sleb->nodes)) { 324 /* Separate data nodes and non-data nodes */
193 struct list_head *lst = sleb->nodes.next; 325 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
194 326 int err;
195 snod = list_entry(lst, struct ubifs_scan_node, list);
196 327
197 ubifs_assert(snod->type != UBIFS_IDX_NODE); 328 ubifs_assert(snod->type != UBIFS_IDX_NODE);
198 ubifs_assert(snod->type != UBIFS_REF_NODE); 329 ubifs_assert(snod->type != UBIFS_REF_NODE);
@@ -201,53 +332,72 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
201 err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum, 332 err = ubifs_tnc_has_node(c, &snod->key, 0, sleb->lnum,
202 snod->offs, 0); 333 snod->offs, 0);
203 if (err < 0) 334 if (err < 0)
204 goto out; 335 return err;
205 336
206 list_del(lst);
207 if (!err) { 337 if (!err) {
208 /* The node is obsolete, remove it from the list */ 338 /* The node is obsolete, remove it from the list */
339 list_del(&snod->list);
209 kfree(snod); 340 kfree(snod);
210 continue; 341 continue;
211 } 342 }
212 343
213 /* 344 if (snod->len < *min)
214 * Sort the list of nodes so that data nodes go first, large 345 *min = snod->len;
215 * nodes go second, and small nodes go last. 346
216 */ 347 if (key_type(c, &snod->key) != UBIFS_DATA_KEY)
217 if (key_type(c, &snod->key) == UBIFS_DATA_KEY) { 348 list_move_tail(&snod->list, nondata);
218 if (inum != key_inum(c, &snod->key)) {
219 if (inum) {
220 /*
221 * Try to move data nodes from the same
222 * inode together.
223 */
224 err = joinup(c, sleb, inum, blk, &data);
225 if (err)
226 goto out;
227 }
228 inum = key_inum(c, &snod->key);
229 blk = key_block(c, &snod->key);
230 }
231 list_add_tail(lst, &data);
232 } else if (snod->len > MEDIUM_NODE_WM)
233 list_add_tail(lst, &large);
234 else if (snod->len > SMALL_NODE_WM)
235 list_add_tail(lst, &medium);
236 else
237 list_add_tail(lst, &small);
238
239 /* And find the smallest node */
240 if (snod->len < min)
241 min = snod->len;
242 } 349 }
243 350
244 /* 351 /* Sort data and non-data nodes */
245 * Join the tree lists so that we'd have one roughly sorted list 352 list_sort(c, &sleb->nodes, &data_nodes_cmp);
246 * ('large' will be the head of the joined list). 353 list_sort(c, nondata, &nondata_nodes_cmp);
247 */ 354 return 0;
248 list_splice(&data, &large); 355}
249 list_splice(&medium, large.prev); 356
250 list_splice(&small, large.prev); 357/**
358 * move_node - move a node.
359 * @c: UBIFS file-system description object
360 * @sleb: describes the LEB to move nodes from
 361 * @snod: the node to move
362 * @wbuf: write-buffer to move node to
363 *
364 * This function moves node @snod to @wbuf, changes TNC correspondingly, and
365 * destroys @snod. Returns zero in case of success and a negative error code in
366 * case of failure.
367 */
368static int move_node(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
369 struct ubifs_scan_node *snod, struct ubifs_wbuf *wbuf)
370{
371 int err, new_lnum = wbuf->lnum, new_offs = wbuf->offs + wbuf->used;
372
373 cond_resched();
374 err = ubifs_wbuf_write_nolock(wbuf, snod->node, snod->len);
375 if (err)
376 return err;
377
378 err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
379 snod->offs, new_lnum, new_offs,
380 snod->len);
381 list_del(&snod->list);
382 kfree(snod);
383 return err;
384}
385
386/**
387 * move_nodes - move nodes.
388 * @c: UBIFS file-system description object
389 * @sleb: describes the LEB to move nodes from
390 *
391 * This function moves valid nodes from data LEB described by @sleb to the GC
392 * journal head. This function returns zero in case of success, %-EAGAIN if
393 * commit is required, and other negative error codes in case of other
394 * failures.
395 */
396static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
397{
398 int err, min;
399 LIST_HEAD(nondata);
400 struct ubifs_wbuf *wbuf = &c->jheads[GCHD].wbuf;
251 401
252 if (wbuf->lnum == -1) { 402 if (wbuf->lnum == -1) {
253 /* 403 /*
@@ -256,42 +406,59 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
256 */ 406 */
257 err = switch_gc_head(c); 407 err = switch_gc_head(c);
258 if (err) 408 if (err)
259 goto out; 409 return err;
260 } 410 }
261 411
412 err = sort_nodes(c, sleb, &nondata, &min);
413 if (err)
414 goto out;
415
262 /* Write nodes to their new location. Use the first-fit strategy */ 416 /* Write nodes to their new location. Use the first-fit strategy */
263 while (1) { 417 while (1) {
264 avail = c->leb_size - wbuf->offs - wbuf->used; 418 int avail;
265 list_for_each_entry_safe(snod, tmp, &large, list) { 419 struct ubifs_scan_node *snod, *tmp;
266 int new_lnum, new_offs; 420
421 /* Move data nodes */
422 list_for_each_entry_safe(snod, tmp, &sleb->nodes, list) {
423 avail = c->leb_size - wbuf->offs - wbuf->used;
424 if (snod->len > avail)
425 /*
426 * Do not skip data nodes in order to optimize
427 * bulk-read.
428 */
429 break;
430
431 err = move_node(c, sleb, snod, wbuf);
432 if (err)
433 goto out;
434 }
267 435
436 /* Move non-data nodes */
437 list_for_each_entry_safe(snod, tmp, &nondata, list) {
438 avail = c->leb_size - wbuf->offs - wbuf->used;
268 if (avail < min) 439 if (avail < min)
269 break; 440 break;
270 441
271 if (snod->len > avail) 442 if (snod->len > avail) {
272 /* This node does not fit */ 443 /*
444 * Keep going only if this is an inode with
445 * some data. Otherwise stop and switch the GC
446 * head. IOW, we assume that data-less inode
447 * nodes and direntry nodes are roughly of the
448 * same size.
449 */
450 if (key_type(c, &snod->key) == UBIFS_DENT_KEY ||
451 snod->len == UBIFS_INO_NODE_SZ)
452 break;
273 continue; 453 continue;
454 }
274 455
275 cond_resched(); 456 err = move_node(c, sleb, snod, wbuf);
276
277 new_lnum = wbuf->lnum;
278 new_offs = wbuf->offs + wbuf->used;
279 err = ubifs_wbuf_write_nolock(wbuf, snod->node,
280 snod->len);
281 if (err) 457 if (err)
282 goto out; 458 goto out;
283 err = ubifs_tnc_replace(c, &snod->key, sleb->lnum,
284 snod->offs, new_lnum, new_offs,
285 snod->len);
286 if (err)
287 goto out;
288
289 avail = c->leb_size - wbuf->offs - wbuf->used;
290 list_del(&snod->list);
291 kfree(snod);
292 } 459 }
293 460
294 if (list_empty(&large)) 461 if (list_empty(&sleb->nodes) && list_empty(&nondata))
295 break; 462 break;
296 463
297 /* 464 /*
@@ -306,10 +473,7 @@ static int move_nodes(struct ubifs_info *c, struct ubifs_scan_leb *sleb)
306 return 0; 473 return 0;
307 474
308out: 475out:
309 list_for_each_entry_safe(snod, tmp, &large, list) { 476 list_splice_tail(&nondata, &sleb->nodes);
310 list_del(&snod->list);
311 kfree(snod);
312 }
313 return err; 477 return err;
314} 478}
315 479
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index a11ca0958a23..64b5f3a309f5 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -114,7 +114,7 @@ static inline void zero_trun_node_unused(struct ubifs_trun_node *trun)
114 */ 114 */
115static int reserve_space(struct ubifs_info *c, int jhead, int len) 115static int reserve_space(struct ubifs_info *c, int jhead, int len)
116{ 116{
117 int err = 0, err1, retries = 0, avail, lnum, offs, free, squeeze; 117 int err = 0, err1, retries = 0, avail, lnum, offs, squeeze;
118 struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf; 118 struct ubifs_wbuf *wbuf = &c->jheads[jhead].wbuf;
119 119
120 /* 120 /*
@@ -139,10 +139,9 @@ again:
 139 * Write buffer wasn't seek'ed or there is not enough space - look for an 139 * Write buffer wasn't seek'ed or there is not enough space - look for an
140 * LEB with some empty space. 140 * LEB with some empty space.
141 */ 141 */
142 lnum = ubifs_find_free_space(c, len, &free, squeeze); 142 lnum = ubifs_find_free_space(c, len, &offs, squeeze);
143 if (lnum >= 0) { 143 if (lnum >= 0) {
144 /* Found an LEB, add it to the journal head */ 144 /* Found an LEB, add it to the journal head */
145 offs = c->leb_size - free;
146 err = ubifs_add_bud_to_log(c, jhead, lnum, offs); 145 err = ubifs_add_bud_to_log(c, jhead, lnum, offs);
147 if (err) 146 if (err)
148 goto out_return; 147 goto out_return;
@@ -1366,7 +1365,7 @@ out_ro:
1366 * @host: host inode 1365 * @host: host inode
1367 * 1366 *
1368 * This function writes the updated version of an extended attribute inode and 1367 * This function writes the updated version of an extended attribute inode and
1369 * the host inode tho the journal (to the base head). The host inode is written 1368 * the host inode to the journal (to the base head). The host inode is written
1370 * after the extended attribute inode in order to guarantee that the extended 1369 * after the extended attribute inode in order to guarantee that the extended
1371 * attribute will be flushed when the inode is synchronized by 'fsync()' and 1370 * attribute will be flushed when the inode is synchronized by 'fsync()' and
1372 * consequently, the write-buffer is synchronized. This function returns zero 1371 * consequently, the write-buffer is synchronized. This function returns zero
diff --git a/fs/ubifs/key.h b/fs/ubifs/key.h
index efb3430a2581..5fa27ea031ba 100644
--- a/fs/ubifs/key.h
+++ b/fs/ubifs/key.h
@@ -381,8 +381,8 @@ static inline ino_t key_inum_flash(const struct ubifs_info *c, const void *k)
381 * @c: UBIFS file-system description object 381 * @c: UBIFS file-system description object
382 * @key: the key to get hash from 382 * @key: the key to get hash from
383 */ 383 */
384static inline int key_hash(const struct ubifs_info *c, 384static inline uint32_t key_hash(const struct ubifs_info *c,
385 const union ubifs_key *key) 385 const union ubifs_key *key)
386{ 386{
387 return key->u32[1] & UBIFS_S_KEY_HASH_MASK; 387 return key->u32[1] & UBIFS_S_KEY_HASH_MASK;
388} 388}
@@ -392,7 +392,7 @@ static inline int key_hash(const struct ubifs_info *c,
392 * @c: UBIFS file-system description object 392 * @c: UBIFS file-system description object
393 * @k: the key to get hash from 393 * @k: the key to get hash from
394 */ 394 */
395static inline int key_hash_flash(const struct ubifs_info *c, const void *k) 395static inline uint32_t key_hash_flash(const struct ubifs_info *c, const void *k)
396{ 396{
397 const union ubifs_key *key = k; 397 const union ubifs_key *key = k;
398 398
diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c
index 3e0aa7367556..56e33772a1ee 100644
--- a/fs/ubifs/log.c
+++ b/fs/ubifs/log.c
@@ -239,7 +239,7 @@ int ubifs_add_bud_to_log(struct ubifs_info *c, int jhead, int lnum, int offs)
239 } 239 }
240 240
241 /* 241 /*
242 * Make sure the the amount of space in buds will not exceed 242 * Make sure the amount of space in buds will not exceed the
243 * 'c->max_bud_bytes' limit, because we want to guarantee mount time 243 * 'c->max_bud_bytes' limit, because we want to guarantee mount time
244 * limits. 244 * limits.
245 * 245 *
@@ -367,7 +367,6 @@ static void remove_buds(struct ubifs_info *c)
367 bud->jhead, c->leb_size - bud->start, 367 bud->jhead, c->leb_size - bud->start,
368 c->cmt_bud_bytes); 368 c->cmt_bud_bytes);
369 rb_erase(p1, &c->buds); 369 rb_erase(p1, &c->buds);
370 list_del(&bud->list);
371 /* 370 /*
372 * If the commit does not finish, the recovery will need 371 * If the commit does not finish, the recovery will need
373 * to replay the journal, in which case the old buds 372 * to replay the journal, in which case the old buds
@@ -375,7 +374,7 @@ static void remove_buds(struct ubifs_info *c)
375 * commit i.e. do not allow them to be garbage 374 * commit i.e. do not allow them to be garbage
376 * collected. 375 * collected.
377 */ 376 */
378 list_add(&bud->list, &c->old_buds); 377 list_move(&bud->list, &c->old_buds);
379 } 378 }
380 } 379 }
381 spin_unlock(&c->buds_lock); 380 spin_unlock(&c->buds_lock);
diff --git a/fs/ubifs/lpt_commit.c b/fs/ubifs/lpt_commit.c
index 3216a1f277f8..8cbfb8248025 100644
--- a/fs/ubifs/lpt_commit.c
+++ b/fs/ubifs/lpt_commit.c
@@ -229,7 +229,7 @@ static int layout_cnodes(struct ubifs_info *c)
229 while (offs + len > c->leb_size) { 229 while (offs + len > c->leb_size) {
230 alen = ALIGN(offs, c->min_io_size); 230 alen = ALIGN(offs, c->min_io_size);
231 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 231 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
232 dbg_chk_lpt_sz(c, 2, alen - offs); 232 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
233 err = alloc_lpt_leb(c, &lnum); 233 err = alloc_lpt_leb(c, &lnum);
234 if (err) 234 if (err)
235 goto no_space; 235 goto no_space;
@@ -272,7 +272,7 @@ static int layout_cnodes(struct ubifs_info *c)
272 if (offs + c->lsave_sz > c->leb_size) { 272 if (offs + c->lsave_sz > c->leb_size) {
273 alen = ALIGN(offs, c->min_io_size); 273 alen = ALIGN(offs, c->min_io_size);
274 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 274 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
275 dbg_chk_lpt_sz(c, 2, alen - offs); 275 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
276 err = alloc_lpt_leb(c, &lnum); 276 err = alloc_lpt_leb(c, &lnum);
277 if (err) 277 if (err)
278 goto no_space; 278 goto no_space;
@@ -292,7 +292,7 @@ static int layout_cnodes(struct ubifs_info *c)
292 if (offs + c->ltab_sz > c->leb_size) { 292 if (offs + c->ltab_sz > c->leb_size) {
293 alen = ALIGN(offs, c->min_io_size); 293 alen = ALIGN(offs, c->min_io_size);
294 upd_ltab(c, lnum, c->leb_size - alen, alen - offs); 294 upd_ltab(c, lnum, c->leb_size - alen, alen - offs);
295 dbg_chk_lpt_sz(c, 2, alen - offs); 295 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
296 err = alloc_lpt_leb(c, &lnum); 296 err = alloc_lpt_leb(c, &lnum);
297 if (err) 297 if (err)
298 goto no_space; 298 goto no_space;
@@ -416,14 +416,12 @@ static int write_cnodes(struct ubifs_info *c)
416 alen, UBI_SHORTTERM); 416 alen, UBI_SHORTTERM);
417 if (err) 417 if (err)
418 return err; 418 return err;
419 dbg_chk_lpt_sz(c, 4, alen - wlen);
420 } 419 }
421 dbg_chk_lpt_sz(c, 2, 0); 420 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
422 err = realloc_lpt_leb(c, &lnum); 421 err = realloc_lpt_leb(c, &lnum);
423 if (err) 422 if (err)
424 goto no_space; 423 goto no_space;
425 offs = 0; 424 offs = from = 0;
426 from = 0;
427 ubifs_assert(lnum >= c->lpt_first && 425 ubifs_assert(lnum >= c->lpt_first &&
428 lnum <= c->lpt_last); 426 lnum <= c->lpt_last);
429 err = ubifs_leb_unmap(c, lnum); 427 err = ubifs_leb_unmap(c, lnum);
@@ -477,11 +475,11 @@ static int write_cnodes(struct ubifs_info *c)
477 UBI_SHORTTERM); 475 UBI_SHORTTERM);
478 if (err) 476 if (err)
479 return err; 477 return err;
480 dbg_chk_lpt_sz(c, 2, alen - wlen); 478 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
481 err = realloc_lpt_leb(c, &lnum); 479 err = realloc_lpt_leb(c, &lnum);
482 if (err) 480 if (err)
483 goto no_space; 481 goto no_space;
484 offs = 0; 482 offs = from = 0;
485 ubifs_assert(lnum >= c->lpt_first && 483 ubifs_assert(lnum >= c->lpt_first &&
486 lnum <= c->lpt_last); 484 lnum <= c->lpt_last);
487 err = ubifs_leb_unmap(c, lnum); 485 err = ubifs_leb_unmap(c, lnum);
@@ -504,11 +502,11 @@ static int write_cnodes(struct ubifs_info *c)
504 UBI_SHORTTERM); 502 UBI_SHORTTERM);
505 if (err) 503 if (err)
506 return err; 504 return err;
507 dbg_chk_lpt_sz(c, 2, alen - wlen); 505 dbg_chk_lpt_sz(c, 2, c->leb_size - offs);
508 err = realloc_lpt_leb(c, &lnum); 506 err = realloc_lpt_leb(c, &lnum);
509 if (err) 507 if (err)
510 goto no_space; 508 goto no_space;
511 offs = 0; 509 offs = from = 0;
512 ubifs_assert(lnum >= c->lpt_first && 510 ubifs_assert(lnum >= c->lpt_first &&
513 lnum <= c->lpt_last); 511 lnum <= c->lpt_last);
514 err = ubifs_leb_unmap(c, lnum); 512 err = ubifs_leb_unmap(c, lnum);
@@ -1756,10 +1754,16 @@ int dbg_chk_lpt_free_spc(struct ubifs_info *c)
1756/** 1754/**
1757 * dbg_chk_lpt_sz - check LPT does not write more than LPT size. 1755 * dbg_chk_lpt_sz - check LPT does not write more than LPT size.
1758 * @c: the UBIFS file-system description object 1756 * @c: the UBIFS file-system description object
1759 * @action: action 1757 * @action: what to do
1760 * @len: length written 1758 * @len: length written
1761 * 1759 *
1762 * This function returns %0 on success and a negative error code on failure. 1760 * This function returns %0 on success and a negative error code on failure.
1761 * The @action argument may be one of:
1762 * o %0 - LPT debugging checking starts, initialize debugging variables;
1763 * o %1 - wrote an LPT node, increase LPT size by @len bytes;
1764 * o %2 - switched to a different LEB and wasted @len bytes;
 1765 * o %3 - check that we've written the right number of bytes;
 1766 * o %4 - wasted @len bytes.
1763 */ 1767 */
1764int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len) 1768int dbg_chk_lpt_sz(struct ubifs_info *c, int action, int len)
1765{ 1769{
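
Given the action codes documented above, a commit is expected to bracket its
LPT writes roughly as follows. The placement is assumed from the descriptions,
not traced through every call site, and node_len/offs/wasted are placeholders:

	dbg_chk_lpt_sz(c, 0, 0);			/* commit starts: reset counters */
	dbg_chk_lpt_sz(c, 1, node_len);			/* after each LPT node written */
	dbg_chk_lpt_sz(c, 2, c->leb_size - offs);	/* LEB switch: wasted tail */
	dbg_chk_lpt_sz(c, 4, wasted);			/* other wasted bytes */
	dbg_chk_lpt_sz(c, 3, 0);			/* commit done: verify total */
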
@@ -1917,12 +1921,12 @@ static void dump_lpt_leb(const struct ubifs_info *c, int lnum)
1917 lnum, offs); 1921 lnum, offs);
1918 err = ubifs_unpack_nnode(c, buf, &nnode); 1922 err = ubifs_unpack_nnode(c, buf, &nnode);
1919 for (i = 0; i < UBIFS_LPT_FANOUT; i++) { 1923 for (i = 0; i < UBIFS_LPT_FANOUT; i++) {
1920 printk("%d:%d", nnode.nbranch[i].lnum, 1924 printk(KERN_CONT "%d:%d", nnode.nbranch[i].lnum,
1921 nnode.nbranch[i].offs); 1925 nnode.nbranch[i].offs);
1922 if (i != UBIFS_LPT_FANOUT - 1) 1926 if (i != UBIFS_LPT_FANOUT - 1)
1923 printk(", "); 1927 printk(KERN_CONT ", ");
1924 } 1928 }
1925 printk("\n"); 1929 printk(KERN_CONT "\n");
1926 break; 1930 break;
1927 } 1931 }
1928 case UBIFS_LPT_LTAB: 1932 case UBIFS_LPT_LTAB:
diff --git a/fs/ubifs/recovery.c b/fs/ubifs/recovery.c
index 90acac603e63..10662975d2ef 100644
--- a/fs/ubifs/recovery.c
+++ b/fs/ubifs/recovery.c
@@ -425,59 +425,35 @@ static void clean_buf(const struct ubifs_info *c, void **buf, int lnum,
425 * @lnum: LEB number of the LEB from which @buf was read 425 * @lnum: LEB number of the LEB from which @buf was read
426 * @offs: offset from which @buf was read 426 * @offs: offset from which @buf was read
427 * 427 *
428 * This function scans @buf for more nodes and returns %0 is a node is found and 428 * This function ensures that the corrupted node at @offs is the last thing
 429 * %1 if no more nodes are found. 429 * written to a LEB. This function returns %1 if no more data is found and
430 * %0 if more data is found.
430 */ 431 */
431static int no_more_nodes(const struct ubifs_info *c, void *buf, int len, 432static int no_more_nodes(const struct ubifs_info *c, void *buf, int len,
432 int lnum, int offs) 433 int lnum, int offs)
433{ 434{
434 int skip, next_offs = 0; 435 struct ubifs_ch *ch = buf;
436 int skip, dlen = le32_to_cpu(ch->len);
435 437
436 if (len > UBIFS_DATA_NODE_SZ) { 438 /* Check for empty space after the corrupt node's common header */
437 struct ubifs_ch *ch = buf; 439 skip = ALIGN(offs + UBIFS_CH_SZ, c->min_io_size) - offs;
438 int dlen = le32_to_cpu(ch->len); 440 if (is_empty(buf + skip, len - skip))
439 441 return 1;
440 if (ch->node_type == UBIFS_DATA_NODE && dlen >= UBIFS_CH_SZ && 442 /*
441 dlen <= UBIFS_MAX_DATA_NODE_SZ) 443 * The area after the common header size is not empty, so the common
442 /* The corrupt node looks like a data node */ 444 * header must be intact. Check it.
443 next_offs = ALIGN(offs + dlen, 8); 445 */
444 } 446 if (ubifs_check_node(c, buf, lnum, offs, 1, 0) != -EUCLEAN) {
445 447 dbg_rcvry("unexpected bad common header at %d:%d", lnum, offs);
446 if (c->min_io_size == 1) 448 return 0;
447 skip = 8;
448 else
449 skip = ALIGN(offs + 1, c->min_io_size) - offs;
450
451 offs += skip;
452 buf += skip;
453 len -= skip;
454 while (len > 8) {
455 struct ubifs_ch *ch = buf;
456 uint32_t magic = le32_to_cpu(ch->magic);
457 int ret;
458
459 if (magic == UBIFS_NODE_MAGIC) {
460 ret = ubifs_scan_a_node(c, buf, len, lnum, offs, 1);
461 if (ret == SCANNED_A_NODE || ret > 0) {
462 /*
463 * There is a small chance this is just data in
464 * a data node, so check that possibility. e.g.
465 * this is part of a file that itself contains
466 * a UBIFS image.
467 */
468 if (next_offs && offs + le32_to_cpu(ch->len) <=
469 next_offs)
470 continue;
471 dbg_rcvry("unexpected node at %d:%d", lnum,
472 offs);
473 return 0;
474 }
475 }
476 offs += 8;
477 buf += 8;
478 len -= 8;
479 } 449 }
480 return 1; 450 /* Now we know the corrupt node's length we can skip over it */
451 skip = ALIGN(offs + dlen, c->min_io_size) - offs;
452 /* After which there should be empty space */
453 if (is_empty(buf + skip, len - skip))
454 return 1;
455 dbg_rcvry("unexpected data at %d:%d", lnum, offs + skip);
456 return 0;
481} 457}
482 458
483/** 459/**
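The rewritten no_more_nodes() leans on is_empty(), a static helper defined earlier in recovery.c. It encodes the fact that erased flash reads back as all-0xFF, so a region is empty only if every byte is 0xFF; paraphrased rather than quoted:

	static int is_empty(void *buf, int len)
	{
		uint8_t *p = buf;
		int i;

		/* Any byte other than 0xFF means something was written here. */
		for (i = 0; i < len; i++)
			if (*p++ != 0xff)
				return 0;
		return 1;
	}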
diff --git a/fs/ubifs/replay.c b/fs/ubifs/replay.c
index ce42a7b0ca5a..11cc80125a49 100644
--- a/fs/ubifs/replay.c
+++ b/fs/ubifs/replay.c
@@ -143,7 +143,7 @@ static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
143 dirty -= c->leb_size - lp->free; 143 dirty -= c->leb_size - lp->free;
144 /* 144 /*
145 * If the replay order was perfect the dirty space would now be 145 * If the replay order was perfect the dirty space would now be
146 * zero. The order is not perfect because the the journal heads 146 * zero. The order is not perfect because the journal heads
147 * race with each other. This is not a problem but it does mean 147 * race with each other. This is not a problem but it does mean
148 * that the dirty space may temporarily exceed c->leb_size 148 * that the dirty space may temporarily exceed c->leb_size
149 * during the replay. 149 * during the replay.
diff --git a/fs/ubifs/sb.c b/fs/ubifs/sb.c
index e070c643d1bb..57085e43320f 100644
--- a/fs/ubifs/sb.c
+++ b/fs/ubifs/sb.c
@@ -193,6 +193,7 @@ static int create_default_filesystem(struct ubifs_info *c)
193 if (tmp64 > DEFAULT_MAX_RP_SIZE) 193 if (tmp64 > DEFAULT_MAX_RP_SIZE)
194 tmp64 = DEFAULT_MAX_RP_SIZE; 194 tmp64 = DEFAULT_MAX_RP_SIZE;
195 sup->rp_size = cpu_to_le64(tmp64); 195 sup->rp_size = cpu_to_le64(tmp64);
196 sup->ro_compat_version = cpu_to_le32(UBIFS_RO_COMPAT_VERSION);
196 197
197 err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM); 198 err = ubifs_write_node(c, sup, UBIFS_SB_NODE_SZ, 0, 0, UBI_LONGTERM);
198 kfree(sup); 199 kfree(sup);
@@ -532,17 +533,39 @@ int ubifs_read_superblock(struct ubifs_info *c)
532 if (IS_ERR(sup)) 533 if (IS_ERR(sup))
533 return PTR_ERR(sup); 534 return PTR_ERR(sup);
534 535
536 c->fmt_version = le32_to_cpu(sup->fmt_version);
537 c->ro_compat_version = le32_to_cpu(sup->ro_compat_version);
538
535 /* 539 /*
536 * The software supports all previous versions but not future versions, 540 * The software supports all previous versions but not future versions,
537 * due to the unavailability of time-travelling equipment. 541 * due to the unavailability of time-travelling equipment.
538 */ 542 */
539 c->fmt_version = le32_to_cpu(sup->fmt_version);
540 if (c->fmt_version > UBIFS_FORMAT_VERSION) { 543 if (c->fmt_version > UBIFS_FORMAT_VERSION) {
541 ubifs_err("on-flash format version is %d, but software only " 544 struct super_block *sb = c->vfs_sb;
542 "supports up to version %d", c->fmt_version, 545 int mounting_ro = sb->s_flags & MS_RDONLY;
543 UBIFS_FORMAT_VERSION); 546
544 err = -EINVAL; 547 ubifs_assert(!c->ro_media || mounting_ro);
545 goto out; 548 if (!mounting_ro ||
549 c->ro_compat_version > UBIFS_RO_COMPAT_VERSION) {
550 ubifs_err("on-flash format version is w%d/r%d, but "
551 "software only supports up to version "
552 "w%d/r%d", c->fmt_version,
553 c->ro_compat_version, UBIFS_FORMAT_VERSION,
554 UBIFS_RO_COMPAT_VERSION);
555 if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) {
556 ubifs_msg("only R/O mounting is possible");
557 err = -EROFS;
558 } else
559 err = -EINVAL;
560 goto out;
561 }
562
563 /*
564 * The FS is mounted R/O, and the media format is
565 * R/O-compatible with the UBIFS implementation, so we can
566 * mount.
567 */
568 c->rw_incompat = 1;
546 } 569 }
547 570
548 if (c->fmt_version < 3) { 571 if (c->fmt_version < 3) {
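Condensed, the new superblock version check amounts to the following decision; this is a sketch of the same logic under the same names, not a replacement for the hunk above:

	if (c->fmt_version <= UBIFS_FORMAT_VERSION) {
		/* Fully supported format: both R/W and R/O mounting work. */
	} else if ((sb->s_flags & MS_RDONLY) &&
		   c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) {
		c->rw_incompat = 1;	/* newer format, but R/O-compatible */
	} else if (c->ro_compat_version <= UBIFS_RO_COMPAT_VERSION) {
		err = -EROFS;		/* R/W was requested; only R/O is possible */
	} else {
		err = -EINVAL;		/* not even R/O-compatible */
	}

The rw_incompat flag set here is what the super.c hunk below tests in ubifs_remount_rw(), so a later R/W remount of such a file-system fails with the same -EROFS.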
@@ -623,7 +646,6 @@ int ubifs_read_superblock(struct ubifs_info *c)
623 c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS; 646 c->main_lebs = c->leb_cnt - UBIFS_SB_LEBS - UBIFS_MST_LEBS;
624 c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs; 647 c->main_lebs -= c->log_lebs + c->lpt_lebs + c->orph_lebs;
625 c->main_first = c->leb_cnt - c->main_lebs; 648 c->main_first = c->leb_cnt - c->main_lebs;
626 c->report_rp_size = ubifs_reported_space(c, c->rp_size);
627 649
628 err = validate_sb(c, sup); 650 err = validate_sb(c, sup);
629out: 651out:
diff --git a/fs/ubifs/shrinker.c b/fs/ubifs/shrinker.c
index e7bab52a1410..02feb59cefca 100644
--- a/fs/ubifs/shrinker.c
+++ b/fs/ubifs/shrinker.c
@@ -206,8 +206,7 @@ static int shrink_tnc_trees(int nr, int age, int *contention)
206 * Move this one to the end of the list to provide some 206 * Move this one to the end of the list to provide some
207 * fairness. 207 * fairness.
208 */ 208 */
209 list_del(&c->infos_list); 209 list_move_tail(&c->infos_list, &ubifs_infos);
210 list_add_tail(&c->infos_list, &ubifs_infos);
211 mutex_unlock(&c->umount_mutex); 210 mutex_unlock(&c->umount_mutex);
212 if (freed >= nr) 211 if (freed >= nr)
213 break; 212 break;
@@ -263,8 +262,7 @@ static int kick_a_thread(void)
263 } 262 }
264 263
265 if (i == 1) { 264 if (i == 1) {
266 list_del(&c->infos_list); 265 list_move_tail(&c->infos_list, &ubifs_infos);
267 list_add_tail(&c->infos_list, &ubifs_infos);
268 spin_unlock(&ubifs_infos_lock); 266 spin_unlock(&ubifs_infos_lock);
269 267
270 ubifs_request_bg_commit(c); 268 ubifs_request_bg_commit(c);
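Both shrinker.c hunks collapse an open-coded delete-and-re-add into list_move_tail() from <linux/list.h>, which in kernels of this era is defined as exactly that sequence (paraphrased):

	static inline void list_move_tail(struct list_head *list,
					  struct list_head *head)
	{
		/* Unlink from the current position, then append at the tail. */
		__list_del(list->prev, list->next);
		list_add_tail(list, head);
	}

No behavior change; the entry simply moves to the tail of ubifs_infos in one call.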
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index c5c98355459a..faa44f90608a 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -421,8 +421,8 @@ static int ubifs_show_options(struct seq_file *s, struct vfsmount *mnt)
421 seq_printf(s, ",no_chk_data_crc"); 421 seq_printf(s, ",no_chk_data_crc");
422 422
423 if (c->mount_opts.override_compr) { 423 if (c->mount_opts.override_compr) {
424 seq_printf(s, ",compr="); 424 seq_printf(s, ",compr=%s",
425 seq_printf(s, ubifs_compr_name(c->mount_opts.compr_type)); 425 ubifs_compr_name(c->mount_opts.compr_type));
426 } 426 }
427 427
428 return 0; 428 return 0;
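The old code passed the compressor name itself as the format string; had a name ever contained '%', seq_printf() would have interpreted it, and compilers warn about non-literal formats. The contrast, as a minimal sketch:

	const char *name = ubifs_compr_name(c->mount_opts.compr_type);

	seq_printf(s, name);		/* unsafe: name is treated as a format */
	seq_printf(s, "%s", name);	/* safe: name is printed verbatim */

The two kthread_create() changes later in this file are the same class of fix: c->bgt_name becomes an argument to a "%s" format instead of being the format.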
@@ -700,6 +700,8 @@ static int init_constants_sb(struct ubifs_info *c)
700 if (err) 700 if (err)
701 return err; 701 return err;
702 702
703 /* Initialize effective LEB size used in budgeting calculations */
704 c->idx_leb_size = c->leb_size - c->max_idx_node_sz;
703 return 0; 705 return 0;
704} 706}
705 707
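A worked example of the new constant, with illustrative numbers that are not taken from the patch: with a 128 KiB LEB and a maximum indexing-node size of, say, 192 bytes,

	int leb_size        = 131072;				/* hypothetical */
	int max_idx_node_sz = 192;				/* hypothetical */
	int idx_leb_size    = leb_size - max_idx_node_sz;	/* 130880 */

budgeting then counts only idx_leb_size bytes of an index LEB as usable, allowing for the tail of the LEB being too small to hold one more index node.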
@@ -716,6 +718,7 @@ static void init_constants_master(struct ubifs_info *c)
716 long long tmp64; 718 long long tmp64;
717 719
718 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c); 720 c->min_idx_lebs = ubifs_calc_min_idx_lebs(c);
721 c->report_rp_size = ubifs_reported_space(c, c->rp_size);
719 722
720 /* 723 /*
721 * Calculate total amount of FS blocks. This number is not used 724 * Calculate total amount of FS blocks. This number is not used
@@ -1201,7 +1204,7 @@ static int mount_ubifs(struct ubifs_info *c)
1201 goto out_cbuf; 1204 goto out_cbuf;
1202 1205
1203 /* Create background thread */ 1206 /* Create background thread */
1204 c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); 1207 c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);
1205 if (IS_ERR(c->bgt)) { 1208 if (IS_ERR(c->bgt)) {
1206 err = PTR_ERR(c->bgt); 1209 err = PTR_ERR(c->bgt);
1207 c->bgt = NULL; 1210 c->bgt = NULL;
@@ -1318,11 +1321,15 @@ static int mount_ubifs(struct ubifs_info *c)
1318 else { 1321 else {
1319 c->need_recovery = 0; 1322 c->need_recovery = 0;
1320 ubifs_msg("recovery completed"); 1323 ubifs_msg("recovery completed");
1321 /* GC LEB has to be empty and taken at this point */ 1324 /*
1322 ubifs_assert(c->lst.taken_empty_lebs == 1); 1325 * GC LEB has to be empty and taken at this point. But
1326 * the journal head LEBs may also be accounted as
1327 * "empty taken" if they are empty.
1328 */
1329 ubifs_assert(c->lst.taken_empty_lebs > 0);
1323 } 1330 }
1324 } else 1331 } else
1325 ubifs_assert(c->lst.taken_empty_lebs == 1); 1332 ubifs_assert(c->lst.taken_empty_lebs > 0);
1326 1333
1327 err = dbg_check_filesystem(c); 1334 err = dbg_check_filesystem(c);
1328 if (err) 1335 if (err)
@@ -1344,8 +1351,9 @@ static int mount_ubifs(struct ubifs_info *c)
1344 x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes; 1351 x = (long long)c->log_lebs * c->leb_size + c->max_bud_bytes;
1345 ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d " 1352 ubifs_msg("journal size: %lld bytes (%lld KiB, %lld MiB, %d "
1346 "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt); 1353 "LEBs)", x, x >> 10, x >> 20, c->log_lebs + c->max_bud_cnt);
1347 ubifs_msg("media format: %d (latest is %d)", 1354 ubifs_msg("media format: w%d/r%d (latest is w%d/r%d)",
1348 c->fmt_version, UBIFS_FORMAT_VERSION); 1355 c->fmt_version, c->ro_compat_version,
1356 UBIFS_FORMAT_VERSION, UBIFS_RO_COMPAT_VERSION);
1349 ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr)); 1357 ubifs_msg("default compressor: %s", ubifs_compr_name(c->default_compr));
1350 ubifs_msg("reserved for root: %llu bytes (%llu KiB)", 1358 ubifs_msg("reserved for root: %llu bytes (%llu KiB)",
1351 c->report_rp_size, c->report_rp_size >> 10); 1359 c->report_rp_size, c->report_rp_size >> 10);
@@ -1485,6 +1493,15 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1485{ 1493{
1486 int err, lnum; 1494 int err, lnum;
1487 1495
1496 if (c->rw_incompat) {
1497 ubifs_err("the file-system is not R/W-compatible");
1498 ubifs_msg("on-flash format version is w%d/r%d, but software "
1499 "only supports up to version w%d/r%d", c->fmt_version,
1500 c->ro_compat_version, UBIFS_FORMAT_VERSION,
1501 UBIFS_RO_COMPAT_VERSION);
1502 return -EROFS;
1503 }
1504
1488 mutex_lock(&c->umount_mutex); 1505 mutex_lock(&c->umount_mutex);
1489 dbg_save_space_info(c); 1506 dbg_save_space_info(c);
1490 c->remounting_rw = 1; 1507 c->remounting_rw = 1;
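Seen from user space, the rw_incompat check surfaces as a plain EROFS when a R/W remount is attempted. A hypothetical demonstration (device and mount point invented):

	#include <errno.h>
	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		/* Try to flip a mounted UBIFS volume to read-write. */
		if (mount("ubi0:rootfs", "/mnt", "ubifs", MS_REMOUNT, NULL) == -1
		    && errno == EROFS)
			fprintf(stderr, "R/O-compatible format; R/W remount refused\n");
		return 0;
	}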
@@ -1554,7 +1571,7 @@ static int ubifs_remount_rw(struct ubifs_info *c)
1554 ubifs_create_buds_lists(c); 1571 ubifs_create_buds_lists(c);
1555 1572
1556 /* Create background thread */ 1573 /* Create background thread */
1557 c->bgt = kthread_create(ubifs_bg_thread, c, c->bgt_name); 1574 c->bgt = kthread_create(ubifs_bg_thread, c, "%s", c->bgt_name);
1558 if (IS_ERR(c->bgt)) { 1575 if (IS_ERR(c->bgt)) {
1559 err = PTR_ERR(c->bgt); 1576 err = PTR_ERR(c->bgt);
1560 c->bgt = NULL; 1577 c->bgt = NULL;
@@ -1775,7 +1792,7 @@ static int ubifs_remount_fs(struct super_block *sb, int *flags, char *data)
1775 c->bu.buf = NULL; 1792 c->bu.buf = NULL;
1776 } 1793 }
1777 1794
1778 ubifs_assert(c->lst.taken_empty_lebs == 1); 1795 ubifs_assert(c->lst.taken_empty_lebs > 0);
1779 return 0; 1796 return 0;
1780} 1797}
1781 1798
diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c
index fa28a84c6a1b..f249f7b0d656 100644
--- a/fs/ubifs/tnc.c
+++ b/fs/ubifs/tnc.c
@@ -1252,7 +1252,7 @@ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key,
1252 * splitting in the middle of the colliding sequence. Also, when 1252 * splitting in the middle of the colliding sequence. Also, when
1253 * removing the leftmost key, we would have to correct the key of the 1253 * removing the leftmost key, we would have to correct the key of the
1254 * parent node, which would introduce additional complications. Namely, 1254 * parent node, which would introduce additional complications. Namely,
1255 * if we changed the the leftmost key of the parent znode, the garbage 1255 * if we changed the leftmost key of the parent znode, the garbage
1256 * collector would be unable to find it (GC is doing this when GC'ing 1256 * collector would be unable to find it (GC is doing this when GC'ing
1257 * indexing LEBs). Although we already have an additional RB-tree where 1257 * indexing LEBs). Although we already have an additional RB-tree where
1258 * we save such changed znodes (see 'ins_clr_old_idx_znode()') until 1258 * we save such changed znodes (see 'ins_clr_old_idx_znode()') until
diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h
index b25fc36cf72f..3eee07e0c495 100644
--- a/fs/ubifs/ubifs-media.h
+++ b/fs/ubifs/ubifs-media.h
@@ -36,9 +36,31 @@
36/* UBIFS node magic number (must not have the padding byte first or last) */ 36/* UBIFS node magic number (must not have the padding byte first or last) */
37#define UBIFS_NODE_MAGIC 0x06101831 37#define UBIFS_NODE_MAGIC 0x06101831
38 38
39/* UBIFS on-flash format version */ 39/*
40 * UBIFS on-flash format version. This version is increased when the on-flash
 41 * format changes. If this happens, UBIFS will support older versions as
42 * well. But older UBIFS code will not support newer formats. Format changes
 43 * will be rare and made only when absolutely necessary, e.g. to fix a bug or to add
44 * a new feature.
45 *
 46 * UBIFS went into the mainline kernel with format version 4. The older formats
47 * were development formats.
48 */
40#define UBIFS_FORMAT_VERSION 4 49#define UBIFS_FORMAT_VERSION 4
41 50
51/*
52 * Read-only compatibility version. If the UBIFS format is changed, older UBIFS
53 * implementations will not be able to mount newer formats in read-write mode.
54 * However, depending on the change, it may be possible to mount newer formats
55 * in R/O mode. This is indicated by the R/O compatibility version which is
56 * stored in the super-block.
57 *
58 * This is needed to support boot-loaders which only need R/O mounting. With
 59 * this flag it is possible to make UBIFS format changes without needing to update
60 * boot-loaders.
61 */
62#define UBIFS_RO_COMPAT_VERSION 0
63
42/* Minimum logical eraseblock size in bytes */ 64/* Minimum logical eraseblock size in bytes */
43#define UBIFS_MIN_LEB_SZ (15*1024) 65#define UBIFS_MIN_LEB_SZ (15*1024)
44 66
@@ -53,7 +75,7 @@
53 75
54/* 76/*
55 * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes 77 * If compressed data length is less than %UBIFS_MIN_COMPRESS_DIFF bytes
56 * shorter than uncompressed data length, UBIFS preferes to leave this data 78 * shorter than uncompressed data length, UBIFS prefers to leave this data
 57 * node uncompressed, because it'll be read faster. 79 * node uncompressed, because it'll be read faster.
58 */ 80 */
59#define UBIFS_MIN_COMPRESS_DIFF 64 81#define UBIFS_MIN_COMPRESS_DIFF 64
@@ -586,6 +608,7 @@ struct ubifs_pad_node {
586 * @padding2: reserved for future, zeroes 608 * @padding2: reserved for future, zeroes
587 * @time_gran: time granularity in nanoseconds 609 * @time_gran: time granularity in nanoseconds
588 * @uuid: UUID generated when the file system image was created 610 * @uuid: UUID generated when the file system image was created
611 * @ro_compat_version: UBIFS R/O compatibility version
589 */ 612 */
590struct ubifs_sb_node { 613struct ubifs_sb_node {
591 struct ubifs_ch ch; 614 struct ubifs_ch ch;
@@ -612,7 +635,8 @@ struct ubifs_sb_node {
612 __le64 rp_size; 635 __le64 rp_size;
613 __le32 time_gran; 636 __le32 time_gran;
614 __u8 uuid[16]; 637 __u8 uuid[16];
615 __u8 padding2[3972]; 638 __le32 ro_compat_version;
639 __u8 padding2[3968];
616} __attribute__ ((packed)); 640} __attribute__ ((packed));
617 641
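The new __le32 is carved out of padding2 (3972 - 4 = 3968), so the on-flash superblock node keeps its total size and older images stay readable. A compile-time check one might add to pin that invariant down (the 4096 total is computed by hand here; the patch itself adds no such check):

	BUILD_BUG_ON(sizeof(struct ubifs_sb_node) != 4096);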
618/** 642/**
diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h
index 039a68bee29a..0a8341e14088 100644
--- a/fs/ubifs/ubifs.h
+++ b/fs/ubifs/ubifs.h
@@ -934,6 +934,7 @@ struct ubifs_debug_info;
934 * by @commit_sem 934 * by @commit_sem
935 * @cnt_lock: protects @highest_inum and @max_sqnum counters 935 * @cnt_lock: protects @highest_inum and @max_sqnum counters
936 * @fmt_version: UBIFS on-flash format version 936 * @fmt_version: UBIFS on-flash format version
937 * @ro_compat_version: R/O compatibility version
937 * @uuid: UUID from super block 938 * @uuid: UUID from super block
938 * 939 *
939 * @lhead_lnum: log head logical eraseblock number 940 * @lhead_lnum: log head logical eraseblock number
@@ -966,6 +967,7 @@ struct ubifs_debug_info;
966 * recovery) 967 * recovery)
967 * @bulk_read: enable bulk-reads 968 * @bulk_read: enable bulk-reads
968 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc) 969 * @default_compr: default compression algorithm (%UBIFS_COMPR_LZO, etc)
970 * @rw_incompat: the media is not R/W compatible
969 * 971 *
970 * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and 972 * @tnc_mutex: protects the Tree Node Cache (TNC), @zroot, @cnext, @enext, and
971 * @calc_idx_sz 973 * @calc_idx_sz
@@ -1015,6 +1017,8 @@ struct ubifs_debug_info;
1015 * @min_io_shift: number of bits in @min_io_size minus one 1017 * @min_io_shift: number of bits in @min_io_size minus one
1016 * @leb_size: logical eraseblock size in bytes 1018 * @leb_size: logical eraseblock size in bytes
1017 * @half_leb_size: half LEB size 1019 * @half_leb_size: half LEB size
1020 * @idx_leb_size: how many bytes of an LEB are effectively available when it is
1021 * used to store indexing nodes (@leb_size - @max_idx_node_sz)
1018 * @leb_cnt: count of logical eraseblocks 1022 * @leb_cnt: count of logical eraseblocks
1019 * @max_leb_cnt: maximum count of logical eraseblocks 1023 * @max_leb_cnt: maximum count of logical eraseblocks
1020 * @old_leb_cnt: count of logical eraseblocks before re-size 1024 * @old_leb_cnt: count of logical eraseblocks before re-size
@@ -1132,8 +1136,8 @@ struct ubifs_debug_info;
1132 * previous commit start 1136 * previous commit start
1133 * @uncat_list: list of un-categorized LEBs 1137 * @uncat_list: list of un-categorized LEBs
1134 * @empty_list: list of empty LEBs 1138 * @empty_list: list of empty LEBs
1135 * @freeable_list: list of freeable non-index LEBs (free + dirty == leb_size) 1139 * @freeable_list: list of freeable non-index LEBs (free + dirty == @leb_size)
1136 * @frdi_idx_list: list of freeable index LEBs (free + dirty == leb_size) 1140 * @frdi_idx_list: list of freeable index LEBs (free + dirty == @leb_size)
1137 * @freeable_cnt: number of freeable LEBs in @freeable_list 1141 * @freeable_cnt: number of freeable LEBs in @freeable_list
1138 * 1142 *
1139 * @ltab_lnum: LEB number of LPT's own lprops table 1143 * @ltab_lnum: LEB number of LPT's own lprops table
@@ -1177,6 +1181,7 @@ struct ubifs_info {
1177 unsigned long long cmt_no; 1181 unsigned long long cmt_no;
1178 spinlock_t cnt_lock; 1182 spinlock_t cnt_lock;
1179 int fmt_version; 1183 int fmt_version;
1184 int ro_compat_version;
1180 unsigned char uuid[16]; 1185 unsigned char uuid[16];
1181 1186
1182 int lhead_lnum; 1187 int lhead_lnum;
@@ -1205,6 +1210,7 @@ struct ubifs_info {
1205 unsigned int no_chk_data_crc:1; 1210 unsigned int no_chk_data_crc:1;
1206 unsigned int bulk_read:1; 1211 unsigned int bulk_read:1;
1207 unsigned int default_compr:2; 1212 unsigned int default_compr:2;
1213 unsigned int rw_incompat:1;
1208 1214
1209 struct mutex tnc_mutex; 1215 struct mutex tnc_mutex;
1210 struct ubifs_zbranch zroot; 1216 struct ubifs_zbranch zroot;
@@ -1253,6 +1259,7 @@ struct ubifs_info {
1253 int min_io_shift; 1259 int min_io_shift;
1254 int leb_size; 1260 int leb_size;
1255 int half_leb_size; 1261 int half_leb_size;
1262 int idx_leb_size;
1256 int leb_cnt; 1263 int leb_cnt;
1257 int max_leb_cnt; 1264 int max_leb_cnt;
1258 int old_leb_cnt; 1265 int old_leb_cnt;
@@ -1500,7 +1507,7 @@ long long ubifs_reported_space(const struct ubifs_info *c, long long free);
1500long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); 1507long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs);
1501 1508
1502/* find.c */ 1509/* find.c */
1503int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *free, 1510int ubifs_find_free_space(struct ubifs_info *c, int min_space, int *offs,
1504 int squeeze); 1511 int squeeze);
1505int ubifs_find_free_leb_for_idx(struct ubifs_info *c); 1512int ubifs_find_free_leb_for_idx(struct ubifs_info *c);
1506int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, 1513int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp,