aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/00-INDEX2
-rw-r--r--Documentation/filesystems/logfs.txt241
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/Makefile1
-rw-r--r--fs/logfs/Kconfig17
-rw-r--r--fs/logfs/Makefile13
-rw-r--r--fs/logfs/compr.c95
-rw-r--r--fs/logfs/dev_bdev.c263
-rw-r--r--fs/logfs/dev_mtd.c253
-rw-r--r--fs/logfs/dir.c818
-rw-r--r--fs/logfs/file.c263
-rw-r--r--fs/logfs/gc.c730
-rw-r--r--fs/logfs/inode.c417
-rw-r--r--fs/logfs/journal.c879
-rw-r--r--fs/logfs/logfs.h722
-rw-r--r--fs/logfs/logfs_abi.h627
-rw-r--r--fs/logfs/readwrite.c2246
-rw-r--r--fs/logfs/segment.c924
-rw-r--r--fs/logfs/super.c634
-rw-r--r--include/linux/btree-128.h109
-rw-r--r--include/linux/btree-type.h147
-rw-r--r--include/linux/btree.h243
-rw-r--r--lib/Kconfig3
-rw-r--r--lib/Makefile1
-rw-r--r--lib/btree.c797
25 files changed, 10446 insertions, 0 deletions
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index f15621ee5599..d362aa543b27 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -62,6 +62,8 @@ jfs.txt
62 - info and mount options for the JFS filesystem. 62 - info and mount options for the JFS filesystem.
63locks.txt 63locks.txt
64 - info on file locking implementations, flock() vs. fcntl(), etc. 64 - info on file locking implementations, flock() vs. fcntl(), etc.
65logfs.txt
66 - info on the LogFS flash filesystem.
65mandatory-locking.txt 67mandatory-locking.txt
66 - info on the Linux implementation of Sys V mandatory file locking. 68 - info on the Linux implementation of Sys V mandatory file locking.
67ncpfs.txt 69ncpfs.txt
diff --git a/Documentation/filesystems/logfs.txt b/Documentation/filesystems/logfs.txt
new file mode 100644
index 000000000000..e64c94ba401a
--- /dev/null
+++ b/Documentation/filesystems/logfs.txt
@@ -0,0 +1,241 @@
1
2The LogFS Flash Filesystem
3==========================
4
5Specification
6=============
7
8Superblocks
9-----------
10
11Two superblocks exist at the beginning and end of the filesystem.
12Each superblock is 256 Bytes large, with another 3840 Bytes reserved
13for future purposes, making a total of 4096 Bytes.
14
15Superblock locations may differ for MTD and block devices. On MTD the
16first non-bad block contains a superblock in the first 4096 Bytes and
17the last non-bad block contains a superblock in the last 4096 Bytes.
18On block devices, the first 4096 Bytes of the device contain the first
19superblock and the last aligned 4096 Byte-block contains the second
20superblock.
21
22For the most part, the superblocks can be considered read-only. They
23are written only to correct errors detected within the superblocks,
24move the journal and change the filesystem parameters through tunefs.
25As a result, the superblock does not contain any fields that require
26constant updates, like the amount of free space, etc.
27
28Segments
29--------
30
31The space in the device is split up into equal-sized segments.
32Segments are the primary write unit of LogFS. Within each segments,
33writes happen from front (low addresses) to back (high addresses. If
34only a partial segment has been written, the segment number, the
35current position within and optionally a write buffer are stored in
36the journal.
37
38Segments are erased as a whole. Therefore Garbage Collection may be
39required to completely free a segment before doing so.
40
41Journal
42--------
43
44The journal contains all global information about the filesystem that
45is subject to frequent change. At mount time, it has to be scanned
46for the most recent commit entry, which contains a list of pointers to
47all currently valid entries.
48
49Object Store
50------------
51
52All space except for the superblocks and journal is part of the object
53store. Each segment contains a segment header and a number of
54objects, each consisting of the object header and the payload.
55Objects are either inodes, directory entries (dentries), file data
56blocks or indirect blocks.
57
58Levels
59------
60
61Garbage collection (GC) may fail if all data is written
62indiscriminately. One requirement of GC is that data is separated
63roughly according to the distance between the tree root and the data.
64Effectively that means all file data is on level 0, indirect blocks
65are on levels 1, 2, 3, 4 or 5 for 1x, 2x, 3x, 4x or 5x indirect blocks,
66respectively. Inode file data is on level 6 for the inodes and 7-11
67for indirect blocks.
68
69Each segment contains objects of a single level only. As a result,
70each level requires its own separate segment to be open for writing.
71
72Inode File
73----------
74
75All inodes are stored in a special file, the inode file. Single
76exception is the inode file's inode (master inode) which for obvious
77reasons is stored in the journal instead. Instead of data blocks, the
78leaf nodes of the inode files are inodes.
79
80Aliases
81-------
82
83Writes in LogFS are done by means of a wandering tree. A naïve
84implementation would require that for each write of a block, all
85parent blocks are written as well, since the block pointers have
86changed. Such an implementation would not be very efficient.
87
88In LogFS, the block pointer changes are cached in the journal by means
89of alias entries. Each alias consists of its logical address - inode
90number, block index, level and child number (index into block) - and
91the changed data. Any 8-byte word can be changed in this manner.
92
93Currently aliases are used for block pointers, file size, file used
94bytes and the height of an inode's indirect tree.
95
96Segment Aliases
97---------------
98
99Related to regular aliases, these are used to handle bad blocks.
100Initially, bad blocks are handled by moving the affected segment
101content to a spare segment and noting this move in the journal with a
102segment alias, a simple (to, from) tuple. GC will later empty this
103segment and the alias can be removed again. This is used on MTD only.
104
105Vim
106---
107
108By cleverly predicting the life time of data, it is possible to
109separate long-living data from short-living data and thereby reduce
110the GC overhead later. Each type of distinct life expectancy (vim) can
111have a separate segment open for writing. Each (level, vim) tuple can
112be open just once. If an open segment with unknown vim is encountered
113at mount time, it is closed and ignored henceforth.
114
115Indirect Tree
116-------------
117
118Inodes in LogFS are similar to FFS-style filesystems with direct and
119indirect block pointers. One difference is that LogFS uses a single
120indirect pointer that can be either a 1x, 2x, etc. indirect pointer.
121A height field in the inode defines the height of the indirect tree
122and thereby the indirection of the pointer.
123
124Another difference is the addressing of indirect blocks. In LogFS,
125the first 16 pointers in the first indirect block are left empty,
126corresponding to the 16 direct pointers in the inode. In ext2 (maybe
127others as well) the first pointer in the first indirect block
128corresponds to logical block 12, skipping the 12 direct pointers.
129So where ext2 is using arithmetic to better utilize space, LogFS keeps
130arithmetic simple and uses compression to save space.
131
132Compression
133-----------
134
135Both file data and metadata can be compressed. Compression for file
136data can be enabled with chattr +c and disabled with chattr -c. Doing
137so has no effect on existing data, but new data will be stored
138accordingly. New inodes will inherit the compression flag of the
139parent directory.
140
141Metadata is always compressed. However, the space accounting ignores
142this and charges for the uncompressed size. Failing to do so could
143result in GC failures when, after moving some data, indirect blocks
144compress worse than previously. Even on a 100% full medium, GC may
145not consume any extra space, so the compression gains are lost space
146to the user.
147
148However, they are not lost space to the filesystem internals. By
149cheating the user for those bytes, the filesystem gained some slack
150space and GC will run less often and faster.
151
152Garbage Collection and Wear Leveling
153------------------------------------
154
155Garbage collection is invoked whenever the number of free segments
156falls below a threshold. The best (known) candidate is picked based
157on the least amount of valid data contained in the segment. All
158remaining valid data is copied elsewhere, thereby invalidating it.
159
160The GC code also checks for aliases and writes them back if their
161number gets too large.
162
163Wear leveling is done by occasionally picking a suboptimal segment for
164garbage collection. If a stale segment's erase count is significantly
165lower than the active segments' erase counts, it will be picked. Wear
166leveling is rate limited, so it will never monopolize the device for
167more than one segment worth at a time.
168
169Values for "occasionally", "significantly lower" are compile time
170constants.
171
172Hashed directories
173------------------
174
175To satisfy efficient lookup(), directory entries are hashed and
176located based on the hash. In order to both support large directories
177and not be overly inefficient for small directories, several hash
178tables of increasing size are used. For each table, the hash value
179modulo the table size gives the table index.
180
181Tables sizes are chosen to limit the number of indirect blocks with a
182fully populated table to 0, 1, 2 or 3 respectively. So the first
183table contains 16 entries, the second 512-16, etc.
184
185The last table is special in several ways. First its size depends on
186the effective 32bit limit on telldir/seekdir cookies. Since logfs
187uses the upper half of the address space for indirect blocks, the size
188is limited to 2^31. Secondly the table contains hash buckets with 16
189entries each.
190
191Using single-entry buckets would result in birthday "attacks". At
192just 2^16 used entries, hash collisions would be likely (P >= 0.5).
193My math skills are insufficient to do the combinatorics for the 17x
194collisions necessary to overflow a bucket, but testing showed that in
19510,000 runs the lowest directory fill before a bucket overflow was
196188,057,130 entries with an average of 315,149,915 entries. So for
197directory sizes of up to a million, bucket overflows should be
198virtually impossible under normal circumstances.
199
200With carefully chosen filenames, it is obviously possible to cause an
201overflow with just 21 entries (4 higher tables + 16 entries + 1). So
202there may be a security concern if a malicious user has write access
203to a directory.
204
205Open For Discussion
206===================
207
208Device Address Space
209--------------------
210
211A device address space is used for caching. Both block devices and
212MTD provide functions to either read a single page or write a segment.
213Partial segments may be written for data integrity, but where possible
214complete segments are written for performance on simple block device
215flash media.
216
217Meta Inodes
218-----------
219
220Inodes are stored in the inode file, which is just a regular file for
221most purposes. At umount time, however, the inode file needs to
222remain open until all dirty inodes are written. So
223generic_shutdown_super() may not close this inode, but shouldn't
224complain about remaining inodes due to the inode file either. Same
225goes for mapping inode of the device address space.
226
227Currently logfs uses a hack that essentially copies part of fs/inode.c
228code over. A general solution would be preferred.
229
230Indirect block mapping
231----------------------
232
233With compression, the block device (or mapping inode) cannot be used
234to cache indirect blocks. Some other place is required. Currently
235logfs uses the top half of each inode's address space. The low 8TB
236(on 32bit) are filled with file data, the high 8TB are used for
237indirect blocks.
238
239One problem is that 16TB files created on 64bit systems actually have
240data in the top 8TB. But files >16TB would cause problems anyway, so
241only the limit has changed.
diff --git a/fs/Kconfig b/fs/Kconfig
index 64d44efad7a5..7405f071be67 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -177,6 +177,7 @@ source "fs/efs/Kconfig"
177source "fs/jffs2/Kconfig" 177source "fs/jffs2/Kconfig"
178# UBIFS File system configuration 178# UBIFS File system configuration
179source "fs/ubifs/Kconfig" 179source "fs/ubifs/Kconfig"
180source "fs/logfs/Kconfig"
180source "fs/cramfs/Kconfig" 181source "fs/cramfs/Kconfig"
181source "fs/squashfs/Kconfig" 182source "fs/squashfs/Kconfig"
182source "fs/freevxfs/Kconfig" 183source "fs/freevxfs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index af6d04700d9c..c3633aa46911 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -99,6 +99,7 @@ obj-$(CONFIG_NTFS_FS) += ntfs/
99obj-$(CONFIG_UFS_FS) += ufs/ 99obj-$(CONFIG_UFS_FS) += ufs/
100obj-$(CONFIG_EFS_FS) += efs/ 100obj-$(CONFIG_EFS_FS) += efs/
101obj-$(CONFIG_JFFS2_FS) += jffs2/ 101obj-$(CONFIG_JFFS2_FS) += jffs2/
102obj-$(CONFIG_LOGFS) += logfs/
102obj-$(CONFIG_UBIFS_FS) += ubifs/ 103obj-$(CONFIG_UBIFS_FS) += ubifs/
103obj-$(CONFIG_AFFS_FS) += affs/ 104obj-$(CONFIG_AFFS_FS) += affs/
104obj-$(CONFIG_ROMFS_FS) += romfs/ 105obj-$(CONFIG_ROMFS_FS) += romfs/
diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig
new file mode 100644
index 000000000000..daf9a9b32dd3
--- /dev/null
+++ b/fs/logfs/Kconfig
@@ -0,0 +1,17 @@
1config LOGFS
2 tristate "LogFS file system (EXPERIMENTAL)"
3 depends on (MTD || BLOCK) && EXPERIMENTAL
4 select ZLIB_INFLATE
5 select ZLIB_DEFLATE
6 select CRC32
7 select BTREE
8 help
9 Flash filesystem aimed to scale efficiently to large devices.
10 In comparison to JFFS2 it offers significantly faster mount
11 times and potentially less RAM usage, although the latter has
12 not been measured yet.
13
14 In its current state it is still very experimental and should
15 not be used for other than testing purposes.
16
17 If unsure, say N.
diff --git a/fs/logfs/Makefile b/fs/logfs/Makefile
new file mode 100644
index 000000000000..4820027787ee
--- /dev/null
+++ b/fs/logfs/Makefile
@@ -0,0 +1,13 @@
1obj-$(CONFIG_LOGFS) += logfs.o
2
3logfs-y += compr.o
4logfs-y += dir.o
5logfs-y += file.o
6logfs-y += gc.o
7logfs-y += inode.o
8logfs-y += journal.o
9logfs-y += readwrite.o
10logfs-y += segment.o
11logfs-y += super.o
12logfs-$(CONFIG_BLOCK) += dev_bdev.o
13logfs-$(CONFIG_MTD) += dev_mtd.o
diff --git a/fs/logfs/compr.c b/fs/logfs/compr.c
new file mode 100644
index 000000000000..44bbfd249abc
--- /dev/null
+++ b/fs/logfs/compr.c
@@ -0,0 +1,95 @@
1/*
2 * fs/logfs/compr.c - compression routines
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/vmalloc.h>
10#include <linux/zlib.h>
11
12#define COMPR_LEVEL 3
13
14static DEFINE_MUTEX(compr_mutex);
15static struct z_stream_s stream;
16
17int logfs_compress(void *in, void *out, size_t inlen, size_t outlen)
18{
19 int err, ret;
20
21 ret = -EIO;
22 mutex_lock(&compr_mutex);
23 err = zlib_deflateInit(&stream, COMPR_LEVEL);
24 if (err != Z_OK)
25 goto error;
26
27 stream.next_in = in;
28 stream.avail_in = inlen;
29 stream.total_in = 0;
30 stream.next_out = out;
31 stream.avail_out = outlen;
32 stream.total_out = 0;
33
34 err = zlib_deflate(&stream, Z_FINISH);
35 if (err != Z_STREAM_END)
36 goto error;
37
38 err = zlib_deflateEnd(&stream);
39 if (err != Z_OK)
40 goto error;
41
42 if (stream.total_out >= stream.total_in)
43 goto error;
44
45 ret = stream.total_out;
46error:
47 mutex_unlock(&compr_mutex);
48 return ret;
49}
50
51int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen)
52{
53 int err, ret;
54
55 ret = -EIO;
56 mutex_lock(&compr_mutex);
57 err = zlib_inflateInit(&stream);
58 if (err != Z_OK)
59 goto error;
60
61 stream.next_in = in;
62 stream.avail_in = inlen;
63 stream.total_in = 0;
64 stream.next_out = out;
65 stream.avail_out = outlen;
66 stream.total_out = 0;
67
68 err = zlib_inflate(&stream, Z_FINISH);
69 if (err != Z_STREAM_END)
70 goto error;
71
72 err = zlib_inflateEnd(&stream);
73 if (err != Z_OK)
74 goto error;
75
76 ret = 0;
77error:
78 mutex_unlock(&compr_mutex);
79 return ret;
80}
81
82int __init logfs_compr_init(void)
83{
84 size_t size = max(zlib_deflate_workspacesize(),
85 zlib_inflate_workspacesize());
86 stream.workspace = vmalloc(size);
87 if (!stream.workspace)
88 return -ENOMEM;
89 return 0;
90}
91
92void logfs_compr_exit(void)
93{
94 vfree(stream.workspace);
95}
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
new file mode 100644
index 000000000000..58a057b6e1af
--- /dev/null
+++ b/fs/logfs/dev_bdev.c
@@ -0,0 +1,263 @@
1/*
2 * fs/logfs/dev_bdev.c - Device access methods for block devices
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/bio.h>
10#include <linux/blkdev.h>
11#include <linux/buffer_head.h>
12
13#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
14
15static void request_complete(struct bio *bio, int err)
16{
17 complete((struct completion *)bio->bi_private);
18}
19
20static int sync_request(struct page *page, struct block_device *bdev, int rw)
21{
22 struct bio bio;
23 struct bio_vec bio_vec;
24 struct completion complete;
25
26 bio_init(&bio);
27 bio.bi_io_vec = &bio_vec;
28 bio_vec.bv_page = page;
29 bio_vec.bv_len = PAGE_SIZE;
30 bio_vec.bv_offset = 0;
31 bio.bi_vcnt = 1;
32 bio.bi_idx = 0;
33 bio.bi_size = PAGE_SIZE;
34 bio.bi_bdev = bdev;
35 bio.bi_sector = page->index * (PAGE_SIZE >> 9);
36 init_completion(&complete);
37 bio.bi_private = &complete;
38 bio.bi_end_io = request_complete;
39
40 submit_bio(rw, &bio);
41 generic_unplug_device(bdev_get_queue(bdev));
42 wait_for_completion(&complete);
43 return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO;
44}
45
46static int bdev_readpage(void *_sb, struct page *page)
47{
48 struct super_block *sb = _sb;
49 struct block_device *bdev = logfs_super(sb)->s_bdev;
50 int err;
51
52 err = sync_request(page, bdev, READ);
53 if (err) {
54 ClearPageUptodate(page);
55 SetPageError(page);
56 } else {
57 SetPageUptodate(page);
58 ClearPageError(page);
59 }
60 unlock_page(page);
61 return err;
62}
63
64static DECLARE_WAIT_QUEUE_HEAD(wq);
65
66static void writeseg_end_io(struct bio *bio, int err)
67{
68 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
69 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
70 struct super_block *sb = bio->bi_private;
71 struct logfs_super *super = logfs_super(sb);
72 struct page *page;
73
74 BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */
75 BUG_ON(err);
76 BUG_ON(bio->bi_vcnt == 0);
77 do {
78 page = bvec->bv_page;
79 if (--bvec >= bio->bi_io_vec)
80 prefetchw(&bvec->bv_page->flags);
81
82 end_page_writeback(page);
83 } while (bvec >= bio->bi_io_vec);
84 bio_put(bio);
85 if (atomic_dec_and_test(&super->s_pending_writes))
86 wake_up(&wq);
87}
88
89static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
90 size_t nr_pages)
91{
92 struct logfs_super *super = logfs_super(sb);
93 struct address_space *mapping = super->s_mapping_inode->i_mapping;
94 struct bio *bio;
95 struct page *page;
96 struct request_queue *q = bdev_get_queue(sb->s_bdev);
97 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
98 int i;
99
100 bio = bio_alloc(GFP_NOFS, max_pages);
101 BUG_ON(!bio); /* FIXME: handle this */
102
103 for (i = 0; i < nr_pages; i++) {
104 if (i >= max_pages) {
105 /* Block layer cannot split bios :( */
106 bio->bi_vcnt = i;
107 bio->bi_idx = 0;
108 bio->bi_size = i * PAGE_SIZE;
109 bio->bi_bdev = super->s_bdev;
110 bio->bi_sector = ofs >> 9;
111 bio->bi_private = sb;
112 bio->bi_end_io = writeseg_end_io;
113 atomic_inc(&super->s_pending_writes);
114 submit_bio(WRITE, bio);
115
116 ofs += i * PAGE_SIZE;
117 index += i;
118 nr_pages -= i;
119 i = 0;
120
121 bio = bio_alloc(GFP_NOFS, max_pages);
122 BUG_ON(!bio);
123 }
124 page = find_lock_page(mapping, index + i);
125 BUG_ON(!page);
126 bio->bi_io_vec[i].bv_page = page;
127 bio->bi_io_vec[i].bv_len = PAGE_SIZE;
128 bio->bi_io_vec[i].bv_offset = 0;
129
130 BUG_ON(PageWriteback(page));
131 set_page_writeback(page);
132 unlock_page(page);
133 }
134 bio->bi_vcnt = nr_pages;
135 bio->bi_idx = 0;
136 bio->bi_size = nr_pages * PAGE_SIZE;
137 bio->bi_bdev = super->s_bdev;
138 bio->bi_sector = ofs >> 9;
139 bio->bi_private = sb;
140 bio->bi_end_io = writeseg_end_io;
141 atomic_inc(&super->s_pending_writes);
142 submit_bio(WRITE, bio);
143 return 0;
144}
145
146static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len)
147{
148 struct logfs_super *super = logfs_super(sb);
149 int head;
150
151 BUG_ON(super->s_flags & LOGFS_SB_FLAG_RO);
152
153 if (len == 0) {
154 /* This can happen when the object fit perfectly into a
155 * segment, the segment gets written per sync and subsequently
156 * closed.
157 */
158 return;
159 }
160 head = ofs & (PAGE_SIZE - 1);
161 if (head) {
162 ofs -= head;
163 len += head;
164 }
165 len = PAGE_ALIGN(len);
166 __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
167 generic_unplug_device(bdev_get_queue(logfs_super(sb)->s_bdev));
168}
169
170static int bdev_erase(struct super_block *sb, loff_t to, size_t len)
171{
172 struct logfs_super *super = logfs_super(sb);
173 struct address_space *mapping = super->s_mapping_inode->i_mapping;
174 struct page *page;
175 pgoff_t index = to >> PAGE_SHIFT;
176 int i, nr_pages = len >> PAGE_SHIFT;
177
178 BUG_ON(to & (PAGE_SIZE - 1));
179 BUG_ON(len & (PAGE_SIZE - 1));
180
181 if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO)
182 return -EROFS;
183
184 for (i = 0; i < nr_pages; i++) {
185 page = find_get_page(mapping, index + i);
186 if (page) {
187 memset(page_address(page), 0xFF, PAGE_SIZE);
188 page_cache_release(page);
189 }
190 }
191 return 0;
192}
193
194static void bdev_sync(struct super_block *sb)
195{
196 struct logfs_super *super = logfs_super(sb);
197
198 wait_event(wq, atomic_read(&super->s_pending_writes) == 0);
199}
200
201static struct page *bdev_find_first_sb(struct super_block *sb, u64 *ofs)
202{
203 struct logfs_super *super = logfs_super(sb);
204 struct address_space *mapping = super->s_mapping_inode->i_mapping;
205 filler_t *filler = bdev_readpage;
206
207 *ofs = 0;
208 return read_cache_page(mapping, 0, filler, sb);
209}
210
211static struct page *bdev_find_last_sb(struct super_block *sb, u64 *ofs)
212{
213 struct logfs_super *super = logfs_super(sb);
214 struct address_space *mapping = super->s_mapping_inode->i_mapping;
215 filler_t *filler = bdev_readpage;
216 u64 pos = (super->s_bdev->bd_inode->i_size & ~0xfffULL) - 0x1000;
217 pgoff_t index = pos >> PAGE_SHIFT;
218
219 *ofs = pos;
220 return read_cache_page(mapping, index, filler, sb);
221}
222
223static int bdev_write_sb(struct super_block *sb, struct page *page)
224{
225 struct block_device *bdev = logfs_super(sb)->s_bdev;
226
227 /* Nothing special to do for block devices. */
228 return sync_request(page, bdev, WRITE);
229}
230
231static void bdev_put_device(struct super_block *sb)
232{
233 close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE);
234}
235
236static const struct logfs_device_ops bd_devops = {
237 .find_first_sb = bdev_find_first_sb,
238 .find_last_sb = bdev_find_last_sb,
239 .write_sb = bdev_write_sb,
240 .readpage = bdev_readpage,
241 .writeseg = bdev_writeseg,
242 .erase = bdev_erase,
243 .sync = bdev_sync,
244 .put_device = bdev_put_device,
245};
246
247int logfs_get_sb_bdev(struct file_system_type *type, int flags,
248 const char *devname, struct vfsmount *mnt)
249{
250 struct block_device *bdev;
251
252 bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type);
253 if (IS_ERR(bdev))
254 return PTR_ERR(bdev);
255
256 if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
257 int mtdnr = MINOR(bdev->bd_dev);
258 close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
259 return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
260 }
261
262 return logfs_get_sb_device(type, flags, NULL, bdev, &bd_devops, mnt);
263}
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
new file mode 100644
index 000000000000..68e99d046c23
--- /dev/null
+++ b/fs/logfs/dev_mtd.c
@@ -0,0 +1,253 @@
1/*
2 * fs/logfs/dev_mtd.c - Device access methods for MTD
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/completion.h>
10#include <linux/mount.h>
11#include <linux/sched.h>
12
13#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
14
15static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf)
16{
17 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
18 size_t retlen;
19 int ret;
20
21 ret = mtd->read(mtd, ofs, len, &retlen, buf);
22 BUG_ON(ret == -EINVAL);
23 if (ret)
24 return ret;
25
26 /* Not sure if we should loop instead. */
27 if (retlen != len)
28 return -EIO;
29
30 return 0;
31}
32
33static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf)
34{
35 struct logfs_super *super = logfs_super(sb);
36 struct mtd_info *mtd = super->s_mtd;
37 size_t retlen;
38 loff_t page_start, page_end;
39 int ret;
40
41 if (super->s_flags & LOGFS_SB_FLAG_RO)
42 return -EROFS;
43
44 BUG_ON((ofs >= mtd->size) || (len > mtd->size - ofs));
45 BUG_ON(ofs != (ofs >> super->s_writeshift) << super->s_writeshift);
46 BUG_ON(len > PAGE_CACHE_SIZE);
47 page_start = ofs & PAGE_CACHE_MASK;
48 page_end = PAGE_CACHE_ALIGN(ofs + len) - 1;
49 ret = mtd->write(mtd, ofs, len, &retlen, buf);
50 if (ret || (retlen != len))
51 return -EIO;
52
53 return 0;
54}
55
56/*
57 * For as long as I can remember (since about 2001) mtd->erase has been an
58 * asynchronous interface lacking the first driver to actually use the
59 * asynchronous properties. So just to prevent the first implementor of such
60 * a thing from breaking logfs in 2350, we do the usual pointless dance to
61 * declare a completion variable and wait for completion before returning
62 * from mtd_erase(). What an exercise in futility!
63 */
64static void logfs_erase_callback(struct erase_info *ei)
65{
66 complete((struct completion *)ei->priv);
67}
68
69static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len)
70{
71 struct logfs_super *super = logfs_super(sb);
72 struct address_space *mapping = super->s_mapping_inode->i_mapping;
73 struct page *page;
74 pgoff_t index = ofs >> PAGE_SHIFT;
75
76 for (index = ofs >> PAGE_SHIFT; index < (ofs + len) >> PAGE_SHIFT; index++) {
77 page = find_get_page(mapping, index);
78 if (!page)
79 continue;
80 memset(page_address(page), 0xFF, PAGE_SIZE);
81 page_cache_release(page);
82 }
83 return 0;
84}
85
86static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len)
87{
88 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
89 struct erase_info ei;
90 DECLARE_COMPLETION_ONSTACK(complete);
91 int ret;
92
93 BUG_ON(len % mtd->erasesize);
94 if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO)
95 return -EROFS;
96
97 memset(&ei, 0, sizeof(ei));
98 ei.mtd = mtd;
99 ei.addr = ofs;
100 ei.len = len;
101 ei.callback = logfs_erase_callback;
102 ei.priv = (long)&complete;
103 ret = mtd->erase(mtd, &ei);
104 if (ret)
105 return -EIO;
106
107 wait_for_completion(&complete);
108 if (ei.state != MTD_ERASE_DONE)
109 return -EIO;
110 return mtd_erase_mapping(sb, ofs, len);
111}
112
113static void mtd_sync(struct super_block *sb)
114{
115 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
116
117 if (mtd->sync)
118 mtd->sync(mtd);
119}
120
121static int mtd_readpage(void *_sb, struct page *page)
122{
123 struct super_block *sb = _sb;
124 int err;
125
126 err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
127 page_address(page));
128 if (err == -EUCLEAN) {
129 err = 0;
130 /* FIXME: force GC this segment */
131 }
132 if (err) {
133 ClearPageUptodate(page);
134 SetPageError(page);
135 } else {
136 SetPageUptodate(page);
137 ClearPageError(page);
138 }
139 unlock_page(page);
140 return err;
141}
142
143static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs)
144{
145 struct logfs_super *super = logfs_super(sb);
146 struct address_space *mapping = super->s_mapping_inode->i_mapping;
147 filler_t *filler = mtd_readpage;
148 struct mtd_info *mtd = super->s_mtd;
149
150 if (!mtd->block_isbad)
151 return NULL;
152
153 *ofs = 0;
154 while (mtd->block_isbad(mtd, *ofs)) {
155 *ofs += mtd->erasesize;
156 if (*ofs >= mtd->size)
157 return NULL;
158 }
159 BUG_ON(*ofs & ~PAGE_MASK);
160 return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
161}
162
163static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs)
164{
165 struct logfs_super *super = logfs_super(sb);
166 struct address_space *mapping = super->s_mapping_inode->i_mapping;
167 filler_t *filler = mtd_readpage;
168 struct mtd_info *mtd = super->s_mtd;
169
170 if (!mtd->block_isbad)
171 return NULL;
172
173 *ofs = mtd->size - mtd->erasesize;
174 while (mtd->block_isbad(mtd, *ofs)) {
175 *ofs -= mtd->erasesize;
176 if (*ofs <= 0)
177 return NULL;
178 }
179 *ofs = *ofs + mtd->erasesize - 0x1000;
180 BUG_ON(*ofs & ~PAGE_MASK);
181 return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
182}
183
184static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
185 size_t nr_pages)
186{
187 struct logfs_super *super = logfs_super(sb);
188 struct address_space *mapping = super->s_mapping_inode->i_mapping;
189 struct page *page;
190 int i, err;
191
192 for (i = 0; i < nr_pages; i++) {
193 page = find_lock_page(mapping, index + i);
194 BUG_ON(!page);
195
196 err = mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
197 page_address(page));
198 unlock_page(page);
199 page_cache_release(page);
200 if (err)
201 return err;
202 }
203 return 0;
204}
205
206static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len)
207{
208 struct logfs_super *super = logfs_super(sb);
209 int head;
210
211 if (super->s_flags & LOGFS_SB_FLAG_RO)
212 return;
213
214 if (len == 0) {
215 /* This can happen when the object fit perfectly into a
216 * segment, the segment gets written per sync and subsequently
217 * closed.
218 */
219 return;
220 }
221 head = ofs & (PAGE_SIZE - 1);
222 if (head) {
223 ofs -= head;
224 len += head;
225 }
226 len = PAGE_ALIGN(len);
227 __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
228}
229
230static void mtd_put_device(struct super_block *sb)
231{
232 put_mtd_device(logfs_super(sb)->s_mtd);
233}
234
235static const struct logfs_device_ops mtd_devops = {
236 .find_first_sb = mtd_find_first_sb,
237 .find_last_sb = mtd_find_last_sb,
238 .readpage = mtd_readpage,
239 .writeseg = mtd_writeseg,
240 .erase = mtd_erase,
241 .sync = mtd_sync,
242 .put_device = mtd_put_device,
243};
244
245int logfs_get_sb_mtd(struct file_system_type *type, int flags,
246 int mtdnr, struct vfsmount *mnt)
247{
248 struct mtd_info *mtd;
249 const struct logfs_device_ops *devops = &mtd_devops;
250
251 mtd = get_mtd_device(NULL, mtdnr);
252 return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt);
253}
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
new file mode 100644
index 000000000000..89104e6f81c4
--- /dev/null
+++ b/fs/logfs/dir.c
@@ -0,0 +1,818 @@
1/*
2 * fs/logfs/dir.c - directory-related code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9
10
11/*
12 * Atomic dir operations
13 *
14 * Directory operations are by default not atomic. Dentries and Inodes are
 * created/removed/altered in separate operations. Therefore we need to do
16 * a small amount of journaling.
17 *
18 * Create, link, mkdir, mknod and symlink all share the same function to do
19 * the work: __logfs_create. This function works in two atomic steps:
20 * 1. allocate inode (remember in journal)
21 * 2. allocate dentry (clear journal)
22 *
23 * As we can only get interrupted between the two, when the inode we just
24 * created is simply stored in the anchor. On next mount, if we were
25 * interrupted, we delete the inode. From a users point of view the
26 * operation never happened.
27 *
28 * Unlink and rmdir also share the same function: unlink. Again, this
29 * function works in two atomic steps
30 * 1. remove dentry (remember inode in journal)
31 * 2. unlink inode (clear journal)
32 *
33 * And again, on the next mount, if we were interrupted, we delete the inode.
34 * From a users point of view the operation succeeded.
35 *
36 * Rename is the real pain to deal with, harder than all the other methods
37 * combined. Depending on the circumstances we can run into three cases.
38 * A "target rename" where the target dentry already existed, a "local
39 * rename" where both parent directories are identical or a "cross-directory
40 * rename" in the remaining case.
41 *
42 * Local rename is atomic, as the old dentry is simply rewritten with a new
43 * name.
44 *
45 * Cross-directory rename works in two steps, similar to __logfs_create and
46 * logfs_unlink:
47 * 1. Write new dentry (remember old dentry in journal)
48 * 2. Remove old dentry (clear journal)
49 *
50 * Here we remember a dentry instead of an inode. On next mount, if we were
51 * interrupted, we delete the dentry. From a users point of view, the
52 * operation succeeded.
53 *
54 * Target rename works in three atomic steps:
55 * 1. Attach old inode to new dentry (remember old dentry and new inode)
56 * 2. Remove old dentry (still remember the new inode)
57 * 3. Remove victim inode
58 *
 * Here we remember both an inode and a dentry.  If we get interrupted
60 * between steps 1 and 2, we delete both the dentry and the inode. If
61 * we get interrupted between steps 2 and 3, we delete just the inode.
62 * In either case, the remaining objects are deleted on next mount. From
63 * a users point of view, the operation succeeded.
64 */
65
/* Write one on-disk dentry at byte offset @pos within directory @dir. */
static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd,
		loff_t pos)
{
	return logfs_inode_write(dir, dd, sizeof(*dd), pos, WF_LOCK, NULL);
}

/* Flush an inode to the medium, taking the write lock. */
static int write_inode(struct inode *inode)
{
	return __logfs_write_inode(inode, WF_LOCK);
}

/*
 * Find the position of the next data block at or after @pos.
 * logfs_seek_data() can return a position past @pos; subtracting one
 * compensates for the caller's loop increment so no block is skipped.
 */
static s64 dir_seek_data(struct inode *inode, s64 pos)
{
	s64 new_pos = logfs_seek_data(inode, pos);

	return max(pos, new_pos - 1);
}

/* Does block index @bix lie at or beyond the end of the directory file? */
static int beyond_eof(struct inode *inode, loff_t bix)
{
	loff_t pos = bix << inode->i_sb->s_blocksize_bits;
	return pos >= i_size_read(inode);
}
89
90/*
91 * Prime value was chosen to be roughly 256 + 26. r5 hash uses 11,
92 * so short names (len <= 9) don't even occupy the complete 32bit name
93 * space. A prime >256 ensures short names quickly spread the 32bit
94 * name space. Add about 26 for the estimated amount of information
 * of each character and pick a prime nearby, preferably a bit-sparse
96 * one.
97 */
98static u32 hash_32(const char *s, int len, u32 seed)
99{
100 u32 hash = seed;
101 int i;
102
103 for (i = 0; i < len; i++)
104 hash = hash * 293 + s[i];
105 return hash;
106}
107
108/*
109 * We have to satisfy several conflicting requirements here. Small
110 * directories should stay fairly compact and not require too many
111 * indirect blocks. The number of possible locations for a given hash
112 * should be small to make lookup() fast. And we should try hard not
113 * to overflow the 32bit name space or nfs and 32bit host systems will
114 * be unhappy.
115 *
116 * So we use the following scheme. First we reduce the hash to 0..15
117 * and try a direct block. If that is occupied we reduce the hash to
118 * 16..255 and try an indirect block. Same for 2x and 3x indirect
119 * blocks. Lastly we reduce the hash to 0x800_0000 .. 0xffff_ffff,
120 * but use buckets containing eight entries instead of a single one.
121 *
122 * Using 16 entries should allow for a reasonable amount of hash
123 * collisions, so the 32bit name space can be packed fairly tight
 * before overflowing.  Oh and currently we don't overflow but return
 * an error.
126 *
127 * How likely are collisions? Doing the appropriate math is beyond me
128 * and the Bronstein textbook. But running a test program to brute
129 * force collisions for a couple of days showed that on average the
130 * first collision occurs after 598M entries, with 290M being the
131 * smallest result. Obviously 21 entries could already cause a
132 * collision if all entries are carefully chosen.
133 */
134static pgoff_t hash_index(u32 hash, int round)
135{
136 switch (round) {
137 case 0:
138 return hash % I0_BLOCKS;
139 case 1:
140 return I0_BLOCKS + hash % (I1_BLOCKS - I0_BLOCKS);
141 case 2:
142 return I1_BLOCKS + hash % (I2_BLOCKS - I1_BLOCKS);
143 case 3:
144 return I2_BLOCKS + hash % (I3_BLOCKS - I2_BLOCKS);
145 case 4 ... 19:
146 return I3_BLOCKS + 16 * (hash % (((1<<31) - I3_BLOCKS) / 16))
147 + round - 4;
148 }
149 BUG();
150}
151
/*
 * Look up the page holding the on-disk dentry for @dentry in @dir.
 *
 * Probes up to 20 candidate block positions derived from the name hash
 * (see hash_index()) and compares the stored name at each occupied slot.
 *
 * Returns the page with an elevated refcount on a match, NULL if the
 * dentry does not exist, or an ERR_PTR on failure.
 */
static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry)
{
	struct qstr *name = &dentry->d_name;
	struct page *page;
	struct logfs_disk_dentry *dd;
	u32 hash = hash_32(name->name, name->len, 0);
	pgoff_t index;
	int round;

	if (name->len > LOGFS_MAX_NAMELEN)
		return ERR_PTR(-ENAMETOOLONG);

	for (round = 0; round < 20; round++) {
		index = hash_index(hash, round);

		/* Probe position beyond EOF: the dentry cannot exist. */
		if (beyond_eof(dir, index))
			return NULL;
		if (!logfs_exist_block(dir, index))
			continue;
		page = read_cache_page(dir->i_mapping, index,
				(filler_t *)logfs_readpage, NULL);
		if (IS_ERR(page))
			return page;
		dd = kmap_atomic(page, KM_USER0);
		/* An existing dentry block must carry a name. */
		BUG_ON(dd->namelen == 0);

		/* Hash collision: same slot, different name - keep probing. */
		if (name->len != be16_to_cpu(dd->namelen) ||
				memcmp(name->name, dd->name, name->len)) {
			kunmap_atomic(dd, KM_USER0);
			page_cache_release(page);
			continue;
		}

		kunmap_atomic(dd, KM_USER0);
		return page;
	}
	return NULL;
}
190
/* Drop one link from @inode and write it out.  A write failure at this
 * point is a filesystem bug (LOGFS_BUG_ON). */
static int logfs_remove_inode(struct inode *inode)
{
	int ret;

	inode->i_nlink--;
	ret = write_inode(inode);
	LOGFS_BUG_ON(ret, inode->i_sb);
	return ret;
}

/* Detach a failed transaction from the inode's block (if attached) and
 * free it. */
static void abort_transaction(struct inode *inode, struct logfs_transaction *ta)
{
	if (logfs_inode(inode)->li_block)
		logfs_inode(inode)->li_block->ta = NULL;
	kfree(ta);
}
207
208static int logfs_unlink(struct inode *dir, struct dentry *dentry)
209{
210 struct logfs_super *super = logfs_super(dir->i_sb);
211 struct inode *inode = dentry->d_inode;
212 struct logfs_transaction *ta;
213 struct page *page;
214 pgoff_t index;
215 int ret;
216
217 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
218 if (!ta)
219 return -ENOMEM;
220
221 ta->state = UNLINK_1;
222 ta->ino = inode->i_ino;
223
224 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
225
226 page = logfs_get_dd_page(dir, dentry);
227 if (!page)
228 return -ENOENT;
229 if (IS_ERR(page))
230 return PTR_ERR(page);
231 index = page->index;
232 page_cache_release(page);
233
234 mutex_lock(&super->s_dirop_mutex);
235 logfs_add_transaction(dir, ta);
236
237 ret = logfs_delete(dir, index, NULL);
238 if (!ret)
239 ret = write_inode(dir);
240
241 if (ret) {
242 abort_transaction(dir, ta);
243 printk(KERN_ERR"LOGFS: unable to delete inode\n");
244 goto out;
245 }
246
247 ta->state = UNLINK_2;
248 logfs_add_transaction(inode, ta);
249 ret = logfs_remove_inode(inode);
250out:
251 mutex_unlock(&super->s_dirop_mutex);
252 return ret;
253}
254
/* A directory is empty iff seeking for data from position 0 runs past
 * EOF (i.e. it contains no dentry blocks at all). */
static inline int logfs_empty_dir(struct inode *dir)
{
	u64 data;

	data = logfs_seek_data(dir, 0) << dir->i_sb->s_blocksize_bits;
	return data >= i_size_read(dir);
}

/* ->rmdir: refuse non-empty directories, otherwise behave like unlink. */
static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
{
	struct inode *inode = dentry->d_inode;

	if (!logfs_empty_dir(inode))
		return -ENOTEMPTY;

	return logfs_unlink(dir, dentry);
}
272
/* FIXME: readdir currently has its own dir_walk code.  I don't see a good
 * way to combine the two copies */
#define IMPLICIT_NODES 2
/*
 * Walk the dentry blocks from the current file position and feed each
 * name to @filldir.  f_pos is offset by IMPLICIT_NODES to make room for
 * the synthetic "." and ".." entries emitted by logfs_readdir().
 */
static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
{
	struct inode *dir = file->f_dentry->d_inode;
	loff_t pos = file->f_pos - IMPLICIT_NODES;
	struct page *page;
	struct logfs_disk_dentry *dd;
	int full;

	BUG_ON(pos < 0);
	for (;; pos++) {
		if (beyond_eof(dir, pos))
			break;
		if (!logfs_exist_block(dir, pos)) {
			/* deleted dentry: skip ahead to the next data block */
			pos = dir_seek_data(dir, pos);
			continue;
		}
		page = read_cache_page(dir->i_mapping, pos,
				(filler_t *)logfs_readpage, NULL);
		if (IS_ERR(page))
			return PTR_ERR(page);
		dd = kmap_atomic(page, KM_USER0);
		BUG_ON(dd->namelen == 0);

		full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen),
				pos, be64_to_cpu(dd->ino), dd->type);
		kunmap_atomic(dd, KM_USER0);
		page_cache_release(page);
		/* Non-zero return means the caller's buffer is full. */
		if (full)
			break;
	}

	file->f_pos = pos + IMPLICIT_NODES;
	return 0;
}
311
/* ->readdir: emit "." and ".." at positions 0 and 1, then the real
 * entries via __logfs_readdir(). */
static int logfs_readdir(struct file *file, void *buf, filldir_t filldir)
{
	struct inode *inode = file->f_dentry->d_inode;
	ino_t pino = parent_ino(file->f_dentry);
	int err;

	if (file->f_pos < 0)
		return -EINVAL;

	if (file->f_pos == 0) {
		if (filldir(buf, ".", 1, 1, inode->i_ino, DT_DIR) < 0)
			return 0;
		file->f_pos++;
	}
	if (file->f_pos == 1) {
		if (filldir(buf, "..", 2, 2, pino, DT_DIR) < 0)
			return 0;
		file->f_pos++;
	}

	err = __logfs_readdir(file, buf, filldir);
	return err;
}

/* Store a name into an on-disk dentry: big-endian length prefix plus the
 * raw bytes, not NUL-terminated. */
static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name)
{
	dd->namelen = cpu_to_be16(name->len);
	memcpy(dd->name, name->name, name->len);
}
341
342static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry,
343 struct nameidata *nd)
344{
345 struct page *page;
346 struct logfs_disk_dentry *dd;
347 pgoff_t index;
348 u64 ino = 0;
349 struct inode *inode;
350
351 page = logfs_get_dd_page(dir, dentry);
352 if (IS_ERR(page))
353 return ERR_CAST(page);
354 if (!page) {
355 d_add(dentry, NULL);
356 return NULL;
357 }
358 index = page->index;
359 dd = kmap_atomic(page, KM_USER0);
360 ino = be64_to_cpu(dd->ino);
361 kunmap_atomic(dd, KM_USER0);
362 page_cache_release(page);
363
364 inode = logfs_iget(dir->i_sb, ino);
365 if (IS_ERR(inode)) {
366 printk(KERN_ERR"LogFS: Cannot read inode #%llx for dentry (%lx, %lx)n",
367 ino, dir->i_ino, index);
368 return ERR_CAST(inode);
369 }
370 return d_splice_alias(inode, dentry);
371}
372
373static void grow_dir(struct inode *dir, loff_t index)
374{
375 index = (index + 1) << dir->i_sb->s_blocksize_bits;
376 if (i_size_read(dir) < index)
377 i_size_write(dir, index);
378}
379
/*
 * Insert a dentry naming @inode into @dir.  Probes the same 20 candidate
 * positions as logfs_get_dd_page() and takes the first free slot.
 *
 * Returns 0 on success, -ENOMEM, a write error, or -ENOSPC when every
 * candidate slot for this hash is occupied.
 */
static int logfs_write_dir(struct inode *dir, struct dentry *dentry,
		struct inode *inode)
{
	struct page *page;
	struct logfs_disk_dentry *dd;
	u32 hash = hash_32(dentry->d_name.name, dentry->d_name.len, 0);
	pgoff_t index;
	int round, err;

	for (round = 0; round < 20; round++) {
		index = hash_index(hash, round);

		/* Slot already taken - try the next probe position. */
		if (logfs_exist_block(dir, index))
			continue;
		page = find_or_create_page(dir->i_mapping, index, GFP_KERNEL);
		if (!page)
			return -ENOMEM;

		dd = kmap_atomic(page, KM_USER0);
		memset(dd, 0, sizeof(*dd));
		dd->ino = cpu_to_be64(inode->i_ino);
		dd->type = logfs_type(inode);
		logfs_set_name(dd, &dentry->d_name);
		kunmap_atomic(dd, KM_USER0);

		err = logfs_write_buf(dir, page, WF_LOCK);
		unlock_page(page);
		page_cache_release(page);
		if (!err)
			grow_dir(dir, index);
		return err;
	}
	/* FIXME: Is there a better return value?  In most cases neither
	 * the filesystem nor the directory are full.  But we have had
	 * too many collisions for this particular hash and no fallback.
	 */
	return -ENOSPC;
}
418
/*
 * Common tail of create/link/mkdir/mknod/symlink: write the inode (plus
 * the symlink target, if @dest is set), then the dentry, journaling the
 * intermediate state so a crash between the two steps can be replayed
 * (see the comment at the top of this file).  Consumes the caller's
 * inode reference on failure.
 */
static int __logfs_create(struct inode *dir, struct dentry *dentry,
		struct inode *inode, const char *dest, long destlen)
{
	struct logfs_super *super = logfs_super(dir->i_sb);
	struct logfs_inode *li = logfs_inode(inode);
	struct logfs_transaction *ta;
	int ret;

	ta = kzalloc(sizeof(*ta), GFP_KERNEL);
	if (!ta)
		return -ENOMEM;

	/* Step 1: write the inode; the journal remembers it (CREATE_1). */
	ta->state = CREATE_1;
	ta->ino = inode->i_ino;
	mutex_lock(&super->s_dirop_mutex);
	logfs_add_transaction(inode, ta);

	if (dest) {
		/* symlink: the target is stored as the inode's data */
		ret = logfs_inode_write(inode, dest, destlen, 0, WF_LOCK, NULL);
		if (!ret)
			ret = write_inode(inode);
	} else {
		/* creat/mkdir/mknod */
		ret = write_inode(inode);
	}
	if (ret) {
		abort_transaction(inode, ta);
		li->li_flags |= LOGFS_IF_STILLBORN;
		/* FIXME: truncate symlink */
		inode->i_nlink--;
		iput(inode);
		goto out;
	}

	/* Step 2: write the dentry and sync the directory (CREATE_2). */
	ta->state = CREATE_2;
	logfs_add_transaction(dir, ta);
	ret = logfs_write_dir(dir, dentry, inode);
	/* sync directory */
	if (!ret)
		ret = write_inode(dir);

	if (ret) {
		/* Undo step 1: remove the just-created inode again. */
		logfs_del_transaction(dir, ta);
		/* NOTE(review): ta->state is already CREATE_2 here; this
		 * re-assignment looks redundant - confirm intent. */
		ta->state = CREATE_2;
		logfs_add_transaction(inode, ta);
		logfs_remove_inode(inode);
		iput(inode);
		goto out;
	}
	d_instantiate(dentry, inode);
out:
	mutex_unlock(&super->s_dirop_mutex);
	return ret;
}
474
/* ->mkdir: allocate a directory inode and wire it up via __logfs_create(). */
static int logfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
{
	struct inode *inode;

	/*
	 * FIXME: why do we have to fill in S_IFDIR, while the mode is
	 * correct for mknod, creat, etc.?  Smells like the vfs *should*
	 * do it for us but for some reason fails to do so.
	 */
	inode = logfs_new_inode(dir, S_IFDIR | mode);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	inode->i_op = &logfs_dir_iops;
	inode->i_fop = &logfs_dir_fops;

	return __logfs_create(dir, dentry, inode, NULL, 0);
}

/* ->create: allocate a regular-file inode. */
static int logfs_create(struct inode *dir, struct dentry *dentry, int mode,
		struct nameidata *nd)
{
	struct inode *inode;

	inode = logfs_new_inode(dir, mode);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	inode->i_op = &logfs_reg_iops;
	inode->i_fop = &logfs_reg_fops;
	inode->i_mapping->a_ops = &logfs_reg_aops;

	return __logfs_create(dir, dentry, inode, NULL, 0);
}

/* ->mknod: allocate a device/fifo/socket inode. */
static int logfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
		dev_t rdev)
{
	struct inode *inode;

	if (dentry->d_name.len > LOGFS_MAX_NAMELEN)
		return -ENAMETOOLONG;

	inode = logfs_new_inode(dir, mode);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	init_special_inode(inode, mode, rdev);

	return __logfs_create(dir, dentry, inode, NULL, 0);
}

/* ->symlink: the target string (including its NUL) must fit one block;
 * it is stored as the symlink inode's file data. */
static int logfs_symlink(struct inode *dir, struct dentry *dentry,
		const char *target)
{
	struct inode *inode;
	size_t destlen = strlen(target) + 1;

	if (destlen > dir->i_sb->s_blocksize)
		return -ENAMETOOLONG;

	inode = logfs_new_inode(dir, S_IFLNK | 0777);
	if (IS_ERR(inode))
		return PTR_ERR(inode);

	inode->i_op = &logfs_symlink_iops;
	inode->i_mapping->a_ops = &logfs_reg_aops;

	return __logfs_create(dir, dentry, inode, target, destlen);
}
545
/* ->permission: plain mode-bit checks, no ACL callback. */
static int logfs_permission(struct inode *inode, int mask)
{
	return generic_permission(inode, mask, NULL);
}

/* ->link: bump the link count and add another dentry for @inode. */
static int logfs_link(struct dentry *old_dentry, struct inode *dir,
		struct dentry *dentry)
{
	struct inode *inode = old_dentry->d_inode;

	if (inode->i_nlink >= LOGFS_LINK_MAX)
		return -EMLINK;

	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
	/* __logfs_create() consumes one inode reference (iput on failure) */
	atomic_inc(&inode->i_count);
	inode->i_nlink++;
	mark_inode_dirty_sync(inode);

	return __logfs_create(dir, dentry, inode, NULL, 0);
}
566
567static int logfs_get_dd(struct inode *dir, struct dentry *dentry,
568 struct logfs_disk_dentry *dd, loff_t *pos)
569{
570 struct page *page;
571 void *map;
572
573 page = logfs_get_dd_page(dir, dentry);
574 if (IS_ERR(page))
575 return PTR_ERR(page);
576 *pos = page->index;
577 map = kmap_atomic(page, KM_USER0);
578 memcpy(dd, map, sizeof(*dd));
579 kunmap_atomic(map, KM_USER0);
580 page_cache_release(page);
581 return 0;
582}
583
/* Remove the on-disk dentry at byte offset @pos in directory @dir. */
static int logfs_delete_dd(struct inode *dir, loff_t pos)
{
	/*
	 * Getting called with pos somewhere beyond eof is either a goofup
	 * within this file or means someone maliciously edited the
	 * (crc-protected) journal.
	 */
	BUG_ON(beyond_eof(dir, pos));
	dir->i_ctime = dir->i_mtime = CURRENT_TIME;
	log_dir(" Delete dentry (%lx, %llx)\n", dir->i_ino, pos);
	return logfs_delete(dir, pos, NULL);
}
596
/*
 * Cross-directory rename, target does not exist.  Just a little nasty.
 * Create a new dentry in the target dir, then remove the old dentry,
 * all the while taking care to remember our operation in the journal.
 */
static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry,
		struct inode *new_dir, struct dentry *new_dentry)
{
	struct logfs_super *super = logfs_super(old_dir->i_sb);
	struct logfs_disk_dentry dd;
	struct logfs_transaction *ta;
	loff_t pos;
	int err;

	/* 1. locate source dd */
	err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
	if (err)
		return err;

	ta = kzalloc(sizeof(*ta), GFP_KERNEL);
	if (!ta)
		return -ENOMEM;

	ta->state = CROSS_RENAME_1;
	ta->dir = old_dir->i_ino;
	ta->pos = pos;

	/* 2. write target dd */
	mutex_lock(&super->s_dirop_mutex);
	logfs_add_transaction(new_dir, ta);
	err = logfs_write_dir(new_dir, new_dentry, old_dentry->d_inode);
	if (!err)
		err = write_inode(new_dir);

	if (err) {
		/* NOTE(review): s_rename_dir/s_rename_pos are cleared here;
		 * presumably the journal code set them from ta - confirm
		 * against journal.c. */
		super->s_rename_dir = 0;
		super->s_rename_pos = 0;
		abort_transaction(new_dir, ta);
		goto out;
	}

	/* 3. remove source dd - failure past this point is a fs bug */
	ta->state = CROSS_RENAME_2;
	logfs_add_transaction(old_dir, ta);
	err = logfs_delete_dd(old_dir, pos);
	if (!err)
		err = write_inode(old_dir);
	LOGFS_BUG_ON(err, old_dir->i_sb);
out:
	mutex_unlock(&super->s_dirop_mutex);
	return err;
}
649
/*
 * Point the existing dentry for @dentry at @inode (used by target rename),
 * rewriting it in place and syncing the directory.  On success @dd holds
 * the rewritten dentry; @inode's previous owner becomes orphaned.
 */
static int logfs_replace_inode(struct inode *dir, struct dentry *dentry,
		struct logfs_disk_dentry *dd, struct inode *inode)
{
	loff_t pos;
	int err;

	err = logfs_get_dd(dir, dentry, dd, &pos);
	if (err)
		return err;
	dd->ino = cpu_to_be64(inode->i_ino);
	dd->type = logfs_type(inode);

	err = write_dir(dir, dd, pos);
	if (err)
		return err;
	log_dir("Replace dentry (%lx, %llx) %s -> %llx\n", dir->i_ino, pos,
			dd->name, be64_to_cpu(dd->ino));
	return write_inode(dir);
}
669
/* Target dentry exists - the worst case.  We need to attach the source
 * inode to the target dentry, then remove the orphaned target inode and
 * source dentry.
 */
static int logfs_rename_target(struct inode *old_dir, struct dentry *old_dentry,
		struct inode *new_dir, struct dentry *new_dentry)
{
	struct logfs_super *super = logfs_super(old_dir->i_sb);
	struct inode *old_inode = old_dentry->d_inode;
	struct inode *new_inode = new_dentry->d_inode;
	int isdir = S_ISDIR(old_inode->i_mode);
	struct logfs_disk_dentry dd;
	struct logfs_transaction *ta;
	loff_t pos;
	int err;

	/* NOTE(review): relies on the VFS never mixing a dir with a
	 * non-dir in rename - confirm this invariant. */
	BUG_ON(isdir != S_ISDIR(new_inode->i_mode));
	if (isdir) {
		if (!logfs_empty_dir(new_inode))
			return -ENOTEMPTY;
	}

	/* 1. locate source dd */
	err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
	if (err)
		return err;

	ta = kzalloc(sizeof(*ta), GFP_KERNEL);
	if (!ta)
		return -ENOMEM;

	ta->state = TARGET_RENAME_1;
	ta->dir = old_dir->i_ino;
	ta->pos = pos;
	ta->ino = new_inode->i_ino;

	/* 2. attach source inode to target dd */
	mutex_lock(&super->s_dirop_mutex);
	logfs_add_transaction(new_dir, ta);
	err = logfs_replace_inode(new_dir, new_dentry, &dd, old_inode);
	if (err) {
		super->s_rename_dir = 0;
		super->s_rename_pos = 0;
		super->s_victim_ino = 0;
		abort_transaction(new_dir, ta);
		goto out;
	}

	/* 3. remove source dd - failure past this point is a fs bug */
	ta->state = TARGET_RENAME_2;
	logfs_add_transaction(old_dir, ta);
	err = logfs_delete_dd(old_dir, pos);
	if (!err)
		err = write_inode(old_dir);
	LOGFS_BUG_ON(err, old_dir->i_sb);

	/* 4. remove target inode */
	ta->state = TARGET_RENAME_3;
	logfs_add_transaction(new_inode, ta);
	err = logfs_remove_inode(new_inode);

out:
	mutex_unlock(&super->s_dirop_mutex);
	return err;
}
735
736static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry,
737 struct inode *new_dir, struct dentry *new_dentry)
738{
739 if (new_dentry->d_inode)
740 return logfs_rename_target(old_dir, old_dentry,
741 new_dir, new_dentry);
742 return logfs_rename_cross(old_dir, old_dentry, new_dir, new_dentry);
743}
744
/* No locking done here, as this is called before .get_sb() returns. */
/*
 * Finish any directory operation that was interrupted by a crash:
 * delete the journaled victim inode and/or the journaled stale dentry
 * (see the transaction scheme described at the top of this file).
 * Returns 0 or -EIO; any failure here is a filesystem bug.
 */
int logfs_replay_journal(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct inode *inode;
	u64 ino, pos;
	int err;

	if (super->s_victim_ino) {
		/* delete victim inode */
		ino = super->s_victim_ino;
		printk(KERN_INFO"LogFS: delete unmapped inode #%llx\n", ino);
		inode = logfs_iget(sb, ino);
		if (IS_ERR(inode))
			goto fail;

		LOGFS_BUG_ON(i_size_read(inode) > 0, sb);
		/* Clear first so logfs_remove_inode() starts clean; restore
		 * on failure so the state is not lost. */
		super->s_victim_ino = 0;
		err = logfs_remove_inode(inode);
		iput(inode);
		if (err) {
			super->s_victim_ino = ino;
			goto fail;
		}
	}
	if (super->s_rename_dir) {
		/* delete old dd from rename */
		ino = super->s_rename_dir;
		pos = super->s_rename_pos;
		printk(KERN_INFO"LogFS: delete unbacked dentry (%llx, %llx)\n",
				ino, pos);
		inode = logfs_iget(sb, ino);
		if (IS_ERR(inode))
			goto fail;

		super->s_rename_dir = 0;
		super->s_rename_pos = 0;
		err = logfs_delete_dd(inode, pos);
		iput(inode);
		if (err) {
			super->s_rename_dir = ino;
			super->s_rename_pos = pos;
			goto fail;
		}
	}
	return 0;
fail:
	LOGFS_BUG(sb);
	return -EIO;
}
795
/* Symlink targets are stored as file data, so the generic page-cache
 * follow_link helper can read them. */
const struct inode_operations logfs_symlink_iops = {
	.readlink = generic_readlink,
	.follow_link = page_follow_link_light,
};

/* Directory inode operations; all modifying ops journal their steps. */
const struct inode_operations logfs_dir_iops = {
	.create = logfs_create,
	.link = logfs_link,
	.lookup = logfs_lookup,
	.mkdir = logfs_mkdir,
	.mknod = logfs_mknod,
	.rename = logfs_rename,
	.rmdir = logfs_rmdir,
	.permission = logfs_permission,
	.symlink = logfs_symlink,
	.unlink = logfs_unlink,
};
/* Directory file operations (logfs_fsync/logfs_ioctl live in file.c). */
const struct file_operations logfs_dir_fops = {
	.fsync = logfs_fsync,
	.ioctl = logfs_ioctl,
	.readdir = logfs_readdir,
	.read = generic_read_dir,
};
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
new file mode 100644
index 000000000000..370f367a933e
--- /dev/null
+++ b/fs/logfs/file.c
@@ -0,0 +1,263 @@
1/*
2 * fs/logfs/file.c - prepare_write, commit_write and friends
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/sched.h>
10#include <linux/writeback.h>
11
/*
 * ->write_begin: lock the target page and ensure the parts the caller will
 * not overwrite hold valid data - zero-fill when the page lies beyond
 * i_size, otherwise read it in.
 */
static int logfs_write_begin(struct file *file, struct address_space *mapping,
		loff_t pos, unsigned len, unsigned flags,
		struct page **pagep, void **fsdata)
{
	struct inode *inode = mapping->host;
	struct page *page;
	pgoff_t index = pos >> PAGE_CACHE_SHIFT;

	page = grab_cache_page_write_begin(mapping, index, flags);
	if (!page)
		return -ENOMEM;
	*pagep = page;

	/* A full-page or already up-to-date page needs no pre-read. */
	if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
		return 0;
	if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
		unsigned start = pos & (PAGE_CACHE_SIZE - 1);
		unsigned end = start + len;

		/* Reading beyond i_size is simple: memset to zero */
		zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
		return 0;
	}
	return logfs_readpage_nolock(page);
}
37
/*
 * ->write_end: commit @copied bytes, extend i_size if needed, and either
 * mark the page dirty (when a write-back reservation is available) or
 * write it out synchronously.  Returns bytes committed or a write error.
 */
static int logfs_write_end(struct file *file, struct address_space *mapping,
		loff_t pos, unsigned len, unsigned copied, struct page *page,
		void *fsdata)
{
	struct inode *inode = mapping->host;
	pgoff_t index = page->index;
	unsigned start = pos & (PAGE_CACHE_SIZE - 1);
	unsigned end = start + copied;
	int ret = 0;

	BUG_ON(PAGE_CACHE_SIZE != inode->i_sb->s_blocksize);
	BUG_ON(page->index > I3_BLOCKS);

	if (copied < len) {
		/*
		 * Short write of a non-initialized paged. Just tell userspace
		 * to retry the entire page.
		 */
		if (!PageUptodate(page)) {
			copied = 0;
			goto out;
		}
	}
	if (copied == 0)
		goto out; /* FIXME: do we need to update inode? */

	if (i_size_read(inode) < (index << PAGE_CACHE_SHIFT) + end) {
		i_size_write(inode, (index << PAGE_CACHE_SHIFT) + end);
		mark_inode_dirty_sync(inode);
	}

	SetPageUptodate(page);
	if (!PageDirty(page)) {
		/* Dirty only if space for later write-back is reserved;
		 * otherwise write through immediately. */
		if (!get_page_reserve(inode, page))
			__set_page_dirty_nobuffers(page);
		else
			ret = logfs_write_buf(inode, page, WF_LOCK);
	}
out:
	unlock_page(page);
	page_cache_release(page);
	return ret ? ret : copied;
}
81
/* ->readpage: fill the page, then release the page lock. */
int logfs_readpage(struct file *file, struct page *page)
{
	int err = logfs_readpage_nolock(page);

	unlock_page(page);
	return err;
}
90
/* Clear the page's dirty flag in the radix tree. */
/* TODO: mucking with PageWriteback is silly. Add a generic function to clear
 * the dirty bit from the radix tree for filesystems that don't have to wait
 * for page writeback to finish (i.e. any compressing filesystem).
 */
static void clear_radix_tree_dirty(struct page *page)
{
	BUG_ON(PagePrivate(page) || page->private);
	/* Instantly begin and end writeback - the radix-tree dirty tag is
	 * cleared as a side effect; nothing ever waits on it. */
	set_page_writeback(page);
	end_page_writeback(page);
}

/* Write one page out; re-dirty it on failure so it is retried later.
 * Always unlocks the page. */
static int __logfs_writepage(struct page *page)
{
	struct inode *inode = page->mapping->host;
	int err;

	err = logfs_write_buf(inode, page, WF_LOCK);
	if (err)
		set_page_dirty(page);
	else
		clear_radix_tree_dirty(page);
	unlock_page(page);
	return err;
}
116
/*
 * ->writepage: handle the i_size boundary for data pages (indirect-block
 * pages are written unconditionally), then delegate to __logfs_writepage().
 */
static int logfs_writepage(struct page *page, struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	loff_t i_size = i_size_read(inode);
	pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
	unsigned offset;
	u64 bix;
	level_t level;

	log_file("logfs_writepage(%lx, %lx, %p)\n", inode->i_ino, page->index,
			page);

	logfs_unpack_index(page->index, &bix, &level);

	/* Indirect blocks are never truncated */
	if (level != 0)
		return __logfs_writepage(page);

	/*
	 * TODO: everything below is a near-verbatim copy of nobh_writepage().
	 * The relevant bits should be factored out after logfs is merged.
	 */

	/* Is the page fully inside i_size? */
	if (bix < end_index)
		return __logfs_writepage(page);

	/* Is the page fully outside i_size? (truncate in progress) */
	offset = i_size & (PAGE_CACHE_SIZE-1);
	if (bix > end_index || offset == 0) {
		unlock_page(page);
		return 0; /* don't care */
	}

	/*
	 * The page straddles i_size. It must be zeroed out on each and every
	 * writepage invocation because it may be mmapped. "A file is mapped
	 * in multiples of the page size. For a file that is not a multiple of
	 * the page size, the remaining memory is zeroed when mapped, and
	 * writes to that region are not written out to the file."
	 */
	zero_user_segment(page, offset, PAGE_CACHE_SIZE);
	return __logfs_writepage(page);
}
161
/* ->invalidatepage: hand the page's state over to the btree before the
 * page cache drops it (move_page_to_btree() presumably preserves pending
 * aliases - confirm against readwrite.c). */
static void logfs_invalidatepage(struct page *page, unsigned long offset)
{
	move_page_to_btree(page);
	BUG_ON(PagePrivate(page) || page->private);
}

/* ->releasepage: always refuse. */
static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this)
{
	return 0; /* None of these are easy to release */
}
172
173
/*
 * ->ioctl: FS_IOC_GETFLAGS/FS_IOC_SETFLAGS for the per-inode flag bits.
 * Get returns the user-visible subset; set replaces only the
 * user-modifiable subset, preserving internal flags.
 */
int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
		unsigned long arg)
{
	struct logfs_inode *li = logfs_inode(inode);
	unsigned int oldflags, flags;
	int err;

	switch (cmd) {
	case FS_IOC_GETFLAGS:
		flags = li->li_flags & LOGFS_FL_USER_VISIBLE;
		return put_user(flags, (int __user *)arg);
	case FS_IOC_SETFLAGS:
		if (IS_RDONLY(inode))
			return -EROFS;

		if (!is_owner_or_cap(inode))
			return -EACCES;

		err = get_user(flags, (int __user *)arg);
		if (err)
			return err;

		mutex_lock(&inode->i_mutex);
		oldflags = li->li_flags;
		flags &= LOGFS_FL_USER_MODIFIABLE;
		flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE;
		li->li_flags = flags;
		mutex_unlock(&inode->i_mutex);

		/* NOTE(review): ctime update happens after i_mutex is
		 * dropped - confirm this is intentional. */
		inode->i_ctime = CURRENT_TIME;
		mark_inode_dirty_sync(inode);
		return 0;

	default:
		return -ENOTTY;
	}
}
211
/* ->fsync: sync the underlying device; always reports success. */
int logfs_fsync(struct file *file, struct dentry *dentry, int datasync)
{
	struct super_block *sb = dentry->d_inode->i_sb;
	struct logfs_super *super = logfs_super(sb);

	/* FIXME: write anchor */
	super->s_devops->sync(sb);
	return 0;
}

/* ->setattr: handle size changes via logfs_truncate(), delegate the
 * remaining attributes to the generic helpers. */
static int logfs_setattr(struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = dentry->d_inode;
	int err = 0;

	if (attr->ia_valid & ATTR_SIZE)
		err = logfs_truncate(inode, attr->ia_size);
	/* Size is handled above; keep inode_setattr() away from it. */
	attr->ia_valid &= ~ATTR_SIZE;

	if (!err)
		err = inode_change_ok(inode, attr);
	if (!err)
		err = inode_setattr(inode, attr);
	return err;
}
237
/* Regular-file inode operations. */
const struct inode_operations logfs_reg_iops = {
	.setattr = logfs_setattr,
};

/* Regular-file file operations; note mmap is read-only. */
const struct file_operations logfs_reg_fops = {
	.aio_read = generic_file_aio_read,
	.aio_write = generic_file_aio_write,
	.fsync = logfs_fsync,
	.ioctl = logfs_ioctl,
	.llseek = generic_file_llseek,
	.mmap = generic_file_readonly_mmap,
	.open = generic_file_open,
	.read = do_sync_read,
	.write = do_sync_write,
};

/* Regular-file address-space operations. */
const struct address_space_operations logfs_reg_aops = {
	.invalidatepage = logfs_invalidatepage,
	.readpage = logfs_readpage,
	.releasepage = logfs_releasepage,
	.set_page_dirty = __set_page_dirty_nobuffers,
	.writepage = logfs_writepage,
	.writepages = generic_writepages,
	.write_begin = logfs_write_begin,
	.write_end = logfs_write_end,
};
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
new file mode 100644
index 000000000000..b3656c44190e
--- /dev/null
+++ b/fs/logfs/gc.c
@@ -0,0 +1,730 @@
1/*
2 * fs/logfs/gc.c - garbage collection code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/sched.h>
10
11/*
12 * Wear leveling needs to kick in when the difference between low erase
13 * counts and high erase counts gets too big. A good value for "too big"
14 * may be somewhat below 10% of maximum erase count for the device.
15 * Why not 397, to pick a nice round number with no specific meaning? :)
16 *
17 * WL_RATELIMIT is the minimum time between two wear level events. A huge
18 * number of segments may fulfil the requirements for wear leveling at the
19 * same time. If that happens we don't want to cause a latency from hell,
20 * but just gently pick one segment every so often and minimize overhead.
21 */
22#define WL_DELTA 397
23#define WL_RATELIMIT 100
24#define MAX_OBJ_ALIASES 2600
25#define SCAN_RATIO 512 /* number of scanned segments per gc'd segment */
26#define LIST_SIZE 64 /* base size of candidate lists */
27#define SCAN_ROUNDS 128 /* maximum number of complete medium scans */
28#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */
29
30static int no_free_segments(struct super_block *sb)
31{
32 struct logfs_super *super = logfs_super(sb);
33
34 return super->s_free_list.count;
35}
36
/* journal has distance -1, top-most ifile layer distance 0 */
/*
 * Map a segment's gc_level to its distance from the tree root.
 * Levels 0-3 are regular file data/indirect blocks, levels 6-9 are the
 * inode file's data/indirect blocks; anything else is corruption.
 */
static u8 root_distance(struct super_block *sb, gc_level_t __gc_level)
{
	struct logfs_super *super = logfs_super(sb);
	u8 gc_level = (__force u8)__gc_level;

	switch (gc_level) {
	case 0: /* fall through */
	case 1: /* fall through */
	case 2: /* fall through */
	case 3:
		/* file data or indirect blocks */
		return super->s_ifile_levels + super->s_iblock_levels - gc_level;
	case 6: /* fall through */
	case 7: /* fall through */
	case 8: /* fall through */
	case 9:
		/* inode file data or indirect blocks */
		return super->s_ifile_levels - (gc_level - 6);
	default:
		printk(KERN_ERR"LOGFS: segment of unknown level %x found\n",
				gc_level);
		WARN_ON(1);
		/* be conservative: report the maximum possible distance */
		return super->s_ifile_levels + super->s_iblock_levels;
	}
}
63
64static int segment_is_reserved(struct super_block *sb, u32 segno)
65{
66 struct logfs_super *super = logfs_super(sb);
67 struct logfs_area *area;
68 void *reserved;
69 int i;
70
71 /* Some segments are reserved. Just pretend they were all valid */
72 reserved = btree_lookup32(&super->s_reserved_segments, segno);
73 if (reserved)
74 return 1;
75
76 /* Currently open segments */
77 for_each_area(i) {
78 area = super->s_area[i];
79 if (area->a_is_open && area->a_segno == segno)
80 return 1;
81 }
82
83 return 0;
84}
85
/* Bad-segment handling is not implemented yet; hitting this is fatal. */
static void logfs_mark_segment_bad(struct super_block *sb, u32 segno)
{
	BUG();
}
90
/*
 * Returns the bytes consumed by valid objects in this segment. Object headers
 * are counted, the segment header is not.
 * Returns RESERVED for bad or reserved segments; *ec and *gc_level are
 * only filled in on a normal return.
 */
static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec,
		gc_level_t *gc_level)
{
	struct logfs_segment_entry se;
	u32 ec_level;

	logfs_get_segment_entry(sb, segno, &se);
	if (se.ec_level == cpu_to_be32(BADSEG) ||
			se.valid == cpu_to_be32(RESERVED))
		return RESERVED;

	/* ec_level packs the erase count (upper bits) and level (low 4) */
	ec_level = be32_to_cpu(se.ec_level);
	*ec = ec_level >> 4;
	*gc_level = GC_LEVEL(ec_level & 0xf);
	return be32_to_cpu(se.valid);
}
111
/*
 * Move one still-valid block out of the segment being cleaned by
 * rewriting it at a new location.
 */
static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
		u64 bix, gc_level_t gc_level)
{
	struct inode *inode;
	int err, cookie;

	/* safe_iget: GC may need inodes that are currently in I_FREEING */
	inode = logfs_safe_iget(sb, ino, &cookie);
	err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0);
	BUG_ON(err);
	logfs_safe_iput(inode, cookie);
}
123
/*
 * Garbage collect one segment: walk its object headers and rewrite every
 * still-valid object elsewhere.  Returns the number of bytes cleaned.
 * On a bad segment-header CRC the segment is marked bad instead.
 */
static u32 logfs_gc_segment(struct super_block *sb, u32 segno, u8 dist)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_segment_header sh;
	struct logfs_object_header oh;
	u64 ofs, ino, bix;
	u32 seg_ofs, logical_segno, cleaned = 0;
	int err, len, valid;
	gc_level_t gc_level;

	LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb);

	/* Reserve the segment so nothing writes to it while we clean it. */
	btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS);
	err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
	BUG_ON(err);
	gc_level = GC_LEVEL(sh.level);
	logical_segno = be32_to_cpu(sh.segno);
	if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) {
		logfs_mark_segment_bad(sb, segno);
		cleaned = -1;
		goto out;
	}

	/* Walk object headers until the segment end or an all-0xff
	 * (never written) header is reached. */
	for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE;
			seg_ofs + sizeof(oh) < super->s_segsize; ) {
		ofs = dev_ofs(sb, logical_segno, seg_ofs);
		err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh),
				&oh);
		BUG_ON(err);

		if (!memchr_inv(&oh, 0xff, sizeof(oh)))
			break;

		if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) {
			logfs_mark_segment_bad(sb, segno);
			cleaned = super->s_segsize - 1;
			goto out;
		}

		ino = be64_to_cpu(oh.ino);
		bix = be64_to_cpu(oh.bix);
		len = sizeof(oh) + be16_to_cpu(oh.len);
		valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level);
		if (valid == 1) {
			/* still referenced: move it out of this segment */
			logfs_cleanse_block(sb, ofs, ino, bix, gc_level);
			cleaned += len;
		} else if (valid == 2) {
			/* Will be invalid upon journal commit */
			cleaned += len;
		}
		seg_ofs += len;
	}
out:
	btree_remove32(&super->s_reserved_segments, segno);
	return cleaned;
}
180
/*
 * Insert @cand into @list's rb-tree, keeping the list size bounded.
 * Returns NULL if the candidate was kept, or the evicted worst
 * (rightmost) candidate - possibly @cand itself - when the list is full.
 */
static struct gc_candidate *add_list(struct gc_candidate *cand,
		struct candidate_list *list)
{
	struct rb_node **p = &list->rb_tree.rb_node;
	struct rb_node *parent = NULL;
	struct gc_candidate *cur;
	int comp;

	cand->list = list;
	while (*p) {
		parent = *p;
		cur = rb_entry(parent, struct gc_candidate, rb_node);

		/* sort key depends on the list: erase count or valid bytes */
		if (list->sort_by_ec)
			comp = cand->erase_count < cur->erase_count;
		else
			comp = cand->valid < cur->valid;

		if (comp)
			p = &parent->rb_left;
		else
			p = &parent->rb_right;
	}
	rb_link_node(&cand->rb_node, parent, p);
	rb_insert_color(&cand->rb_node, &list->rb_tree);

	if (list->count <= list->maxcount) {
		list->count++;
		return NULL;
	}
	/* list full: evict the rightmost (worst) candidate */
	cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node);
	rb_erase(&cand->rb_node, &list->rb_tree);
	cand->list = NULL;
	return cand;
}
216
217static void remove_from_list(struct gc_candidate *cand)
218{
219 struct candidate_list *list = cand->list;
220
221 rb_erase(&cand->rb_node, &list->rb_tree);
222 list->count--;
223}
224
225static void free_candidate(struct super_block *sb, struct gc_candidate *cand)
226{
227 struct logfs_super *super = logfs_super(sb);
228
229 btree_remove32(&super->s_cand_tree, cand->segno);
230 kfree(cand);
231}
232
/*
 * Pop the best (leftmost) candidate off @list and return its segment
 * number; if @ec is non-NULL, store the erase count there as well.
 * The candidate structure is freed.  @list must not be empty.
 */
u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec)
{
	struct gc_candidate *cand;
	u32 segno;

	BUG_ON(list->count == 0);

	cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
	remove_from_list(cand);
	segno = cand->segno;
	if (ec)
		*ec = cand->erase_count;
	free_candidate(sb, cand);
	return segno;
}
248
/*
 * We have several lists to manage segments with. The reserve_list is used to
 * deal with bad blocks. We try to keep the best (lowest ec) segments on this
 * list.
 * The free_list contains free segments for normal usage. It usually gets the
 * second pick after the reserve_list. But when the free_list is running short
 * it is more important to keep the free_list full than to keep a reserve.
 *
 * Segments that are not free are put onto a per-level low_list. If we have
 * to run garbage collection, we pick a candidate from there. All segments on
 * those lists should have at least some free space so GC will make progress.
 *
 * And last we have the ec_list, which is used to pick segments for wear
 * leveling.
 *
 * If all appropriate lists are full, we simply free the candidate and forget
 * about that segment for a while. We have better candidates for each purpose.
 */
static void __add_candidate(struct super_block *sb, struct gc_candidate *cand)
{
	struct logfs_super *super = logfs_super(sb);
	u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE;

	if (cand->valid == 0) {
		/* 100% free segments */
		log_gc_noisy("add reserve segment %x (ec %x) at %llx\n",
				cand->segno, cand->erase_count,
				dev_ofs(sb, cand->segno, 0));
		cand = add_list(cand, &super->s_reserve_list);
		/* add_list returns the evicted candidate, which cascades
		 * down to the next list */
		if (cand) {
			log_gc_noisy("add free segment %x (ec %x) at %llx\n",
					cand->segno, cand->erase_count,
					dev_ofs(sb, cand->segno, 0));
			cand = add_list(cand, &super->s_free_list);
		}
	} else {
		/* good candidates for Garbage Collection */
		if (cand->valid < full)
			cand = add_list(cand, &super->s_low_list[cand->dist]);
		/* good candidates for wear leveling,
		 * segments that were recently written get ignored */
		if (cand)
			cand = add_list(cand, &super->s_ec_list);
	}
	if (cand)
		free_candidate(sb, cand);
}
296
297static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec,
298 u8 dist)
299{
300 struct logfs_super *super = logfs_super(sb);
301 struct gc_candidate *cand;
302
303 cand = kmalloc(sizeof(*cand), GFP_NOFS);
304 if (!cand)
305 return -ENOMEM;
306
307 cand->segno = segno;
308 cand->valid = valid;
309 cand->erase_count = ec;
310 cand->dist = dist;
311
312 btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS);
313 __add_candidate(sb, cand);
314 return 0;
315}
316
317static void remove_segment_from_lists(struct super_block *sb, u32 segno)
318{
319 struct logfs_super *super = logfs_super(sb);
320 struct gc_candidate *cand;
321
322 cand = btree_lookup32(&super->s_cand_tree, segno);
323 if (cand) {
324 remove_from_list(cand);
325 free_candidate(sb, cand);
326 }
327}
328
/*
 * (Re-)evaluate one segment and file it on the appropriate candidate
 * lists.  Reserved segments are skipped entirely.
 */
static void scan_segment(struct super_block *sb, u32 segno)
{
	u32 valid, ec = 0;
	gc_level_t gc_level = 0;
	u8 dist;

	if (segment_is_reserved(sb, segno))
		return;

	/* drop stale state first; the segment may have changed since */
	remove_segment_from_lists(sb, segno);
	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
	if (valid == RESERVED)
		return;

	dist = root_distance(sb, gc_level);
	add_candidate(sb, segno, valid, ec, dist);
}
346
347static struct gc_candidate *first_in_list(struct candidate_list *list)
348{
349 if (list->count == 0)
350 return NULL;
351 return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
352}
353
354/*
355 * Find the best segment for garbage collection. Main criterion is
356 * the segment requiring the least effort to clean. Secondary
357 * criterion is to GC on the lowest level available.
358 *
359 * So we search the least effort segment on the lowest level first,
 * then move up and pick another segment iff it requires significantly
361 * less effort. Hence the LOGFS_MAX_OBJECTSIZE in the comparison.
362 */
363static struct gc_candidate *get_candidate(struct super_block *sb)
364{
365 struct logfs_super *super = logfs_super(sb);
366 int i, max_dist;
367 struct gc_candidate *cand = NULL, *this;
368
369 max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS);
370
371 for (i = max_dist; i >= 0; i--) {
372 this = first_in_list(&super->s_low_list[i]);
373 if (!this)
374 continue;
375 if (!cand)
376 cand = this;
377 if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid)
378 cand = this;
379 }
380 return cand;
381}
382
/*
 * Garbage collect a single candidate segment.  Returns 1 if a segment
 * was cleaned, 0 if no candidate was supplied.
 */
static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
{
	struct logfs_super *super = logfs_super(sb);
	gc_level_t gc_level;
	u32 cleaned, valid, segno, ec;
	u8 dist;

	if (!cand) {
		log_gc("GC attempted, but no candidate found\n");
		return 0;
	}

	segno = cand->segno;
	dist = cand->dist;
	/* re-read current segment state; it may have changed since the
	 * candidate was filed */
	valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
	free_candidate(sb, cand);
	log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n",
			segno, (u64)segno << super->s_segshift,
			dist, no_free_segments(sb), valid,
			super->s_free_bytes);
	cleaned = logfs_gc_segment(sb, segno, dist);
	log_gc("GC segment #%02x complete - now %x valid\n", segno,
			valid - cleaned);
	/* GC must reclaim exactly the bytes that were valid */
	BUG_ON(cleaned != valid);
	return 1;
}
409
/* Pick the best GC candidate and collect it; returns 1 on progress. */
static int logfs_gc_once(struct super_block *sb)
{
	struct gc_candidate *cand = get_candidate(sb);

	/* detach the winner before handing it to the worker;
	 * __logfs_gc_once copes with a NULL candidate */
	if (cand)
		remove_from_list(cand);
	return __logfs_gc_once(sb, cand);
}
419
/* returns 1 if a wrap occurs, 0 otherwise */
/*
 * Advance the round-robin sweeper over up to SCAN_RATIO segments,
 * re-evaluating each as a GC candidate.
 */
static int logfs_scan_some(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	u32 segno;
	int i, ret = 0;

	/* resume where the previous call stopped */
	segno = super->s_sweeper;
	for (i = SCAN_RATIO; i > 0; i--) {
		segno++;
		if (segno >= super->s_no_segs) {
			segno = 0;
			ret = 1;
			/* Break out of the loop.  We want to read a single
			 * block from the segment size on next invocation if
			 * SCAN_RATIO is set to match block size
			 */
			break;
		}

		scan_segment(sb, segno);
	}
	super->s_sweeper = segno;
	return ret;
}
445
/*
 * In principle, this function should loop forever, looking for GC candidates
 * and moving data.  LogFS is designed in such a way that this loop is
 * guaranteed to terminate.
 *
 * Limiting the loop to some iterations serves purely to catch cases when
 * these guarantees have failed.  An actual endless loop is an obvious bug
 * and should be reported as such.
 */
static void __logfs_gc_pass(struct super_block *sb, int target)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_block *block;
	int round, progress, last_progress = 0;

	/* fast path: nothing to do */
	if (no_free_segments(sb) >= target &&
			super->s_no_object_aliases < MAX_OBJ_ALIASES)
		return;

	log_gc("__logfs_gc_pass(%x)\n", target);
	for (round = 0; round < SCAN_ROUNDS; ) {
		if (no_free_segments(sb) >= target)
			goto write_alias;

		/* Sync in-memory state with on-medium state in case they
		 * diverged */
		logfs_write_anchor(super->s_master_inode);
		round += logfs_scan_some(sb);
		if (no_free_segments(sb) >= target)
			goto write_alias;
		progress = logfs_gc_once(sb);
		if (progress)
			last_progress = round;
		else if (round - last_progress > 2)
			/* no progress for several rounds: give up and BUG */
			break;
		continue;

		/*
		 * The goto logic is nasty, I just don't know a better way to
		 * code it.  GC is supposed to ensure two things:
		 * 1. Enough free segments are available.
		 * 2. The number of aliases is bounded.
		 * When 1. is achieved, we take a look at 2. and write back
		 * some alias-containing blocks, if necessary.  However, after
		 * each such write we need to go back to 1., as writes can
		 * consume free segments.
		 */
write_alias:
		if (super->s_no_object_aliases < MAX_OBJ_ALIASES)
			return;
		if (list_empty(&super->s_object_alias)) {
			/* All aliases are still in btree */
			return;
		}
		log_gc("Write back one alias\n");
		block = list_entry(super->s_object_alias.next,
				struct logfs_block, alias_list);
		block->ops->write_block(block);
		/*
		 * To round off the nasty goto logic, we reset round here.  It
		 * is a safety-net for GC not making any progress and limited
		 * to something reasonably small.  If we incremented it for
		 * every single alias, the loop could terminate rather quickly.
		 */
		round = 0;
	}
	LOGFS_BUG(sb);
}
514
515static int wl_ratelimit(struct super_block *sb, u64 *next_event)
516{
517 struct logfs_super *super = logfs_super(sb);
518
519 if (*next_event < super->s_gec) {
520 *next_event = super->s_gec + WL_RATELIMIT;
521 return 0;
522 }
523 return 1;
524}
525
/*
 * Ostore wear leveling: at most every WL_RATELIMIT GC events, collect
 * the least-erased segment if its erase count lags the best free
 * segment's by more than WL_DELTA.
 */
static void logfs_wl_pass(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *wl_cand, *free_cand;

	if (wl_ratelimit(sb, &super->s_wl_gec_ostore))
		return;

	wl_cand = first_in_list(&super->s_ec_list);
	if (!wl_cand)
		return;
	free_cand = first_in_list(&super->s_free_list);
	if (!free_cand)
		return;

	/* only move data if the wear gap is worth it */
	if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) {
		remove_from_list(wl_cand);
		__logfs_gc_once(sb, wl_cand);
	}
}
546
/*
 * The journal needs wear leveling as well.  But moving the journal is an
 * expensive operation so we try to avoid it as much as possible.  And if we
 * have to do it, we move the whole journal, not individual segments.
 *
 * Ratelimiting is not strictly necessary here, it mainly serves to avoid the
 * calculations.  First we check whether moving the journal would be a
 * significant improvement.  That means that a) the current journal segments
 * have more wear than the future journal segments and b) the current journal
 * segments have more wear than normal ostore segments.
 * Rationale for b) is that we don't have to move the journal if it is aging
 * less than the ostore, even if the reserve segments age even less (they are
 * excluded from wear leveling, after all).
 * Next we check that the superblocks have less wear than the journal.  Since
 * moving the journal requires writing the superblocks, we have to protect the
 * superblocks even more than the journal.
 *
 * Also we double the acceptable wear difference, compared to ostore wear
 * leveling.  Journal data is read and rewritten rapidly, comparatively.  So
 * soft errors have much less time to accumulate and we allow the journal to
 * be a bit worse than the ostore.
 */
static void logfs_journal_wl_pass(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct gc_candidate *cand;
	u32 min_journal_ec = -1, max_reserve_ec = 0;	/* -1 == UINT_MAX */
	int i;

	if (wl_ratelimit(sb, &super->s_wl_gec_journal))
		return;

	if (super->s_reserve_list.count < super->s_no_journal_segs) {
		/* Reserve is not full enough to move complete journal */
		return;
	}

	journal_for_each(i)
		if (super->s_journal_seg[i])
			min_journal_ec = min(min_journal_ec,
					super->s_journal_ec[i]);
	/* best free segment's erase count as the baseline */
	cand = rb_entry(rb_first(&super->s_free_list.rb_tree),
			struct gc_candidate, rb_node);
	max_reserve_ec = cand->erase_count;
	/* factor in the two superblock segments as well */
	for (i = 0; i < 2; i++) {
		struct logfs_segment_entry se;
		u32 segno = seg_no(sb, super->s_sb_ofs[i]);
		u32 ec;

		logfs_get_segment_entry(sb, segno, &se);
		ec = be32_to_cpu(se.ec_level) >> 4;
		max_reserve_ec = max(max_reserve_ec, ec);
	}

	if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) {
		do_logfs_journal_wl_pass(sb);
	}
}
605
/*
 * Main GC entry point: ensure enough free segments for all levels, then
 * run ostore and journal wear leveling.
 */
void logfs_gc_pass(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);

	//BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex));
	/* Write journal before free space is getting saturated with dirty
	 * objects.
	 */
	if (super->s_dirty_used_bytes + super->s_dirty_free_bytes
			+ LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes)
		logfs_write_anchor(super->s_master_inode);
	__logfs_gc_pass(sb, logfs_super(sb)->s_total_levels);
	logfs_wl_pass(sb);
	logfs_journal_wl_pass(sb);
}
621
622static int check_area(struct super_block *sb, int i)
623{
624 struct logfs_super *super = logfs_super(sb);
625 struct logfs_area *area = super->s_area[i];
626 struct logfs_object_header oh;
627 u32 segno = area->a_segno;
628 u32 ofs = area->a_used_bytes;
629 __be32 crc;
630 int err;
631
632 if (!area->a_is_open)
633 return 0;
634
635 for (ofs = area->a_used_bytes;
636 ofs <= super->s_segsize - sizeof(oh);
637 ofs += (u32)be16_to_cpu(oh.len) + sizeof(oh)) {
638 err = wbuf_read(sb, dev_ofs(sb, segno, ofs), sizeof(oh), &oh);
639 if (err)
640 return err;
641
642 if (!memchr_inv(&oh, 0xff, sizeof(oh)))
643 break;
644
645 crc = logfs_crc32(&oh, sizeof(oh) - 4, 4);
646 if (crc != oh.crc) {
647 printk(KERN_INFO "interrupted header at %llx\n",
648 dev_ofs(sb, segno, ofs));
649 return 0;
650 }
651 }
652 if (ofs != area->a_used_bytes) {
653 printk(KERN_INFO "%x bytes unaccounted data found at %llx\n",
654 ofs - area->a_used_bytes,
655 dev_ofs(sb, segno, area->a_used_bytes));
656 area->a_used_bytes = ofs;
657 }
658 return 0;
659}
660
661int logfs_check_areas(struct super_block *sb)
662{
663 int i, err;
664
665 for_each_area(i) {
666 err = check_area(sb, i);
667 if (err)
668 return err;
669 }
670 return 0;
671}
672
673static void logfs_init_candlist(struct candidate_list *list, int maxcount,
674 int sort_by_ec)
675{
676 list->count = 0;
677 list->maxcount = maxcount;
678 list->sort_by_ec = sort_by_ec;
679 list->rb_tree = RB_ROOT;
680}
681
/* Set up the candidate btree and all candidate lists at mount time. */
int logfs_init_gc(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i;

	btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool);
	/* free list is oversized to absorb a whole scan batch */
	logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1);
	logfs_init_candlist(&super->s_reserve_list,
			super->s_bad_seg_reserve, 1);
	for_each_area(i)
		logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0);
	logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1);
	return 0;
}
696
/* Free every candidate on @list; afterwards the rb-tree must be empty. */
static void logfs_cleanup_list(struct super_block *sb,
		struct candidate_list *list)
{
	struct gc_candidate *cand;

	while (list->count) {
		/* any node will do; take the root */
		cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate,
				rb_node);
		remove_from_list(cand);
		free_candidate(sb, cand);
	}
	BUG_ON(list->rb_tree.rb_node);
}
710
/* Tear down all GC state at unmount time. */
void logfs_cleanup_gc(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i;

	/* GC was never initialized; nothing to clean up */
	if (!super->s_free_list.count)
		return;

	/*
	 * FIXME: The btree may still contain a single empty node.  So we
	 * call the grim visitor to clean up that mess.  Btree code should
	 * do it for us, really.
	 */
	btree_grim_visitor32(&super->s_cand_tree, 0, NULL);
	logfs_cleanup_list(sb, &super->s_free_list);
	logfs_cleanup_list(sb, &super->s_reserve_list);
	for_each_area(i)
		logfs_cleanup_list(sb, &super->s_low_list[i]);
	logfs_cleanup_list(sb, &super->s_ec_list);
}
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
new file mode 100644
index 000000000000..6d08b3762641
--- /dev/null
+++ b/fs/logfs/inode.c
@@ -0,0 +1,417 @@
1/*
2 * fs/logfs/inode.c - inode handling code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/writeback.h>
10#include <linux/backing-dev.h>
11
12/*
13 * How soon to reuse old inode numbers? LogFS doesn't store deleted inodes
14 * on the medium. It therefore also lacks a method to store the previous
15 * generation number for deleted inodes. Instead a single generation number
16 * is stored which will be used for new inodes. Being just a 32bit counter,
 * this can obviously wrap relatively quickly. So we only reuse inodes if we
18 * know that a fair number of inodes can be created before we have to increment
19 * the generation again - effectively adding some bits to the counter.
20 * But being too aggressive here means we keep a very large and very sparse
21 * inode file, wasting space on indirect blocks.
22 * So what is a good value? Beats me. 64k seems moderately bad on both
23 * fronts, so let's use that for now...
24 *
25 * NFS sucks, as everyone already knows.
26 */
27#define INOS_PER_WRAP (0x10000)
28
29/*
30 * Logfs' requirement to read inodes for garbage collection makes life a bit
31 * harder. GC may have to read inodes that are in I_FREEING state, when they
32 * are being written out - and waiting for GC to make progress, naturally.
33 *
34 * So we cannot just call iget() or some variant of it, but first have to check
 * whether the inode in question might be in I_FREEING state. Therefore we
36 * maintain our own per-sb list of "almost deleted" inodes and check against
37 * that list first. Normally this should be at most 1-2 entries long.
38 *
39 * Also, inodes have logfs-specific reference counting on top of what the vfs
40 * does. When .destroy_inode is called, normally the reference count will drop
41 * to zero and the inode gets deleted. But if GC accessed the inode, its
42 * refcount will remain nonzero and final deletion will have to wait.
43 *
44 * As a result we have two sets of functions to get/put inodes:
45 * logfs_safe_iget/logfs_safe_iput - safe to call from GC context
46 * logfs_iget/iput - normal version
47 */
48static struct kmem_cache *logfs_inode_cache;
49
50static DEFINE_SPINLOCK(logfs_inode_lock);
51
/* Wire up inode/file/address-space operations based on the file type. */
static void logfs_inode_setops(struct inode *inode)
{
	switch (inode->i_mode & S_IFMT) {
	case S_IFDIR:
		inode->i_op = &logfs_dir_iops;
		inode->i_fop = &logfs_dir_fops;
		inode->i_mapping->a_ops = &logfs_reg_aops;
		break;
	case S_IFREG:
		inode->i_op = &logfs_reg_iops;
		inode->i_fop = &logfs_reg_fops;
		inode->i_mapping->a_ops = &logfs_reg_aops;
		break;
	case S_IFLNK:
		inode->i_op = &logfs_symlink_iops;
		inode->i_mapping->a_ops = &logfs_reg_aops;
		break;
	case S_IFSOCK:	/* fall through */
	case S_IFBLK:	/* fall through */
	case S_IFCHR:	/* fall through */
	case S_IFIFO:
		/* device/fifo/socket: generic special-inode handling */
		init_special_inode(inode, inode->i_mode, inode->i_rdev);
		break;
	default:
		BUG();
	}
}
79
/*
 * Look up an inode by number, reading it from the medium on a cache
 * miss.  Returns ERR_PTR on failure; a failed read is marked zombie and
 * kept out of the inode cache via iget_failed().
 */
static struct inode *__logfs_iget(struct super_block *sb, ino_t ino)
{
	struct inode *inode = iget_locked(sb, ino);
	int err;

	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (!(inode->i_state & I_NEW))
		return inode;

	err = logfs_read_inode(inode);
	if (err || inode->i_nlink == 0) {
		/* inode->i_nlink == 0 can be true when called from
		 * block validator */
		/* set i_nlink to 0 to prevent caching */
		inode->i_nlink = 0;
		logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE;
		iget_failed(inode);
		if (!err)
			err = -ENOENT;
		return ERR_PTR(err);
	}

	logfs_inode_setops(inode);
	unlock_new_inode(inode);
	return inode;
}
107
108struct inode *logfs_iget(struct super_block *sb, ino_t ino)
109{
110 BUG_ON(ino == LOGFS_INO_MASTER);
111 BUG_ON(ino == LOGFS_INO_SEGFILE);
112 return __logfs_iget(sb, ino);
113}
114
/*
 * is_cached is set to 1 if we hand out a cached inode, 0 otherwise.
 * this allows logfs_iput to do the right thing later
 */
/*
 * GC-safe inode lookup: meta inodes are returned directly, inodes on
 * the per-sb freeing list (being torn down by the VFS) get an extra
 * logfs refcount, everything else goes through the normal lookup.
 */
struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_inode *li;

	if (ino == LOGFS_INO_MASTER)
		return super->s_master_inode;
	if (ino == LOGFS_INO_SEGFILE)
		return super->s_segfile_inode;

	spin_lock(&logfs_inode_lock);
	list_for_each_entry(li, &super->s_freeing_list, li_freeing_list)
		if (li->vfs_inode.i_ino == ino) {
			/* pin with the logfs refcount, not an iget */
			li->li_refcount++;
			spin_unlock(&logfs_inode_lock);
			*is_cached = 1;
			return &li->vfs_inode;
		}
	spin_unlock(&logfs_inode_lock);

	*is_cached = 0;
	return __logfs_iget(sb, ino);
}
142
/* Final teardown once the last logfs reference is gone; the caller
 * holds logfs_inode_lock. */
static void __logfs_destroy_inode(struct inode *inode)
{
	struct logfs_inode *li = logfs_inode(inode);

	/* all blocks must have been released by now */
	BUG_ON(li->li_block);
	list_del(&li->li_freeing_list);
	kmem_cache_free(logfs_inode_cache, li);
}
151
/*
 * ->destroy_inode: drop one logfs reference; the inode is only freed
 * once GC has dropped its references as well (li_refcount reaches 0).
 */
static void logfs_destroy_inode(struct inode *inode)
{
	struct logfs_inode *li = logfs_inode(inode);

	/* inode must be on s_freeing_list, see logfs_drop_inode() */
	BUG_ON(list_empty(&li->li_freeing_list));
	spin_lock(&logfs_inode_lock);
	li->li_refcount--;
	if (li->li_refcount == 0)
		__logfs_destroy_inode(inode);
	spin_unlock(&logfs_inode_lock);
}
163
164void logfs_safe_iput(struct inode *inode, int is_cached)
165{
166 if (inode->i_ino == LOGFS_INO_MASTER)
167 return;
168 if (inode->i_ino == LOGFS_INO_SEGFILE)
169 return;
170
171 if (is_cached) {
172 logfs_destroy_inode(inode);
173 return;
174 }
175
176 iput(inode);
177}
178
179static void logfs_init_inode(struct super_block *sb, struct inode *inode)
180{
181 struct logfs_inode *li = logfs_inode(inode);
182 int i;
183
184 li->li_flags = 0;
185 li->li_height = 0;
186 li->li_used_bytes = 0;
187 li->li_block = NULL;
188 inode->i_uid = 0;
189 inode->i_gid = 0;
190 inode->i_size = 0;
191 inode->i_blocks = 0;
192 inode->i_ctime = CURRENT_TIME;
193 inode->i_mtime = CURRENT_TIME;
194 inode->i_nlink = 1;
195 INIT_LIST_HEAD(&li->li_freeing_list);
196
197 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
198 li->li_data[i] = 0;
199
200 return;
201}
202
203static struct inode *logfs_alloc_inode(struct super_block *sb)
204{
205 struct logfs_inode *li;
206
207 li = kmem_cache_alloc(logfs_inode_cache, GFP_NOFS);
208 if (!li)
209 return NULL;
210 logfs_init_inode(sb, &li->vfs_inode);
211 return &li->vfs_inode;
212}
213
214/*
215 * In logfs inodes are written to an inode file. The inode file, like any
216 * other file, is managed with a inode. The inode file's inode, aka master
217 * inode, requires special handling in several respects. First, it cannot be
218 * written to the inode file, so it is stored in the journal instead.
219 *
220 * Secondly, this inode cannot be written back and destroyed before all other
221 * inodes have been written. The ordering is important. Linux' VFS is happily
222 * unaware of the ordering constraint and would ordinarily destroy the master
223 * inode at umount time while other inodes are still in use and dirty. Not
224 * good.
225 *
226 * So logfs makes sure the master inode is not written until all other inodes
227 * have been destroyed. Sadly, this method has another side-effect. The VFS
228 * will notice one remaining inode and print a frightening warning message.
229 * Worse, it is impossible to judge whether such a warning was caused by the
230 * master inode or any other inodes have leaked as well.
231 *
232 * Our attempt of solving this is with logfs_new_meta_inode() below. Its
233 * purpose is to create a new inode that will not trigger the warning if such
 * an inode is still in use. An ugly hack, no doubt. Suggestions for
235 * improvement are welcome.
236 */
/*
 * Allocate a meta inode (master or segfile inode) that bypasses the VFS
 * inode hash and lifetime rules; see the long comment above.
 */
struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino)
{
	struct inode *inode;

	inode = logfs_alloc_inode(sb);
	if (!inode)
		return ERR_PTR(-ENOMEM);

	inode->i_mode = S_IFREG;
	inode->i_ino = ino;
	inode->i_sb = sb;

	/* This is a blatant copy of alloc_inode code.  We'd need alloc_inode
	 * to be nonstatic, alas. */
	{
		struct address_space * const mapping = &inode->i_data;

		mapping->a_ops = &logfs_reg_aops;
		mapping->host = inode;
		mapping->flags = 0;
		mapping_set_gfp_mask(mapping, GFP_NOFS);
		mapping->assoc_mapping = NULL;
		mapping->backing_dev_info = &default_backing_dev_info;
		inode->i_mapping = mapping;
		inode->i_nlink = 1;
	}

	return inode;
}
266
267struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino)
268{
269 struct inode *inode;
270 int err;
271
272 inode = logfs_new_meta_inode(sb, ino);
273 if (IS_ERR(inode))
274 return inode;
275
276 err = logfs_read_inode(inode);
277 if (err) {
278 destroy_meta_inode(inode);
279 return ERR_PTR(err);
280 }
281 logfs_inode_setops(inode);
282 return inode;
283}
284
/* ->write_inode: push a dirty inode to the inode file. */
static int logfs_write_inode(struct inode *inode, int do_sync)
{
	int ret;
	long flags = WF_LOCK;

	/* Can only happen if creat() failed.  Safe to skip. */
	if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN)
		return 0;

	ret = __logfs_write_inode(inode, flags);
	/* a failed inode write is unrecoverable on-medium corruption */
	LOGFS_BUG_ON(ret, inode->i_sb);
	return ret;
}
298
299void destroy_meta_inode(struct inode *inode)
300{
301 if (inode) {
302 if (inode->i_data.nrpages)
303 truncate_inode_pages(&inode->i_data, 0);
304 logfs_clear_inode(inode);
305 kmem_cache_free(logfs_inode_cache, logfs_inode(inode));
306 }
307}
308
/* called with inode_lock held */
/* Track the inode on s_freeing_list so GC's logfs_safe_iget() can still
 * find it while the VFS tears it down. */
static void logfs_drop_inode(struct inode *inode)
{
	struct logfs_super *super = logfs_super(inode->i_sb);
	struct logfs_inode *li = logfs_inode(inode);

	spin_lock(&logfs_inode_lock);
	list_move(&li->li_freeing_list, &super->s_freeing_list);
	spin_unlock(&logfs_inode_lock);
	generic_drop_inode(inode);
}
320
/*
 * Assign a fresh inode number and generation.  Numbers are reused; the
 * generation is bumped every INOS_PER_WRAP allocations (see comment at
 * the top of the file).
 */
static void logfs_set_ino_generation(struct super_block *sb,
		struct inode *inode)
{
	struct logfs_super *super = logfs_super(sb);
	u64 ino;

	mutex_lock(&super->s_journal_mutex);
	/* first hole in the inode file past the last handed-out number */
	ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino);
	super->s_last_ino = ino;
	super->s_inos_till_wrap--;
	if (super->s_inos_till_wrap < 0) {
		/* restart the search and bump the generation */
		super->s_last_ino = LOGFS_RESERVED_INOS;
		super->s_generation++;
		super->s_inos_till_wrap = INOS_PER_WRAP;
	}
	inode->i_ino = ino;
	inode->i_generation = super->s_generation;
	mutex_unlock(&super->s_journal_mutex);
}
340
/*
 * Create a new inode in directory @dir with mode @mode.  Returns the
 * hashed, initialized inode or ERR_PTR(-ENOMEM).
 */
struct inode *logfs_new_inode(struct inode *dir, int mode)
{
	struct super_block *sb = dir->i_sb;
	struct inode *inode;

	inode = new_inode(sb);
	if (!inode)
		return ERR_PTR(-ENOMEM);

	logfs_init_inode(sb, inode);

	/* inherit parent flags */
	logfs_inode(inode)->li_flags |=
		logfs_inode(dir)->li_flags & LOGFS_FL_INHERITED;

	inode->i_mode = mode;
	logfs_set_ino_generation(sb, inode);

	inode->i_uid = current_fsuid();
	inode->i_gid = current_fsgid();
	/* SysV setgid-directory semantics */
	if (dir->i_mode & S_ISGID) {
		inode->i_gid = dir->i_gid;
		if (S_ISDIR(mode))
			inode->i_mode |= S_ISGID;
	}

	logfs_inode_setops(inode);
	insert_inode_hash(inode);

	return inode;
}
372
373static void logfs_init_once(void *_li)
374{
375 struct logfs_inode *li = _li;
376 int i;
377
378 li->li_flags = 0;
379 li->li_used_bytes = 0;
380 li->li_refcount = 1;
381 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
382 li->li_data[i] = 0;
383 inode_init_once(&li->vfs_inode);
384}
385
386static int logfs_sync_fs(struct super_block *sb, int wait)
387{
388 /* FIXME: write anchor */
389 logfs_super(sb)->s_devops->sync(sb);
390 return 0;
391}
392
/* Superblock operations; note the custom drop/destroy pair that defers
 * final inode teardown while GC holds references. */
const struct super_operations logfs_super_operations = {
	.alloc_inode	= logfs_alloc_inode,
	.clear_inode	= logfs_clear_inode,
	.delete_inode	= logfs_delete_inode,
	.destroy_inode	= logfs_destroy_inode,
	.drop_inode	= logfs_drop_inode,
	.write_inode	= logfs_write_inode,
	.statfs		= logfs_statfs,
	.sync_fs	= logfs_sync_fs,
};
403
404int logfs_init_inode_cache(void)
405{
406 logfs_inode_cache = kmem_cache_create("logfs_inode_cache",
407 sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT,
408 logfs_init_once);
409 if (!logfs_inode_cache)
410 return -ENOMEM;
411 return 0;
412}
413
/* Destroy the inode slab cache at module unload. */
void logfs_destroy_inode_cache(void)
{
	kmem_cache_destroy(logfs_inode_cache);
}
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
new file mode 100644
index 000000000000..7a023dbba9f8
--- /dev/null
+++ b/fs/logfs/journal.c
@@ -0,0 +1,879 @@
1/*
2 * fs/logfs/journal.c - journal handling code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9
/*
 * Recompute super->s_free_bytes from first principles: start with all
 * segments, subtract those reserved for superblocks, the journal, the
 * open/GC areas and the speed/bad-block reserve, then subtract the
 * bytes already in use.  Also refreshes s_no_journal_segs as a side
 * effect of walking the journal segment list.
 */
static void logfs_calc_free(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	u64 reserve, no_segs = super->s_no_segs;
	s64 free;
	int i;

	/* superblock segments */
	no_segs -= 2;
	super->s_no_journal_segs = 0;
	/* journal */
	journal_for_each(i)
		if (super->s_journal_seg[i]) {
			no_segs--;
			super->s_no_journal_segs++;
		}

	/* open segments plus one extra per level for GC */
	no_segs -= 2 * super->s_total_levels;

	free = no_segs * (super->s_segsize - LOGFS_SEGMENT_RESERVE);
	free -= super->s_used_bytes;
	/* just a bit extra */
	free -= super->s_total_levels * 4096;

	/* Bad blocks are 'paid' for with speed reserve - the filesystem
	 * simply gets slower as bad blocks accumulate.  Until the bad blocks
	 * exceed the speed reserve - then the filesystem gets smaller.
	 */
	reserve = super->s_bad_segments + super->s_bad_seg_reserve;
	reserve *= super->s_segsize - LOGFS_SEGMENT_RESERVE;
	reserve = max(reserve, super->s_speed_reserve);
	free -= reserve;
	/* The reserves may exceed what is actually left; clamp at zero. */
	if (free < 0)
		free = 0;

	super->s_free_bytes = free;
}
48
49static void reserve_sb_and_journal(struct super_block *sb)
50{
51 struct logfs_super *super = logfs_super(sb);
52 struct btree_head32 *head = &super->s_reserved_segments;
53 int i, err;
54
55 err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[0]), (void *)1,
56 GFP_KERNEL);
57 BUG_ON(err);
58
59 err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[1]), (void *)1,
60 GFP_KERNEL);
61 BUG_ON(err);
62
63 journal_for_each(i) {
64 if (!super->s_journal_seg[i])
65 continue;
66 err = btree_insert32(head, super->s_journal_seg[i], (void *)1,
67 GFP_KERNEL);
68 BUG_ON(err);
69 }
70}
71
/*
 * Replay a JE_DYNSB journal entry: restore the dynamic superblock
 * fields (global erase count, sweeper position, crash-recovery inode
 * bookkeeping, usage counters) from their on-disk big-endian form.
 */
static void read_dynsb(struct super_block *sb,
		struct logfs_je_dynsb *dynsb)
{
	struct logfs_super *super = logfs_super(sb);

	super->s_gec = be64_to_cpu(dynsb->ds_gec);
	super->s_sweeper = be64_to_cpu(dynsb->ds_sweeper);
	super->s_victim_ino = be64_to_cpu(dynsb->ds_victim_ino);
	super->s_rename_dir = be64_to_cpu(dynsb->ds_rename_dir);
	super->s_rename_pos = be64_to_cpu(dynsb->ds_rename_pos);
	super->s_used_bytes = be64_to_cpu(dynsb->ds_used_bytes);
	super->s_generation = be32_to_cpu(dynsb->ds_generation);
}
85
/*
 * Replay a JE_ANCHOR journal entry: restore the master inode (the
 * anchor of the inode file) and the last allocated inode number.
 */
static void read_anchor(struct super_block *sb,
		struct logfs_je_anchor *da)
{
	struct logfs_super *super = logfs_super(sb);
	struct inode *inode = super->s_master_inode;
	struct logfs_inode *li = logfs_inode(inode);
	int i;

	super->s_last_ino = be64_to_cpu(da->da_last_ino);
	li->li_flags = 0;
	li->li_height = da->da_height;
	i_size_write(inode, be64_to_cpu(da->da_size));
	li->li_used_bytes = be64_to_cpu(da->da_used_bytes);

	/* Embedded block pointers of the master inode. */
	for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
		li->li_data[i] = be64_to_cpu(da->da_data[i]);
}
103
/*
 * Replay a JE_ERASECOUNT journal entry: restore the erase counter for
 * each journal segment.
 */
static void read_erasecount(struct super_block *sb,
		struct logfs_je_journal_ec *ec)
{
	struct logfs_super *super = logfs_super(sb);
	int i;

	journal_for_each(i)
		super->s_journal_ec[i] = be32_to_cpu(ec->ec[i]);
}
113
114static int read_area(struct super_block *sb, struct logfs_je_area *a)
115{
116 struct logfs_super *super = logfs_super(sb);
117 struct logfs_area *area = super->s_area[a->gc_level];
118 u64 ofs;
119 u32 writemask = ~(super->s_writesize - 1);
120
121 if (a->gc_level >= LOGFS_NO_AREAS)
122 return -EIO;
123 if (a->vim != VIM_DEFAULT)
124 return -EIO; /* TODO: close area and continue */
125
126 area->a_used_bytes = be32_to_cpu(a->used_bytes);
127 area->a_written_bytes = area->a_used_bytes & writemask;
128 area->a_segno = be32_to_cpu(a->segno);
129 if (area->a_segno)
130 area->a_is_open = 1;
131
132 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
133 if (super->s_writesize > 1)
134 logfs_buf_recover(area, ofs, a + 1, super->s_writesize);
135 else
136 logfs_buf_recover(area, ofs, NULL, 0);
137 return 0;
138}
139
140static void *unpack(void *from, void *to)
141{
142 struct logfs_journal_header *jh = from;
143 void *data = from + sizeof(struct logfs_journal_header);
144 int err;
145 size_t inlen, outlen;
146
147 inlen = be16_to_cpu(jh->h_len);
148 outlen = be16_to_cpu(jh->h_datalen);
149
150 if (jh->h_compr == COMPR_NONE)
151 memcpy(to, data, inlen);
152 else {
153 err = logfs_uncompress(data, to, inlen, outlen);
154 BUG_ON(err);
155 }
156 return to;
157}
158
/*
 * Read and sanity-check only the header of the journal entry at @ofs.
 * Rejects entries whose stored length exceeds the block size, whose
 * type lies outside [JE_FIRST, JE_LAST], or whose uncompressed size
 * would overflow the scratch buffers sized in logfs_init_journal().
 */
static int __read_je_header(struct super_block *sb, u64 ofs,
		struct logfs_journal_header *jh)
{
	struct logfs_super *super = logfs_super(sb);
	/* Must match the allocation size of s_je/s_compressed_je. */
	size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize)
		+ MAX_JOURNAL_HEADER;
	u16 type, len, datalen;
	int err;

	/* read header only */
	err = wbuf_read(sb, ofs, sizeof(*jh), jh);
	if (err)
		return err;
	type = be16_to_cpu(jh->h_type);
	len = be16_to_cpu(jh->h_len);
	datalen = be16_to_cpu(jh->h_datalen);
	if (len > sb->s_blocksize)
		return -EIO;
	if ((type < JE_FIRST) || (type > JE_LAST))
		return -EIO;
	if (datalen > bufsize)
		return -EIO;
	return 0;
}
183
/*
 * Read the payload of the journal entry at @ofs (header already in
 * *jh) into the buffer following the header and verify its CRC.
 * Also accepts the CRC of an older, buggy mkfs/kernel (see below).
 * Returns 0 on success or -EIO on CRC mismatch.
 */
static int __read_je_payload(struct super_block *sb, u64 ofs,
		struct logfs_journal_header *jh)
{
	u16 len;
	int err;

	len = be16_to_cpu(jh->h_len);
	err = wbuf_read(sb, ofs + sizeof(*jh), len, jh + 1);
	if (err)
		return err;
	if (jh->h_crc != logfs_crc32(jh, len + sizeof(*jh), 4)) {
		/* Old code was confused. It forgot about the header length
		 * and stopped calculating the crc 16 bytes before the end
		 * of data - ick!
		 * FIXME: Remove this hack once the old code is fixed.
		 */
		if (jh->h_crc == logfs_crc32(jh, len, 4))
			WARN_ON_ONCE(1);
		else
			return -EIO;
	}
	return 0;
}
207
/*
 * Read a complete journal entry (header + payload) at @ofs.
 * jh needs to be large enough to hold the complete entry, not just the
 * header.  Returns 0 or a negative error code.
 */
static int __read_je(struct super_block *sb, u64 ofs,
		struct logfs_journal_header *jh)
{
	int err;

	err = __read_je_header(sb, ofs, jh);
	if (err)
		return err;
	return __read_je_payload(sb, ofs, jh);
}
221
222static int read_je(struct super_block *sb, u64 ofs)
223{
224 struct logfs_super *super = logfs_super(sb);
225 struct logfs_journal_header *jh = super->s_compressed_je;
226 void *scratch = super->s_je;
227 u16 type, datalen;
228 int err;
229
230 err = __read_je(sb, ofs, jh);
231 if (err)
232 return err;
233 type = be16_to_cpu(jh->h_type);
234 datalen = be16_to_cpu(jh->h_datalen);
235
236 switch (type) {
237 case JE_DYNSB:
238 read_dynsb(sb, unpack(jh, scratch));
239 break;
240 case JE_ANCHOR:
241 read_anchor(sb, unpack(jh, scratch));
242 break;
243 case JE_ERASECOUNT:
244 read_erasecount(sb, unpack(jh, scratch));
245 break;
246 case JE_AREA:
247 read_area(sb, unpack(jh, scratch));
248 break;
249 case JE_OBJ_ALIAS:
250 err = logfs_load_object_aliases(sb, unpack(jh, scratch),
251 datalen);
252 break;
253 default:
254 WARN_ON_ONCE(1);
255 return -EIO;
256 }
257 return err;
258}
259
260static int logfs_read_segment(struct super_block *sb, u32 segno)
261{
262 struct logfs_super *super = logfs_super(sb);
263 struct logfs_journal_header *jh = super->s_compressed_je;
264 u64 ofs, seg_ofs = dev_ofs(sb, segno, 0);
265 u32 h_ofs, last_ofs = 0;
266 u16 len, datalen, last_len;
267 int i, err;
268
269 /* search for most recent commit */
270 for (h_ofs = 0; h_ofs < super->s_segsize; h_ofs += sizeof(*jh)) {
271 ofs = seg_ofs + h_ofs;
272 err = __read_je_header(sb, ofs, jh);
273 if (err)
274 continue;
275 if (jh->h_type != cpu_to_be16(JE_COMMIT))
276 continue;
277 err = __read_je_payload(sb, ofs, jh);
278 if (err)
279 continue;
280 len = be16_to_cpu(jh->h_len);
281 datalen = be16_to_cpu(jh->h_datalen);
282 if ((datalen > sizeof(super->s_je_array)) ||
283 (datalen % sizeof(__be64)))
284 continue;
285 last_ofs = h_ofs;
286 last_len = datalen;
287 h_ofs += ALIGN(len, sizeof(*jh)) - sizeof(*jh);
288 }
289 /* read commit */
290 if (last_ofs == 0)
291 return -ENOENT;
292 ofs = seg_ofs + last_ofs;
293 log_journal("Read commit from %llx\n", ofs);
294 err = __read_je(sb, ofs, jh);
295 BUG_ON(err); /* We should have caught it in the scan loop already */
296 if (err)
297 return err;
298 /* uncompress */
299 unpack(jh, super->s_je_array);
300 super->s_no_je = last_len / sizeof(__be64);
301 /* iterate over array */
302 for (i = 0; i < super->s_no_je; i++) {
303 err = read_je(sb, be64_to_cpu(super->s_je_array[i]));
304 if (err)
305 return err;
306 }
307 super->s_journal_area->a_segno = segno;
308 return 0;
309}
310
311static u64 read_gec(struct super_block *sb, u32 segno)
312{
313 struct logfs_segment_header sh;
314 __be32 crc;
315 int err;
316
317 if (!segno)
318 return 0;
319 err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
320 if (err)
321 return 0;
322 crc = logfs_crc32(&sh, sizeof(sh), 4);
323 if (crc != sh.crc) {
324 WARN_ON(sh.gec != cpu_to_be64(0xffffffffffffffffull));
325 /* Most likely it was just erased */
326 return 0;
327 }
328 return be64_to_cpu(sh.gec);
329}
330
331static int logfs_read_journal(struct super_block *sb)
332{
333 struct logfs_super *super = logfs_super(sb);
334 u64 gec[LOGFS_JOURNAL_SEGS], max;
335 u32 segno;
336 int i, max_i;
337
338 max = 0;
339 max_i = -1;
340 journal_for_each(i) {
341 segno = super->s_journal_seg[i];
342 gec[i] = read_gec(sb, super->s_journal_seg[i]);
343 if (gec[i] > max) {
344 max = gec[i];
345 max_i = i;
346 }
347 }
348 if (max_i == -1)
349 return -EIO;
350 /* FIXME: Try older segments in case of error */
351 return logfs_read_segment(sb, super->s_journal_seg[max_i]);
352}
353
/*
 * First search the current segment (outer loop), then pick the next segment
 * in the array, skipping any zero entries (inner loop).
 * Bumps that segment's erase count and rotates the journal area onto it.
 * BUGs if the area's current segment is not in the journal list at all.
 */
static void journal_get_free_segment(struct logfs_area *area)
{
	struct logfs_super *super = logfs_super(area->a_sb);
	int i;

	journal_for_each(i) {
		if (area->a_segno != super->s_journal_seg[i])
			continue;

		/* Advance to the next non-zero entry, wrapping around.
		 * Relies on at least one non-zero journal segment. */
		do {
			i++;
			if (i == LOGFS_JOURNAL_SEGS)
				i = 0;
		} while (!super->s_journal_seg[i]);

		area->a_segno = super->s_journal_seg[i];
		area->a_erase_count = ++(super->s_journal_ec[i]);
		log_journal("Journal now at %x (ec %x)\n", area->a_segno,
				area->a_erase_count);
		return;
	}
	BUG();
}
381
/* No-op implementation of the area op; see comment below. */
static void journal_get_erase_count(struct logfs_area *area)
{
	/* erase count is stored globally and incremented in
	 * journal_get_free_segment() - nothing to do here */
}
387
/*
 * Erase the journal area's current segment and write a fresh segment
 * header (type SEG_JOURNAL) at its start.  Accounts the header bytes
 * as used in the area.  Returns 0 or the erase error.
 */
static int journal_erase_segment(struct logfs_area *area)
{
	struct super_block *sb = area->a_sb;
	struct logfs_segment_header sh;
	u64 ofs;
	int err;

	err = logfs_erase_segment(sb, area->a_segno);
	if (err)
		return err;

	sh.pad = 0;
	sh.type = SEG_JOURNAL;
	sh.level = 0;
	sh.segno = cpu_to_be32(area->a_segno);
	sh.ec = cpu_to_be32(area->a_erase_count);
	sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
	/* CRC covers everything after the 4-byte crc field itself. */
	sh.crc = logfs_crc32(&sh, sizeof(sh), 4);

	/* This causes a bug in segment.c. Not yet. */
	//logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0);

	ofs = dev_ofs(sb, area->a_segno, 0);
	area->a_used_bytes = ALIGN(sizeof(sh), 16);
	logfs_buf_write(area, ofs, &sh, sizeof(sh));
	return 0;
}
415
/*
 * Fill in a journal entry header in front of an already prepared
 * payload: stored length @len, uncompressed length @datalen, entry
 * @type and compression flag @compr, plus a monotonically increasing
 * version and a CRC over header (minus crc field) and payload.
 * Returns the total on-medium size of the entry (payload aligned
 * to 16 bytes plus the header).
 */
static size_t __logfs_write_header(struct logfs_super *super,
		struct logfs_journal_header *jh, size_t len, size_t datalen,
		u16 type, u8 compr)
{
	jh->h_len = cpu_to_be16(len);
	jh->h_type = cpu_to_be16(type);
	jh->h_version = cpu_to_be16(++super->s_last_version);
	jh->h_datalen = cpu_to_be16(datalen);
	jh->h_compr = compr;
	/* "HAT" marker in the padding bytes. */
	jh->h_pad[0] = 'H';
	jh->h_pad[1] = 'A';
	jh->h_pad[2] = 'T';
	jh->h_crc = logfs_crc32(jh, len + sizeof(*jh), 4);
	return ALIGN(len, 16) + sizeof(*jh);
}
431
/*
 * Convenience wrapper around __logfs_write_header() for uncompressed
 * entries, where stored and uncompressed length are identical.
 */
static size_t logfs_write_header(struct logfs_super *super,
		struct logfs_journal_header *jh, size_t datalen, u16 type)
{
	size_t len = datalen;

	return __logfs_write_header(super, jh, len, datalen, type, COMPR_NONE);
}
439
/* Payload size of a JE_ERASECOUNT entry: one __be32 per journal segment. */
static inline size_t logfs_journal_erasecount_size(struct logfs_super *super)
{
	return LOGFS_JOURNAL_SEGS * sizeof(__be32);
}
444
/*
 * Journal-entry generator (see logfs_write_je): serialize the per-
 * journal-segment erase counters into @_ec and report type and length
 * through @type/@len.  Returns the buffer to be written.
 */
static void *logfs_write_erasecount(struct super_block *sb, void *_ec,
		u16 *type, size_t *len)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_je_journal_ec *ec = _ec;
	int i;

	journal_for_each(i)
		ec->ec[i] = cpu_to_be32(super->s_journal_ec[i]);
	*type = JE_ERASECOUNT;
	*len = logfs_journal_erasecount_size(super);
	return ec;
}
458
/*
 * Btree visitor callback (hence the unsigned long sb and ignored key
 * arguments): commit one shadow record to the global space accounting,
 * moving its byte deltas from the "dirty" counters to the real
 * free/used counters and updating per-segment usage.  Frees the shadow
 * back into its mempool.
 */
static void account_shadow(void *_shadow, unsigned long _sb, u64 ignore,
		size_t ignore2)
{
	struct logfs_shadow *shadow = _shadow;
	struct super_block *sb = (void *)_sb;
	struct logfs_super *super = logfs_super(sb);

	/* consume new space */
	super->s_free_bytes -= shadow->new_len;
	super->s_used_bytes += shadow->new_len;
	super->s_dirty_used_bytes -= shadow->new_len;

	/* free up old space */
	super->s_free_bytes += shadow->old_len;
	super->s_used_bytes -= shadow->old_len;
	super->s_dirty_free_bytes -= shadow->old_len;

	logfs_set_segment_used(sb, shadow->old_ofs, -shadow->old_len);
	logfs_set_segment_used(sb, shadow->new_ofs, shadow->new_len);

	log_journal("account_shadow(%llx, %llx, %x) %llx->%llx %x->%x\n",
			shadow->ino, shadow->bix, shadow->gc_level,
			shadow->old_ofs, shadow->new_ofs,
			shadow->old_len, shadow->new_len);
	mempool_free(shadow, super->s_shadow_pool);
}
485
/*
 * Commit every pending shadow (new and old trees) to the space
 * accounting and release the master inode's block structure.
 */
static void account_shadows(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct inode *inode = super->s_master_inode;
	struct logfs_inode *li = logfs_inode(inode);
	struct shadow_tree *tree = &super->s_shadow_tree;

	/* grim visitor empties the trees while calling the callback */
	btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow);
	btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow);

	if (li->li_block) {
		/*
		 * We never actually use the structure, when attached to the
		 * master inode. But it is easier to always free it here than
		 * to have checks in several places elsewhere when allocating
		 * it.
		 */
		/* NOTE(review): li_block is not reset to NULL here -
		 * presumably free_block detaches it; confirm against the
		 * block ops implementation. */
		li->li_block->ops->free_block(sb, li->li_block);
	}
	BUG_ON((s64)li->li_used_bytes < 0);
}
507
/*
 * Journal-entry generator: serialize the master inode and last inode
 * number into a JE_ANCHOR entry.  Returns the buffer to be written and
 * reports type/length through @type/@len.
 */
static void *__logfs_write_anchor(struct super_block *sb, void *_da,
		u16 *type, size_t *len)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_je_anchor *da = _da;
	struct inode *inode = super->s_master_inode;
	struct logfs_inode *li = logfs_inode(inode);
	int i;

	da->da_height = li->li_height;
	da->da_last_ino = cpu_to_be64(super->s_last_ino);
	da->da_size = cpu_to_be64(i_size_read(inode));
	da->da_used_bytes = cpu_to_be64(li->li_used_bytes);
	for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
		da->da_data[i] = cpu_to_be64(li->li_data[i]);
	*type = JE_ANCHOR;
	*len = sizeof(*da);
	return da;
}
527
/*
 * Journal-entry generator: serialize the dynamic superblock fields
 * into a JE_DYNSB entry (inverse of read_dynsb).
 */
static void *logfs_write_dynsb(struct super_block *sb, void *_dynsb,
		u16 *type, size_t *len)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_je_dynsb *dynsb = _dynsb;

	dynsb->ds_gec = cpu_to_be64(super->s_gec);
	dynsb->ds_sweeper = cpu_to_be64(super->s_sweeper);
	dynsb->ds_victim_ino = cpu_to_be64(super->s_victim_ino);
	dynsb->ds_rename_dir = cpu_to_be64(super->s_rename_dir);
	dynsb->ds_rename_pos = cpu_to_be64(super->s_rename_pos);
	dynsb->ds_used_bytes = cpu_to_be64(super->s_used_bytes);
	dynsb->ds_generation = cpu_to_be32(super->s_generation);
	*type = JE_DYNSB;
	*len = sizeof(*dynsb);
	return dynsb;
}
545
/*
 * Copy the area's pending write buffer (the tail of the open segment
 * that has not yet reached write-size alignment) out of the mapping
 * inode's page cache into @wbuf, so it can be attached to a JE_AREA
 * entry and recovered after a crash.  The page must already exist.
 */
static void write_wbuf(struct super_block *sb, struct logfs_area *area,
		void *wbuf)
{
	struct logfs_super *super = logfs_super(sb);
	struct address_space *mapping = super->s_mapping_inode->i_mapping;
	u64 ofs;
	pgoff_t index;
	int page_ofs;
	struct page *page;

	/* Round down to the last write-size boundary of the area. */
	ofs = dev_ofs(sb, area->a_segno,
			area->a_used_bytes & ~(super->s_writesize - 1));
	index = ofs >> PAGE_SHIFT;
	page_ofs = ofs & (PAGE_SIZE - 1);

	page = find_lock_page(mapping, index);
	BUG_ON(!page);
	memcpy(wbuf, page_address(page) + page_ofs, super->s_writesize);
	unlock_page(page);
}
566
/*
 * Journal-entry generator: serialize the open area selected by
 * super->s_sum_index into a JE_AREA entry, appending the raw write
 * buffer for devices with write granularity > 1 (inverse of
 * read_area).
 */
static void *logfs_write_area(struct super_block *sb, void *_a,
		u16 *type, size_t *len)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_area *area = super->s_area[super->s_sum_index];
	struct logfs_je_area *a = _a;

	a->vim = VIM_DEFAULT;
	a->gc_level = super->s_sum_index;
	a->used_bytes = cpu_to_be32(area->a_used_bytes);
	a->segno = cpu_to_be32(area->a_segno);
	if (super->s_writesize > 1)
		/* wbuf contents follow the fixed-size entry. */
		write_wbuf(sb, area, a + 1);

	*type = JE_AREA;
	*len = sizeof(*a) + super->s_writesize;
	return a;
}
585
/*
 * Journal-entry generator for the final JE_COMMIT: the payload is the
 * array of offsets of all entries written so far in this pass.  The
 * scratch buffer @h is unused; s_je_array is returned directly.
 */
static void *logfs_write_commit(struct super_block *sb, void *h,
		u16 *type, size_t *len)
{
	struct logfs_super *super = logfs_super(sb);

	*type = JE_COMMIT;
	*len = super->s_no_je * sizeof(__be64);
	return super->s_je_array;
}
595
/*
 * Prepare a journal entry in the s_compressed_je scratch buffer:
 * compress the payload when worthwhile, fall back to a plain copy when
 * compression fails or for JE_ANCHOR (kept uncompressed), zero-pad to
 * 16 bytes and write the header.  Returns the total on-medium size.
 */
static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type,
		size_t len)
{
	struct logfs_super *super = logfs_super(sb);
	void *header = super->s_compressed_je;
	void *data = header + sizeof(struct logfs_journal_header);
	ssize_t compr_len, pad_len;
	u8 compr = COMPR_ZLIB;

	if (len == 0)
		return logfs_write_header(super, header, 0, type);

	compr_len = logfs_compress(buf, data, len, sb->s_blocksize);
	if (compr_len < 0 || type == JE_ANCHOR) {
		/* Incompressible payload, or anchor which is stored raw. */
		BUG_ON(len > sb->s_blocksize);
		memcpy(data, buf, len);
		compr_len = len;
		compr = COMPR_NONE;
	}

	pad_len = ALIGN(compr_len, 16);
	memset(data + compr_len, 0, pad_len - compr_len);

	return __logfs_write_header(super, header, compr_len, len, type, compr);
}
621
/*
 * Reserve *bytes bytes in the journal area and return the device
 * offset where they start, or -EAGAIN if the area could not be opened.
 * With @must_pad set (used for commit entries), the area is padded up
 * to the device write size and *bytes is updated to the padded length.
 */
static s64 logfs_get_free_bytes(struct logfs_area *area, size_t *bytes,
		int must_pad)
{
	u32 writesize = logfs_super(area->a_sb)->s_writesize;
	s32 ofs;
	int ret;

	ret = logfs_open_area(area, *bytes);
	if (ret)
		return -EAGAIN;

	ofs = area->a_used_bytes;
	area->a_used_bytes += *bytes;

	if (must_pad) {
		area->a_used_bytes = ALIGN(area->a_used_bytes, writesize);
		*bytes = area->a_used_bytes - ofs;
	}

	return dev_ofs(area->a_sb, area->a_segno, ofs);
}
643
/*
 * Serialize @buf as a journal entry of @type, reserve space in the
 * journal area, write the entry and record its offset in s_je_array so
 * the final commit can reference it.  Commit entries force padding to
 * the device write size.  Returns 0 or a negative error (-EAGAIN when
 * the journal rotated to a new segment).
 */
static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type,
		size_t buf_len)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_area *area = super->s_journal_area;
	struct logfs_journal_header *jh = super->s_compressed_je;
	size_t len;
	int must_pad = 0;
	s64 ofs;

	len = __logfs_write_je(sb, buf, type, buf_len);
	if (jh->h_type == cpu_to_be16(JE_COMMIT))
		must_pad = 1;

	ofs = logfs_get_free_bytes(area, &len, must_pad);
	if (ofs < 0)
		return ofs;
	logfs_buf_write(area, ofs, super->s_compressed_je, len);
	super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs);
	return 0;
}
665
/*
 * Generate one journal entry via the @write callback (which fills the
 * s_je scratch buffer and reports type/length) and append it to the
 * journal.  Returns 0 or a negative error.
 */
static int logfs_write_je(struct super_block *sb,
		void* (*write)(struct super_block *sb, void *scratch,
			u16 *type, size_t *len))
{
	void *buf;
	size_t len;
	u16 type;

	buf = write(sb, logfs_super(sb)->s_je, &type, &len);
	return logfs_write_je_buf(sb, buf, type, len);
}
677
/*
 * Append one object alias to the batch buffered in s_je, flushing the
 * batch as a JE_OBJ_ALIAS entry whenever a full block's worth has
 * accumulated.  s_je_fill tracks the fill level between calls; the
 * remainder is flushed by logfs_write_obj_aliases().
 * Returns 0 or a negative error from the flush.
 */
int write_alias_journal(struct super_block *sb, u64 ino, u64 bix,
		level_t level, int child_no, __be64 val)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_obj_alias *oa = super->s_je;
	int err = 0, fill = super->s_je_fill;

	log_aliases("logfs_write_obj_aliases #%x(%llx, %llx, %x, %x) %llx\n",
			fill, ino, bix, level, child_no, be64_to_cpu(val));
	oa[fill].ino = cpu_to_be64(ino);
	oa[fill].bix = cpu_to_be64(bix);
	oa[fill].val = val;
	oa[fill].level = (__force u8)level;
	oa[fill].child_no = cpu_to_be16(child_no);
	fill++;
	if (fill >= sb->s_blocksize / sizeof(*oa)) {
		/* Buffer full - write it out as one journal entry. */
		err = logfs_write_je_buf(sb, oa, JE_OBJ_ALIAS, sb->s_blocksize);
		fill = 0;
	}

	super->s_je_fill = fill;
	return err;
}
701
/*
 * Write all pending object aliases to the journal: walk the pagecache
 * aliases (which call back into write_alias_journal) and flush any
 * partially filled batch left in s_je.  Returns 0 or a negative error.
 */
static int logfs_write_obj_aliases(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int err;

	log_journal("logfs_write_obj_aliases: %d aliases to write\n",
			super->s_no_object_aliases);
	super->s_je_fill = 0;
	err = logfs_write_obj_aliases_pagecache(sb);
	if (err)
		return err;

	if (super->s_je_fill)
		err = logfs_write_je_buf(sb, super->s_je, JE_OBJ_ALIAS,
				super->s_je_fill
				* sizeof(struct logfs_obj_alias));
	return err;
}
720
/*
 * Write all journal entries. The goto logic ensures that all journal entries
 * are written whenever a new segment is used. It is ugly and potentially a
 * bit wasteful, but robustness is more important. With this we can *always*
 * erase all journal segments except the one containing the most recent commit.
 *
 * Any logfs_write_je() failure means the journal rotated to a fresh
 * segment, so the whole entry set is regenerated from the top (the
 * "again" label).  Serialized by s_journal_mutex.
 */
void logfs_write_anchor(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	struct logfs_super *super = logfs_super(sb);
	struct logfs_area *area = super->s_journal_area;
	int i, err;

	BUG_ON(logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
	mutex_lock(&super->s_journal_mutex);

	/* Do this first or suffer corruption */
	logfs_sync_segments(sb);
	account_shadows(sb);

again:
	super->s_no_je = 0;
	/* One JE_AREA entry per open area. */
	for_each_area(i) {
		if (!super->s_area[i]->a_is_open)
			continue;
		super->s_sum_index = i;
		err = logfs_write_je(sb, logfs_write_area);
		if (err)
			goto again;
	}
	err = logfs_write_obj_aliases(sb);
	if (err)
		goto again;
	err = logfs_write_je(sb, logfs_write_erasecount);
	if (err)
		goto again;
	err = logfs_write_je(sb, __logfs_write_anchor);
	if (err)
		goto again;
	err = logfs_write_je(sb, logfs_write_dynsb);
	if (err)
		goto again;
	/*
	 * Order is imperative. First we sync all writes, including the
	 * non-committed journal writes. Then we write the final commit and
	 * sync the current journal segment.
	 * There is a theoretical bug here. Syncing the journal segment will
	 * write a number of journal entries and the final commit. All these
	 * are written in a single operation. If the device layer writes the
	 * data back-to-front, the commit will precede the other journal
	 * entries, leaving a race window.
	 * Two fixes are possible. Preferred is to fix the device layer to
	 * ensure writes happen front-to-back. Alternatively we can insert
	 * another logfs_sync_area() super->s_devops->sync() combo before
	 * writing the commit.
	 */
	/*
	 * On another subject, super->s_devops->sync is usually not necessary.
	 * Unless called from sys_sync or friends, a barrier would suffice.
	 */
	super->s_devops->sync(sb);
	err = logfs_write_je(sb, logfs_write_commit);
	if (err)
		goto again;
	log_journal("Write commit to %llx\n",
			be64_to_cpu(super->s_je_array[super->s_no_je - 1]));
	logfs_sync_area(area);
	BUG_ON(area->a_used_bytes != area->a_written_bytes);
	super->s_devops->sync(sb);

	mutex_unlock(&super->s_journal_mutex);
	return;
}
794
/*
 * Wear-leveling pass for the journal: release every current journal
 * segment, pick the same number of fresh segments from the reserve
 * list, restart the journal area on the first of them, then rewrite
 * the journal and the superblocks so the new location is durable.
 */
void do_logfs_journal_wl_pass(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_area *area = super->s_journal_area;
	u32 segno, ec;
	int i, err;

	log_journal("Journal requires wear-leveling.\n");
	/* Drop old segments */
	journal_for_each(i)
		if (super->s_journal_seg[i]) {
			logfs_set_segment_unreserved(sb,
					super->s_journal_seg[i],
					super->s_journal_ec[i]);
			super->s_journal_seg[i] = 0;
			super->s_journal_ec[i] = 0;
		}
	/* Get new segments */
	for (i = 0; i < super->s_no_journal_segs; i++) {
		segno = get_best_cand(sb, &super->s_reserve_list, &ec);
		super->s_journal_seg[i] = segno;
		super->s_journal_ec[i] = ec;
		logfs_set_segment_reserved(sb, segno);
	}
	/* Manually move journal_area */
	area->a_segno = super->s_journal_seg[0];
	area->a_is_open = 0;
	area->a_used_bytes = 0;
	/* Write journal */
	logfs_write_anchor(super->s_master_inode);
	/* Write superblocks */
	err = logfs_write_sb(sb);
	BUG_ON(err);
}
829
/* Area operations for the journal area (see segment.c for data areas). */
static const struct logfs_area_ops journal_area_ops = {
	.get_free_segment	= journal_get_free_segment,
	.get_erase_count	= journal_get_erase_count,
	.erase_segment		= journal_erase_segment,
};
835
/*
 * Mount-time journal setup: allocate the two scratch buffers (raw and
 * compressed journal entry, each large enough for a block or a write
 * unit plus the maximal header), create the master inode, replay the
 * most recent journal and mark sb/journal segments reserved.
 * Returns 0 or a negative error.
 *
 * NOTE(review): on the error returns below, buffers allocated earlier
 * in this function are not freed here - presumably the caller invokes
 * logfs_cleanup_journal() on failure; confirm against super.c.
 * NOTE(review): the read_journal failure is flattened to -EIO,
 * discarding the specific errno - verify this is intentional.
 */
int logfs_init_journal(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize)
		+ MAX_JOURNAL_HEADER;
	int ret = -ENOMEM;

	mutex_init(&super->s_journal_mutex);
	btree_init_mempool32(&super->s_reserved_segments, super->s_btree_pool);

	super->s_je = kzalloc(bufsize, GFP_KERNEL);
	if (!super->s_je)
		return ret;

	super->s_compressed_je = kzalloc(bufsize, GFP_KERNEL);
	if (!super->s_compressed_je)
		return ret;

	super->s_master_inode = logfs_new_meta_inode(sb, LOGFS_INO_MASTER);
	if (IS_ERR(super->s_master_inode))
		return PTR_ERR(super->s_master_inode);

	ret = logfs_read_journal(sb);
	if (ret)
		return -EIO;

	reserve_sb_and_journal(sb);
	logfs_calc_free(sb);

	super->s_journal_area->a_ops = &journal_area_ops;
	return 0;
}
868
/*
 * Unmount-time teardown: empty the reserved-segments btree, drop the
 * master inode and free the journal scratch buffers.
 */
void logfs_cleanup_journal(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);

	btree_grim_visitor32(&super->s_reserved_segments, 0, NULL);
	destroy_meta_inode(super->s_master_inode);
	super->s_master_inode = NULL;

	kfree(super->s_compressed_je);
	kfree(super->s_je);
}
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
new file mode 100644
index 000000000000..e3082abe9e3b
--- /dev/null
+++ b/fs/logfs/logfs.h
@@ -0,0 +1,722 @@
1/*
2 * fs/logfs/logfs.h
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Private header for logfs.
9 */
10#ifndef FS_LOGFS_LOGFS_H
11#define FS_LOGFS_LOGFS_H
12
13#undef __CHECK_ENDIAN__
14#define __CHECK_ENDIAN__
15
16#include <linux/btree.h>
17#include <linux/crc32.h>
18#include <linux/fs.h>
19#include <linux/kernel.h>
20#include <linux/mempool.h>
21#include <linux/pagemap.h>
22#include <linux/mtd/mtd.h>
23#include "logfs_abi.h"
24
25#define LOGFS_DEBUG_SUPER (0x0001)
26#define LOGFS_DEBUG_SEGMENT (0x0002)
27#define LOGFS_DEBUG_JOURNAL (0x0004)
28#define LOGFS_DEBUG_DIR (0x0008)
29#define LOGFS_DEBUG_FILE (0x0010)
30#define LOGFS_DEBUG_INODE (0x0020)
31#define LOGFS_DEBUG_READWRITE (0x0040)
32#define LOGFS_DEBUG_GC (0x0080)
33#define LOGFS_DEBUG_GC_NOISY (0x0100)
34#define LOGFS_DEBUG_ALIASES (0x0200)
35#define LOGFS_DEBUG_BLOCKMOVE (0x0400)
36#define LOGFS_DEBUG_ALL (0xffffffff)
37
38#define LOGFS_DEBUG (0x01)
39/*
40 * To enable specific log messages, simply define LOGFS_DEBUG to match any
41 * or all of the above.
42 */
43#ifndef LOGFS_DEBUG
44#define LOGFS_DEBUG (0)
45#endif
46
47#define log_cond(cond, fmt, arg...) do { \
48 if (cond) \
49 printk(KERN_DEBUG fmt, ##arg); \
50} while (0)
51
52#define log_super(fmt, arg...) \
53 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SUPER, fmt, ##arg)
54#define log_segment(fmt, arg...) \
55 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SEGMENT, fmt, ##arg)
56#define log_journal(fmt, arg...) \
57 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_JOURNAL, fmt, ##arg)
58#define log_dir(fmt, arg...) \
59 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_DIR, fmt, ##arg)
60#define log_file(fmt, arg...) \
61 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_FILE, fmt, ##arg)
62#define log_inode(fmt, arg...) \
63 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_INODE, fmt, ##arg)
64#define log_readwrite(fmt, arg...) \
65 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_READWRITE, fmt, ##arg)
66#define log_gc(fmt, arg...) \
67 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC, fmt, ##arg)
68#define log_gc_noisy(fmt, arg...) \
69 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC_NOISY, fmt, ##arg)
70#define log_aliases(fmt, arg...) \
71 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_ALIASES, fmt, ##arg)
72#define log_blockmove(fmt, arg...) \
73 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_BLOCKMOVE, fmt, ##arg)
74
75#define PG_pre_locked PG_owner_priv_1
76#define PagePreLocked(page) test_bit(PG_pre_locked, &(page)->flags)
77#define SetPagePreLocked(page) set_bit(PG_pre_locked, &(page)->flags)
78#define ClearPagePreLocked(page) clear_bit(PG_pre_locked, &(page)->flags)
79
80/* FIXME: This should really be somewhere in the 64bit area. */
81#define LOGFS_LINK_MAX (1<<30)
82
83/* Read-only filesystem */
84#define LOGFS_SB_FLAG_RO 0x0001
85#define LOGFS_SB_FLAG_SEG_ALIAS 0x0002
86#define LOGFS_SB_FLAG_OBJ_ALIAS 0x0004
87#define LOGFS_SB_FLAG_SHUTDOWN 0x0008
88
89/* Write Control Flags */
90#define WF_LOCK 0x01 /* take write lock */
91#define WF_WRITE 0x02 /* write block */
92#define WF_DELETE 0x04 /* delete old block */
93
94typedef u8 __bitwise level_t;
95typedef u8 __bitwise gc_level_t;
96
97#define LEVEL(level) ((__force level_t)(level))
98#define GC_LEVEL(gc_level) ((__force gc_level_t)(gc_level))
99
100#define SUBLEVEL(level) ( (void)((level) == LEVEL(1)), \
101 (__force level_t)((__force u8)(level) - 1) )
102
103/**
104 * struct logfs_area - area management information
105 *
106 * @a_sb: the superblock this area belongs to
107 * @a_is_open: 1 if the area is currently open, else 0
108 * @a_segno: segment number of area
109 * @a_written_bytes: number of bytes already written back
110 * @a_used_bytes: number of used bytes
111 * @a_ops: area operations (either journal or ostore)
112 * @a_erase_count: erase count
113 * @a_level: GC level
114 */
115struct logfs_area { /* a segment open for writing */
116 struct super_block *a_sb;
117 int a_is_open;
118 u32 a_segno;
119 u32 a_written_bytes;
120 u32 a_used_bytes;
121 const struct logfs_area_ops *a_ops;
122 u32 a_erase_count;
123 gc_level_t a_level;
124};
125
126/**
127 * struct logfs_area_ops - area operations
128 *
129 * @get_free_segment: fill area->ofs with the offset of a free segment
130 * @get_erase_count: fill area->erase_count (needs area->ofs)
131 * @erase_segment: erase and setup segment
132 */
133struct logfs_area_ops {
134 void (*get_free_segment)(struct logfs_area *area);
135 void (*get_erase_count)(struct logfs_area *area);
136 int (*erase_segment)(struct logfs_area *area);
137};
138
/**
 * struct logfs_device_ops - device access operations
 *
 * @find_first_sb:	return the page holding the first superblock
 *			(*ofs receives its device offset)
 * @find_last_sb:	return the page holding the last superblock
 *			(*ofs receives its device offset)
 * @write_sb:		write one superblock page back to the device
 * @readpage:		read one page (mm page)
 * @writeseg:		write one segment.  may be a partial segment
 * @erase:		erase part of the device
 * @sync:		wait for/flush outstanding writes on the device
 * @put_device:		release the underlying device
 */
struct logfs_device_ops {
	struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs);
	struct page *(*find_last_sb)(struct super_block *sb, u64 *ofs);
	int (*write_sb)(struct super_block *sb, struct page *page);
	int (*readpage)(void *_sb, struct page *page);
	void (*writeseg)(struct super_block *sb, u64 ofs, size_t len);
	int (*erase)(struct super_block *sb, loff_t ofs, size_t len);
	void (*sync)(struct super_block *sb);
	void (*put_device)(struct super_block *sb);
};
158
159/**
160 * struct candidate_list - list of similar candidates
161 */
162struct candidate_list {
163 struct rb_root rb_tree;
164 int count;
165 int maxcount;
166 int sort_by_ec;
167};
168
/**
 * struct gc_candidate - "candidate" segment to be garbage collected next
 *
 * @rb_node:	tree node, linked into the owning candidate_list's rb_tree
 * @list:	list (either free or low)
 * @segno:	segment number
 * @valid:	number of valid bytes
 * @erase_count: erase count of segment
 * @dist:	distance from tree root
 *
 * Candidates can be on two lists.  The free list contains electees rather
 * than candidates - segments that no longer contain any valid data.  The
 * low list contains candidates to be picked for GC.  It should be kept
 * short.  It is not required to always pick a perfect candidate.  In the
 * worst case GC will have to move more data than absolutely necessary.
 */
struct gc_candidate {
	struct rb_node rb_node;
	struct candidate_list *list;
	u32 segno;
	u32 valid;
	u32 erase_count;
	u8 dist;
};
192
/**
 * struct logfs_journal_entry - temporary structure used during journal scan
 *
 * @used:	in-use flag for this slot
 * @version:	normalized version
 * @len:	length
 * @datalen:	length of the entry's payload data
 * @offset:	offset
 */
struct logfs_journal_entry {
	int used;
	s16 version;
	u16 len;
	u16 datalen;
	u64 offset;
};
208
209enum transaction_state {
210 CREATE_1 = 1,
211 CREATE_2,
212 UNLINK_1,
213 UNLINK_2,
214 CROSS_RENAME_1,
215 CROSS_RENAME_2,
216 TARGET_RENAME_1,
217 TARGET_RENAME_2,
218 TARGET_RENAME_3
219};
220
221/**
222 * struct logfs_transaction - essential fields to support atomic dirops
223 *
224 * @ino: target inode
225 * @dir: inode of directory containing dentry
226 * @pos: pos of dentry in directory
227 */
228struct logfs_transaction {
229 enum transaction_state state;
230 u64 ino;
231 u64 dir;
232 u64 pos;
233};
234
235/**
236 * struct logfs_shadow - old block in the shadow of a not-yet-committed new one
237 * @old_ofs: offset of old block on medium
238 * @new_ofs: offset of new block on medium
239 * @ino: inode number
240 * @bix: block index
241 * @old_len: size of old block, including header
242 * @new_len: size of new block, including header
243 * @level: block level
244 */
245struct logfs_shadow {
246 u64 old_ofs;
247 u64 new_ofs;
248 u64 ino;
249 u64 bix;
250 int old_len;
251 int new_len;
252 gc_level_t gc_level;
253};
254
255/**
256 * struct shadow_tree
257 * @new: shadows where old_ofs==0, indexed by new_ofs
258 * @old: shadows where old_ofs!=0, indexed by old_ofs
259 */
260struct shadow_tree {
261 struct btree_head64 new;
262 struct btree_head64 old;
263};
264
265struct object_alias_item {
266 struct list_head list;
267 __be64 val;
268 int child_no;
269};
270
271/**
272 * struct logfs_block - contains any block state
273 * @type: indirect block or inode
274 * @full: number of fully populated children
275 * @partial: number of partially populated children
276 *
277 * Most blocks are directly represented by page cache pages. But when a block
278 * becomes dirty, is part of a transaction, contains aliases or is otherwise
279 * special, a struct logfs_block is allocated to track the additional state.
280 * Inodes are very similar to indirect blocks, so they can also get one of
281 * these structures added when appropriate.
282 */
283#define BLOCK_INDIRECT 1 /* Indirect block */
284#define BLOCK_INODE 2 /* Inode */
285struct logfs_block_ops;
286struct logfs_block {
287 struct list_head alias_list;
288 struct list_head item_list;
289 struct super_block *sb;
290 u64 ino;
291 u64 bix;
292 level_t level;
293 struct page *page;
294 struct inode *inode;
295 struct logfs_transaction *ta;
296 unsigned long alias_map[LOGFS_BLOCK_FACTOR / BITS_PER_LONG];
297 struct logfs_block_ops *ops;
298 int full;
299 int partial;
300 int reserved_bytes;
301};
302
303typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix,
304 level_t level, int child_no, __be64 val);
305struct logfs_block_ops {
306 void (*write_block)(struct logfs_block *block);
307 gc_level_t (*block_level)(struct logfs_block *block);
308 void (*free_block)(struct super_block *sb, struct logfs_block*block);
309 int (*write_alias)(struct super_block *sb,
310 struct logfs_block *block,
311 write_alias_t *write_one_alias);
312};
313
314struct logfs_super {
315 struct mtd_info *s_mtd; /* underlying device */
316 struct block_device *s_bdev; /* underlying device */
317 const struct logfs_device_ops *s_devops;/* device access */
318 struct inode *s_master_inode; /* inode file */
319 struct inode *s_segfile_inode; /* segment file */
320 struct inode *s_mapping_inode; /* device mapping */
	atomic_t s_pending_writes; /* outstanding bios */
322 long s_flags;
323 mempool_t *s_btree_pool; /* for btree nodes */
324 mempool_t *s_alias_pool; /* aliases in segment.c */
325 u64 s_feature_incompat;
326 u64 s_feature_ro_compat;
327 u64 s_feature_compat;
328 u64 s_feature_flags;
329 u64 s_sb_ofs[2];
330 /* alias.c fields */
331 struct btree_head32 s_segment_alias; /* remapped segments */
332 int s_no_object_aliases;
333 struct list_head s_object_alias; /* remapped objects */
334 struct btree_head128 s_object_alias_tree; /* remapped objects */
335 struct mutex s_object_alias_mutex;
336 /* dir.c fields */
337 struct mutex s_dirop_mutex; /* for creat/unlink/rename */
338 u64 s_victim_ino; /* used for atomic dir-ops */
339 u64 s_rename_dir; /* source directory ino */
340 u64 s_rename_pos; /* position of source dd */
341 /* gc.c fields */
342 long s_segsize; /* size of a segment */
343 int s_segshift; /* log2 of segment size */
344 long s_segmask; /* 1 << s_segshift - 1 */
345 long s_no_segs; /* segments on device */
346 long s_no_journal_segs; /* segments used for journal */
347 long s_no_blocks; /* blocks per segment */
348 long s_writesize; /* minimum write size */
349 int s_writeshift; /* log2 of write size */
350 u64 s_size; /* filesystem size */
351 struct logfs_area *s_area[LOGFS_NO_AREAS]; /* open segment array */
352 u64 s_gec; /* global erase count */
353 u64 s_wl_gec_ostore; /* time of last wl event */
354 u64 s_wl_gec_journal; /* time of last wl event */
355 u64 s_sweeper; /* current sweeper pos */
356 u8 s_ifile_levels; /* max level of ifile */
357 u8 s_iblock_levels; /* max level of regular files */
358 u8 s_data_levels; /* # of segments to leaf block*/
359 u8 s_total_levels; /* sum of above three */
360 struct btree_head32 s_cand_tree; /* all candidates */
361 struct candidate_list s_free_list; /* 100% free segments */
362 struct candidate_list s_reserve_list; /* Bad segment reserve */
363 struct candidate_list s_low_list[LOGFS_NO_AREAS];/* good candidates */
364 struct candidate_list s_ec_list; /* wear level candidates */
365 struct btree_head32 s_reserved_segments;/* sb, journal, bad, etc. */
366 /* inode.c fields */
367 u64 s_last_ino; /* highest ino used */
368 long s_inos_till_wrap;
369 u32 s_generation; /* i_generation for new files */
370 struct list_head s_freeing_list; /* inodes being freed */
371 /* journal.c fields */
372 struct mutex s_journal_mutex;
373 void *s_je; /* journal entry to compress */
374 void *s_compressed_je; /* block to write to journal */
375 u32 s_journal_seg[LOGFS_JOURNAL_SEGS]; /* journal segments */
376 u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */
377 u64 s_last_version;
378 struct logfs_area *s_journal_area; /* open journal segment */
379 __be64 s_je_array[64];
380 int s_no_je;
381
382 int s_sum_index; /* for the 12 summaries */
383 struct shadow_tree s_shadow_tree;
384 int s_je_fill; /* index of current je */
385 /* readwrite.c fields */
386 struct mutex s_write_mutex;
387 int s_lock_count;
388 mempool_t *s_block_pool; /* struct logfs_block pool */
389 mempool_t *s_shadow_pool; /* struct logfs_shadow pool */
390 /*
391 * Space accounting:
392 * - s_used_bytes specifies space used to store valid data objects.
393 * - s_dirty_used_bytes is space used to store non-committed data
394 * objects. Those objects have already been written themselves,
395 * but they don't become valid until all indirect blocks up to the
396 * journal have been written as well.
397 * - s_dirty_free_bytes is space used to store the old copy of a
398 * replaced object, as long as the replacement is non-committed.
399 * In other words, it is the amount of space freed when all dirty
400 * blocks are written back.
401 * - s_free_bytes is the amount of free space available for any
402 * purpose.
403 * - s_root_reserve is the amount of free space available only to
404 * the root user. Non-privileged users can no longer write once
405 * this watermark has been reached.
406 * - s_speed_reserve is space which remains unused to speed up
407 * garbage collection performance.
408 * - s_dirty_pages is the space reserved for currently dirty pages.
409 * It is a pessimistic estimate, so some/most will get freed on
410 * page writeback.
411 *
412 * s_used_bytes + s_free_bytes + s_speed_reserve = total usable size
413 */
414 u64 s_free_bytes;
415 u64 s_used_bytes;
416 u64 s_dirty_free_bytes;
417 u64 s_dirty_used_bytes;
418 u64 s_root_reserve;
419 u64 s_speed_reserve;
420 u64 s_dirty_pages;
421 /* Bad block handling:
422 * - s_bad_seg_reserve is a number of segments usually kept
423 * free. When encountering bad blocks, the affected segment's data
424 * is _temporarily_ moved to a reserved segment.
425 * - s_bad_segments is the number of known bad segments.
426 */
427 u32 s_bad_seg_reserve;
428 u32 s_bad_segments;
429};
430
431/**
432 * struct logfs_inode - in-memory inode
433 *
434 * @vfs_inode: struct inode
435 * @li_data: data pointers
436 * @li_used_bytes: number of used bytes
437 * @li_freeing_list: used to track inodes currently being freed
438 * @li_flags: inode flags
439 * @li_refcount: number of internal (GC-induced) references
440 */
441struct logfs_inode {
442 struct inode vfs_inode;
443 u64 li_data[LOGFS_EMBEDDED_FIELDS];
444 u64 li_used_bytes;
445 struct list_head li_freeing_list;
446 struct logfs_block *li_block;
447 u32 li_flags;
448 u8 li_height;
449 int li_refcount;
450};
451
452#define journal_for_each(__i) for (__i = 0; __i < LOGFS_JOURNAL_SEGS; __i++)
453#define for_each_area(__i) for (__i = 0; __i < LOGFS_NO_AREAS; __i++)
454#define for_each_area_down(__i) for (__i = LOGFS_NO_AREAS - 1; __i >= 0; __i--)
455
456/* compr.c */
457int logfs_compress(void *in, void *out, size_t inlen, size_t outlen);
458int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen);
459int __init logfs_compr_init(void);
460void logfs_compr_exit(void);
461
462/* dev_bdev.c */
463#ifdef CONFIG_BLOCK
464int logfs_get_sb_bdev(struct file_system_type *type, int flags,
465 const char *devname, struct vfsmount *mnt);
466#else
467static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags,
468 const char *devname, struct vfsmount *mnt)
469{
470 return -ENODEV;
471}
472#endif
473
474/* dev_mtd.c */
475#ifdef CONFIG_MTD
476int logfs_get_sb_mtd(struct file_system_type *type, int flags,
477 int mtdnr, struct vfsmount *mnt);
478#else
479static inline int logfs_get_sb_mtd(struct file_system_type *type, int flags,
480 int mtdnr, struct vfsmount *mnt)
481{
482 return -ENODEV;
483}
484#endif
485
486/* dir.c */
487extern const struct inode_operations logfs_symlink_iops;
488extern const struct inode_operations logfs_dir_iops;
489extern const struct file_operations logfs_dir_fops;
490int logfs_replay_journal(struct super_block *sb);
491
492/* file.c */
493extern const struct inode_operations logfs_reg_iops;
494extern const struct file_operations logfs_reg_fops;
495extern const struct address_space_operations logfs_reg_aops;
496int logfs_readpage(struct file *file, struct page *page);
497int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
498 unsigned long arg);
499int logfs_fsync(struct file *file, struct dentry *dentry, int datasync);
500
501/* gc.c */
502u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec);
503void logfs_gc_pass(struct super_block *sb);
504int logfs_check_areas(struct super_block *sb);
505int logfs_init_gc(struct super_block *sb);
506void logfs_cleanup_gc(struct super_block *sb);
507
508/* inode.c */
509extern const struct super_operations logfs_super_operations;
510struct inode *logfs_iget(struct super_block *sb, ino_t ino);
511struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *cookie);
512void logfs_safe_iput(struct inode *inode, int cookie);
513struct inode *logfs_new_inode(struct inode *dir, int mode);
514struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino);
515struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino);
516int logfs_init_inode_cache(void);
517void logfs_destroy_inode_cache(void);
518void destroy_meta_inode(struct inode *inode);
519void logfs_set_blocks(struct inode *inode, u64 no);
520/* these logically belong into inode.c but actually reside in readwrite.c */
521int logfs_read_inode(struct inode *inode);
522int __logfs_write_inode(struct inode *inode, long flags);
523void logfs_delete_inode(struct inode *inode);
524void logfs_clear_inode(struct inode *inode);
525
526/* journal.c */
527void logfs_write_anchor(struct inode *inode);
528int logfs_init_journal(struct super_block *sb);
529void logfs_cleanup_journal(struct super_block *sb);
530int write_alias_journal(struct super_block *sb, u64 ino, u64 bix,
531 level_t level, int child_no, __be64 val);
532void do_logfs_journal_wl_pass(struct super_block *sb);
533
534/* readwrite.c */
535pgoff_t logfs_pack_index(u64 bix, level_t level);
536void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level);
537int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
538 loff_t bix, long flags, struct shadow_tree *shadow_tree);
539int logfs_readpage_nolock(struct page *page);
540int logfs_write_buf(struct inode *inode, struct page *page, long flags);
541int logfs_delete(struct inode *inode, pgoff_t index,
542 struct shadow_tree *shadow_tree);
543int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
544 gc_level_t gc_level, long flags);
545int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix,
546 gc_level_t gc_level);
547int logfs_truncate(struct inode *inode, u64 size);
548u64 logfs_seek_hole(struct inode *inode, u64 bix);
549u64 logfs_seek_data(struct inode *inode, u64 bix);
550int logfs_open_segfile(struct super_block *sb);
551int logfs_init_rw(struct super_block *sb);
552void logfs_cleanup_rw(struct super_block *sb);
553void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta);
554void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta);
555void logfs_write_block(struct logfs_block *block, long flags);
556int logfs_write_obj_aliases_pagecache(struct super_block *sb);
557void logfs_get_segment_entry(struct super_block *sb, u32 segno,
558 struct logfs_segment_entry *se);
559void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment);
560void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec,
561 gc_level_t gc_level);
562void logfs_set_segment_reserved(struct super_block *sb, u32 segno);
563void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec);
564struct logfs_block *__alloc_block(struct super_block *sb,
565 u64 ino, u64 bix, level_t level);
566void __free_block(struct super_block *sb, struct logfs_block *block);
567void btree_write_block(struct logfs_block *block);
568void initialize_block_counters(struct page *page, struct logfs_block *block,
569 __be64 *array, int page_is_empty);
570int logfs_exist_block(struct inode *inode, u64 bix);
571int get_page_reserve(struct inode *inode, struct page *page);
572extern struct logfs_block_ops indirect_block_ops;
573
574/* segment.c */
575int logfs_erase_segment(struct super_block *sb, u32 ofs);
576int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf);
577int logfs_segment_read(struct inode *inode, struct page *page, u64 ofs, u64 bix,
578 level_t level);
579int logfs_segment_write(struct inode *inode, struct page *page,
580 struct logfs_shadow *shadow);
581int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow);
582int logfs_load_object_aliases(struct super_block *sb,
583 struct logfs_obj_alias *oa, int count);
584void move_page_to_btree(struct page *page);
585int logfs_init_mapping(struct super_block *sb);
586void logfs_sync_area(struct logfs_area *area);
587void logfs_sync_segments(struct super_block *sb);
588
589/* area handling */
590int logfs_init_areas(struct super_block *sb);
591void logfs_cleanup_areas(struct super_block *sb);
592int logfs_open_area(struct logfs_area *area, size_t bytes);
593void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
594 int use_filler);
595
/* Write @buf to the area at device offset @ofs, without the filler. */
static inline void logfs_buf_write(struct logfs_area *area, u64 ofs,
		void *buf, size_t len)
{
	__logfs_buf_write(area, ofs, buf, len, 0);
}
601
/* Like logfs_buf_write(), but with the use_filler flag set. */
static inline void logfs_buf_recover(struct logfs_area *area, u64 ofs,
		void *buf, size_t len)
{
	__logfs_buf_write(area, ofs, buf, len, 1);
}
607
608/* super.c */
609struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index);
610void emergency_read_end(struct page *page);
611void logfs_crash_dump(struct super_block *sb);
612void *memchr_inv(const void *s, int c, size_t n);
613int logfs_statfs(struct dentry *dentry, struct kstatfs *stats);
614int logfs_get_sb_device(struct file_system_type *type, int flags,
615 struct mtd_info *mtd, struct block_device *bdev,
616 const struct logfs_device_ops *devops, struct vfsmount *mnt);
617int logfs_check_ds(struct logfs_disk_super *ds);
618int logfs_write_sb(struct super_block *sb);
619
620static inline struct logfs_super *logfs_super(struct super_block *sb)
621{
622 return sb->s_fs_info;
623}
624
/* Map the VFS inode to its embedding logfs_inode (container_of idiom). */
static inline struct logfs_inode *logfs_inode(struct inode *inode)
{
	return container_of(inode, struct logfs_inode, vfs_inode);
}
629
/* Flag the filesystem read-only (LOGFS_SB_FLAG_RO). */
static inline void logfs_set_ro(struct super_block *sb)
{
	logfs_super(sb)->s_flags |= LOGFS_SB_FLAG_RO;
}
634
/*
 * LOGFS_BUG - dump crash state, flag the filesystem read-only, then BUG().
 * The local __sb copy ensures the sb argument is evaluated only once.
 */
#define LOGFS_BUG(sb) do { \
	struct super_block *__sb = sb; \
	logfs_crash_dump(__sb); \
	logfs_super(__sb)->s_flags |= LOGFS_SB_FLAG_RO; \
	BUG(); \
} while (0)

/* BUG_ON() analogue of LOGFS_BUG */
#define LOGFS_BUG_ON(condition, sb) \
	do { if (unlikely(condition)) LOGFS_BUG((sb)); } while (0)
644
/* CRC32 (seed ~0) over data[skip..len), returned in big-endian form. */
static inline __be32 logfs_crc32(void *data, size_t len, size_t skip)
{
	return cpu_to_be32(crc32(~0, data+skip, len-skip));
}
649
/* Extract the file-type nibble (the S_IFMT field) from i_mode. */
static inline u8 logfs_type(struct inode *inode)
{
	return (inode->i_mode >> 12) & 15;
}
654
/* Convert a byte position within a file to its block index. */
static inline pgoff_t logfs_index(struct super_block *sb, u64 pos)
{
	return pos >> sb->s_blocksize_bits;
}
659
/* Combine a segment number and an in-segment offset into a device offset. */
static inline u64 dev_ofs(struct super_block *sb, u32 segno, u32 ofs)
{
	return ((u64)segno << logfs_super(sb)->s_segshift) + ofs;
}
664
/* Segment number containing device offset @ofs. */
static inline u32 seg_no(struct super_block *sb, u64 ofs)
{
	return ofs >> logfs_super(sb)->s_segshift;
}
669
/* Offset of @ofs within its segment. */
static inline u32 seg_ofs(struct super_block *sb, u64 ofs)
{
	return ofs & logfs_super(sb)->s_segmask;
}
674
/* Round a device offset down to the start of its segment. */
static inline u64 seg_align(struct super_block *sb, u64 ofs)
{
	return ofs & ~logfs_super(sb)->s_segmask;
}
679
/* The struct logfs_block tracking this page hangs off page->private. */
static inline struct logfs_block *logfs_block(struct page *page)
{
	return (void *)page->private;
}
684
685static inline level_t shrink_level(gc_level_t __level)
686{
687 u8 level = (__force u8)__level;
688
689 if (level >= LOGFS_MAX_LEVELS)
690 level -= LOGFS_MAX_LEVELS;
691 return (__force level_t)level;
692}
693
/* Map a file level to a GC level; ifile blocks occupy the upper half. */
static inline gc_level_t expand_level(u64 ino, level_t __level)
{
	u8 level = (__force u8)__level;

	if (ino == LOGFS_INO_MASTER) {
		/* ifile has separate areas */
		level += LOGFS_MAX_LEVELS;
	}
	return (__force gc_level_t)level;
}
704
705static inline int logfs_block_shift(struct super_block *sb, level_t level)
706{
707 level = shrink_level((__force gc_level_t)level);
708 return (__force int)level * (sb->s_blocksize_bits - 3);
709}
710
/* Mask clearing the position bits covered by one block at @level. */
static inline u64 logfs_block_mask(struct super_block *sb, level_t level)
{
	return ~0ull << logfs_block_shift(sb, level);
}
715
/* Return the open segment (area) used for writes at @gc_level. */
static inline struct logfs_area *get_area(struct super_block *sb,
		gc_level_t gc_level)
{
	return logfs_super(sb)->s_area[(__force u8)gc_level];
}
721
722#endif
diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h
new file mode 100644
index 000000000000..5d3782ddecc8
--- /dev/null
+++ b/fs/logfs/logfs_abi.h
@@ -0,0 +1,627 @@
1/*
2 * fs/logfs/logfs_abi.h
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Public header for logfs.
9 */
10#ifndef FS_LOGFS_LOGFS_ABI_H
11#define FS_LOGFS_LOGFS_ABI_H
12
13/* For out-of-kernel compiles */
14#ifndef BUILD_BUG_ON
15#define BUILD_BUG_ON(condition) /**/
16#endif
17
18#define SIZE_CHECK(type, size) \
19static inline void check_##type(void) \
20{ \
21 BUILD_BUG_ON(sizeof(struct type) != (size)); \
22}
23
24/*
25 * Throughout the logfs code, we're constantly dealing with blocks at
 * various positions or offsets. To remove confusion, we strictly
27 * distinguish between a "position" - the logical position within a
28 * file and an "offset" - the physical location within the device.
29 *
30 * Any usage of the term offset for a logical location or position for
31 * a physical one is a bug and should get fixed.
32 */
33
34/*
35 * Block are allocated in one of several segments depending on their
36 * level. The following levels are used:
37 * 0 - regular data block
38 * 1 - i1 indirect blocks
39 * 2 - i2 indirect blocks
40 * 3 - i3 indirect blocks
41 * 4 - i4 indirect blocks
42 * 5 - i5 indirect blocks
43 * 6 - ifile data blocks
44 * 7 - ifile i1 indirect blocks
45 * 8 - ifile i2 indirect blocks
46 * 9 - ifile i3 indirect blocks
47 * 10 - ifile i4 indirect blocks
48 * 11 - ifile i5 indirect blocks
49 * Potential levels to be used in the future:
50 * 12 - gc recycled blocks, long-lived data
51 * 13 - replacement blocks, short-lived data
52 *
 * Levels 1-11 are necessary for robust gc operations and help separate
 * short-lived metadata from longer-lived file data. In the future,
 * file data should get separated into several segments based on simple
56 * heuristics. Old data recycled during gc operation is expected to be
57 * long-lived. New data is of uncertain life expectancy. New data
58 * used to replace older blocks in existing files is expected to be
59 * short-lived.
60 */
61
62
63/* Magic numbers. 64bit for superblock, 32bit for statfs f_type */
64#define LOGFS_MAGIC 0xb21f205ac97e8168ull
65#define LOGFS_MAGIC_U32 0xc97e8168u
66
67/*
68 * Various blocksize related macros. Blocksize is currently fixed at 4KiB.
69 * Sooner or later that should become configurable and the macros replaced
70 * by something superblock-dependent. Pointers in indirect blocks are and
71 * will remain 64bit.
72 *
73 * LOGFS_BLOCKSIZE - self-explaining
74 * LOGFS_BLOCK_FACTOR - number of pointers per indirect block
75 * LOGFS_BLOCK_BITS - log2 of LOGFS_BLOCK_FACTOR, used for shifts
76 */
77#define LOGFS_BLOCKSIZE (4096ull)
78#define LOGFS_BLOCK_FACTOR (LOGFS_BLOCKSIZE / sizeof(u64))
79#define LOGFS_BLOCK_BITS (9)
80
81/*
82 * Number of blocks at various levels of indirection. There are 16 direct
83 * block pointers plus a single indirect pointer.
84 */
85#define I0_BLOCKS (16)
86#define I1_BLOCKS LOGFS_BLOCK_FACTOR
87#define I2_BLOCKS (LOGFS_BLOCK_FACTOR * I1_BLOCKS)
88#define I3_BLOCKS (LOGFS_BLOCK_FACTOR * I2_BLOCKS)
89#define I4_BLOCKS (LOGFS_BLOCK_FACTOR * I3_BLOCKS)
90#define I5_BLOCKS (LOGFS_BLOCK_FACTOR * I4_BLOCKS)
91
92#define INDIRECT_INDEX I0_BLOCKS
93#define LOGFS_EMBEDDED_FIELDS (I0_BLOCKS + 1)
94
95/*
96 * Sizes at which files require another level of indirection. Files smaller
97 * than LOGFS_EMBEDDED_SIZE can be completely stored in the inode itself,
 * similar to ext2 fast symlinks.
99 *
100 * Data at a position smaller than LOGFS_I0_SIZE is accessed through the
101 * direct pointers, else through the 1x indirect pointer and so forth.
102 */
103#define LOGFS_EMBEDDED_SIZE (LOGFS_EMBEDDED_FIELDS * sizeof(u64))
104#define LOGFS_I0_SIZE (I0_BLOCKS * LOGFS_BLOCKSIZE)
105#define LOGFS_I1_SIZE (I1_BLOCKS * LOGFS_BLOCKSIZE)
106#define LOGFS_I2_SIZE (I2_BLOCKS * LOGFS_BLOCKSIZE)
107#define LOGFS_I3_SIZE (I3_BLOCKS * LOGFS_BLOCKSIZE)
108#define LOGFS_I4_SIZE (I4_BLOCKS * LOGFS_BLOCKSIZE)
109#define LOGFS_I5_SIZE (I5_BLOCKS * LOGFS_BLOCKSIZE)
110
111/*
112 * Each indirect block pointer must have this flag set, if all block pointers
113 * behind it are set, i.e. there is no hole hidden in the shadow of this
114 * indirect block pointer.
115 */
116#define LOGFS_FULLY_POPULATED (1ULL << 63)
117#define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED)
118
119/*
 * LogFS needs to separate data into levels. Each level is defined as the
121 * maximal possible distance from the master inode (inode of the inode file).
122 * Data blocks reside on level 0, 1x indirect block on level 1, etc.
123 * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11.
124 * This effort is necessary to guarantee garbage collection to always make
125 * progress.
126 *
127 * LOGFS_MAX_INDIRECT is the maximal indirection through indirect blocks,
128 * LOGFS_MAX_LEVELS is one more for the actual data level of a file. It is
129 * the maximal number of levels for one file.
130 * LOGFS_NO_AREAS is twice that, as the inode file and regular files are
131 * effectively stacked on top of each other.
132 */
133#define LOGFS_MAX_INDIRECT (5)
134#define LOGFS_MAX_LEVELS (LOGFS_MAX_INDIRECT + 1)
135#define LOGFS_NO_AREAS (2 * LOGFS_MAX_LEVELS)
136
137/* Maximum size of filenames */
138#define LOGFS_MAX_NAMELEN (255)
139
140/* Number of segments in the primary journal. */
141#define LOGFS_JOURNAL_SEGS (16)
142
143/* Maximum number of free/erased/etc. segments in journal entries */
144#define MAX_CACHED_SEGS (64)
145
146
147/*
148 * LOGFS_OBJECT_HEADERSIZE is the size of a single header in the object store,
149 * LOGFS_MAX_OBJECTSIZE the size of the largest possible object, including
150 * its header,
151 * LOGFS_SEGMENT_RESERVE is the amount of space reserved for each segment for
152 * its segment header and the padded space at the end when no further objects
153 * fit.
154 */
155#define LOGFS_OBJECT_HEADERSIZE (0x1c)
156#define LOGFS_SEGMENT_HEADERSIZE (0x18)
157#define LOGFS_MAX_OBJECTSIZE (LOGFS_OBJECT_HEADERSIZE + LOGFS_BLOCKSIZE)
158#define LOGFS_SEGMENT_RESERVE \
159 (LOGFS_SEGMENT_HEADERSIZE + LOGFS_MAX_OBJECTSIZE - 1)
160
/*
 * Segment types:
 * SEG_SUPER	- Segment contains the superblock
 * SEG_JOURNAL	- Segment is used by the journal
 * SEG_OSTORE	- Segment is part of the object store
 */
167enum {
168 SEG_SUPER = 0x01,
169 SEG_JOURNAL = 0x02,
170 SEG_OSTORE = 0x03,
171};
172
173/**
174 * struct logfs_segment_header - per-segment header in the ostore
175 *
176 * @crc: crc32 of header (there is no data)
177 * @pad: unused, must be 0
178 * @type: segment type, see above
179 * @level: GC level for all objects in this segment
180 * @segno: segment number
181 * @ec: erase count for this segment
182 * @gec: global erase count at time of writing
183 */
184struct logfs_segment_header {
185 __be32 crc;
186 __be16 pad;
187 __u8 type;
188 __u8 level;
189 __be32 segno;
190 __be32 ec;
191 __be64 gec;
192};
193
194SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE);
195
/**
 * struct logfs_disk_super - on-medium superblock
 *
 * @ds_sh:			segment header of the superblock segment
 * @ds_magic:			magic number, must equal LOGFS_MAGIC
 * @ds_crc:			crc32 of structure starting with the next field
 * @ds_ifile_levels:		maximum number of levels for ifile
 * @ds_iblock_levels:		maximum number of levels for regular files
 * @ds_data_levels:		number of separate levels for data
 * @ds_segment_shift:		log2 of segment size
 * @ds_block_shift:		log2 of block size
 * @ds_write_shift:		log2 of write size
 * @pad0:			reserved, must be 0
 * @ds_filesystem_size:		size of the filesystem
 * @ds_segment_size:		size of a single segment
 * @ds_bad_seg_reserve:		number of segments reserved to handle bad blocks
 * @ds_feature_incompat:	incompatible filesystem features
 * @ds_feature_ro_compat:	read-only compatible filesystem features
 * @ds_feature_compat:		compatible filesystem features
 * @ds_feature_flags:		flags
 * @ds_root_reserve:		bytes reserved for the superuser
 * @ds_speed_reserve:		bytes reserved to speed up GC
 * @ds_journal_seg:		segments used by primary journal
 * @ds_super_ofs:		device offsets of the superblock copies
 * @pad3:			reserved, must be 0
 *
 * Contains only read-only fields. Read-write fields like the amount of used
 * space is tracked in the dynamic superblock, which is stored in the journal.
 */
223struct logfs_disk_super {
224 struct logfs_segment_header ds_sh;
225 __be64 ds_magic;
226
227 __be32 ds_crc;
228 __u8 ds_ifile_levels;
229 __u8 ds_iblock_levels;
230 __u8 ds_data_levels;
231 __u8 ds_segment_shift;
232 __u8 ds_block_shift;
233 __u8 ds_write_shift;
234 __u8 pad0[6];
235
236 __be64 ds_filesystem_size;
237 __be32 ds_segment_size;
238 __be32 ds_bad_seg_reserve;
239
240 __be64 ds_feature_incompat;
241 __be64 ds_feature_ro_compat;
242
243 __be64 ds_feature_compat;
244 __be64 ds_feature_flags;
245
246 __be64 ds_root_reserve;
247 __be64 ds_speed_reserve;
248
249 __be32 ds_journal_seg[LOGFS_JOURNAL_SEGS];
250
251 __be64 ds_super_ofs[2];
252 __be64 pad3[8];
253};
254
255SIZE_CHECK(logfs_disk_super, 256);
256
257/*
258 * Object types:
259 * OBJ_BLOCK - Data or indirect block
260 * OBJ_INODE - Inode
261 * OBJ_DENTRY - Dentry
262 */
263enum {
264 OBJ_BLOCK = 0x04,
265 OBJ_INODE = 0x05,
266 OBJ_DENTRY = 0x06,
267};
268
269/**
270 * struct logfs_object_header - per-object header in the ostore
271 *
272 * @crc: crc32 of header, excluding data_crc
273 * @len: length of data
274 * @type: object type, see above
275 * @compr: compression type
276 * @ino: inode number
277 * @bix: block index
278 * @data_crc: crc32 of payload
279 */
280struct logfs_object_header {
281 __be32 crc;
282 __be16 len;
283 __u8 type;
284 __u8 compr;
285 __be64 ino;
286 __be64 bix;
287 __be32 data_crc;
288} __attribute__((packed));
289
290SIZE_CHECK(logfs_object_header, LOGFS_OBJECT_HEADERSIZE);
291
292/*
293 * Reserved inode numbers:
294 * LOGFS_INO_MASTER - master inode (for inode file)
295 * LOGFS_INO_ROOT - root directory
296 * LOGFS_INO_SEGFILE - per-segment used bytes and erase count
297 */
298enum {
299 LOGFS_INO_MAPPING = 0x00,
300 LOGFS_INO_MASTER = 0x01,
301 LOGFS_INO_ROOT = 0x02,
302 LOGFS_INO_SEGFILE = 0x03,
303 LOGFS_RESERVED_INOS = 0x10,
304};
305
306/*
307 * Inode flags. High bits should never be written to the medium. They are
308 * reserved for in-memory usage.
309 * Low bits should either remain in sync with the corresponding FS_*_FL or
310 * reuse slots that obviously don't make sense for logfs.
311 *
312 * LOGFS_IF_DIRTY Inode must be written back
313 * LOGFS_IF_ZOMBIE Inode has been deleted
314 * LOGFS_IF_STILLBORN -ENOSPC happened when creating inode
315 */
316#define LOGFS_IF_COMPRESSED 0x00000004 /* == FS_COMPR_FL */
317#define LOGFS_IF_DIRTY 0x20000000
318#define LOGFS_IF_ZOMBIE 0x40000000
319#define LOGFS_IF_STILLBORN 0x80000000
320
321/* Flags available to chattr */
322#define LOGFS_FL_USER_VISIBLE (LOGFS_IF_COMPRESSED)
323#define LOGFS_FL_USER_MODIFIABLE (LOGFS_IF_COMPRESSED)
324/* Flags inherited from parent directory on file/directory creation */
325#define LOGFS_FL_INHERITED (LOGFS_IF_COMPRESSED)
326
/**
 * struct logfs_disk_inode - on-medium inode
 *
 * @di_mode:		file mode
 * @di_height:		height of the block tree for this file
 * @di_pad:		reserved, must be 0
 * @di_flags:		inode flags, see above
 * @di_uid:		user id
 * @di_gid:		group id
 * @di_ctime:		change time
 * @di_mtime:		modify time
 * @di_atime:		access time
 * @di_refcount:	reference count (aka nlink or link count)
 * @di_generation:	inode generation, for nfs
 * @di_used_bytes:	number of bytes used
 * @di_size:		file size
 * @di_data:		data pointers
 */
343struct logfs_disk_inode {
344 __be16 di_mode;
345 __u8 di_height;
346 __u8 di_pad;
347 __be32 di_flags;
348 __be32 di_uid;
349 __be32 di_gid;
350
351 __be64 di_ctime;
352 __be64 di_mtime;
353
354 __be64 di_atime;
355 __be32 di_refcount;
356 __be32 di_generation;
357
358 __be64 di_used_bytes;
359 __be64 di_size;
360
361 __be64 di_data[LOGFS_EMBEDDED_FIELDS];
362};
363
364SIZE_CHECK(logfs_disk_inode, 200);
365
366#define INODE_POINTER_OFS \
367 (offsetof(struct logfs_disk_inode, di_data) / sizeof(__be64))
368#define INODE_USED_OFS \
369 (offsetof(struct logfs_disk_inode, di_used_bytes) / sizeof(__be64))
370#define INODE_SIZE_OFS \
371 (offsetof(struct logfs_disk_inode, di_size) / sizeof(__be64))
372#define INODE_HEIGHT_OFS (0)
373
374/**
375 * struct logfs_disk_dentry - on-medium dentry structure
376 *
377 * @ino: inode number
378 * @namelen: length of file name
379 * @type: file type, identical to bits 12..15 of mode
380 * @name: file name
381 */
382/* FIXME: add 6 bytes of padding to remove the __packed */
383struct logfs_disk_dentry {
384 __be64 ino;
385 __be16 namelen;
386 __u8 type;
387 __u8 name[LOGFS_MAX_NAMELEN];
388} __attribute__((packed));
389
390SIZE_CHECK(logfs_disk_dentry, 266);
391
392#define RESERVED 0xffffffff
393#define BADSEG 0xffffffff
394/**
395 * struct logfs_segment_entry - segment file entry
396 *
397 * @ec_level: erase count and level
398 * @valid: number of valid bytes
399 *
400 * Segment file contains one entry for every segment. ec_level contains the
401 * erasecount in the upper 28 bits and the level in the lower 4 bits. An
402 * ec_level of BADSEG (-1) identifies bad segments. valid contains the number
403 * of valid bytes or RESERVED (-1 again) if the segment is used for either the
404 * superblock or the journal, or when the segment is bad.
405 */
406struct logfs_segment_entry {
407 __be32 ec_level;
408 __be32 valid;
409};
410
411SIZE_CHECK(logfs_segment_entry, 8);
412
413/**
414 * struct logfs_journal_header - header for journal entries (JEs)
415 *
416 * @h_crc: crc32 of journal entry
417 * @h_len: length of compressed journal entry,
418 * not including header
419 * @h_datalen: length of uncompressed data
420 * @h_type: JE type
421 * @h_version: unnormalized version of journal entry
422 * @h_compr: compression type
423 * @h_pad: reserved
424 */
425struct logfs_journal_header {
426 __be32 h_crc;
427 __be16 h_len;
428 __be16 h_datalen;
429 __be16 h_type;
430 __be16 h_version;
431 __u8 h_compr;
432 __u8 h_pad[3];
433};
434
435SIZE_CHECK(logfs_journal_header, 16);
436
/*
 * Life expectancy of data.
 * VIM_DEFAULT - default vim
 * VIM_SEGFILE - for segment file only - very short-living
 * VIM_GC      - GC'd data - likely long-living (not yet defined in the enum)
 */
443enum logfs_vim {
444 VIM_DEFAULT = 0,
445 VIM_SEGFILE = 1,
446};
447
448/**
449 * struct logfs_je_area - wbuf header
450 *
451 * @segno: segment number of area
452 * @used_bytes: number of bytes already used
453 * @gc_level: GC level
454 * @vim: life expectancy of data
455 *
456 * "Areas" are segments currently being used for writing. There is at least
457 * one area per GC level. Several may be used to seperate long-living from
458 * short-living data. If an area with unknown vim is encountered, it can
459 * simply be closed.
460 * The write buffer immediately follow this header.
461 */
462struct logfs_je_area {
463 __be32 segno;
464 __be32 used_bytes;
465 __u8 gc_level;
466 __u8 vim;
467} __attribute__((packed));
468
469SIZE_CHECK(logfs_je_area, 10);
470
471#define MAX_JOURNAL_HEADER \
472 (sizeof(struct logfs_journal_header) + sizeof(struct logfs_je_area))
473
474/**
475 * struct logfs_je_dynsb - dynamic superblock
476 *
477 * @ds_gec: global erase count
478 * @ds_sweeper: current position of GC "sweeper"
479 * @ds_rename_dir: source directory ino (see dir.c documentation)
480 * @ds_rename_pos: position of source dd (see dir.c documentation)
481 * @ds_victim_ino: victims of incomplete dir operation (see dir.c)
 * @ds_victim_parent:	parent inode of victim (see dir.c)
483 * @ds_used_bytes: number of used bytes
484 */
485struct logfs_je_dynsb {
486 __be64 ds_gec;
487 __be64 ds_sweeper;
488
489 __be64 ds_rename_dir;
490 __be64 ds_rename_pos;
491
492 __be64 ds_victim_ino;
493 __be64 ds_victim_parent; /* XXX */
494
495 __be64 ds_used_bytes;
496 __be32 ds_generation;
497 __be32 pad;
498};
499
500SIZE_CHECK(logfs_je_dynsb, 64);
501
502/**
503 * struct logfs_je_anchor - anchor of filesystem tree, aka master inode
504 *
505 * @da_size: size of inode file
506 * @da_last_ino: last created inode
507 * @da_used_bytes: number of bytes used
508 * @da_data: data pointers
509 */
510struct logfs_je_anchor {
511 __be64 da_size;
512 __be64 da_last_ino;
513
514 __be64 da_used_bytes;
515 u8 da_height;
516 u8 pad[7];
517
518 __be64 da_data[LOGFS_EMBEDDED_FIELDS];
519};
520
521SIZE_CHECK(logfs_je_anchor, 168);
522
523/**
524 * struct logfs_je_spillout - spillout entry (from 1st to 2nd journal)
525 *
526 * @so_segment: segments used for 2nd journal
527 *
528 * Length of the array is given by h_len field in the header.
529 */
530struct logfs_je_spillout {
531 __be64 so_segment[0];
532};
533
534SIZE_CHECK(logfs_je_spillout, 0);
535
536/**
537 * struct logfs_je_journal_ec - erase counts for all journal segments
538 *
539 * @ec: erase count
540 *
541 * Length of the array is given by h_len field in the header.
542 */
543struct logfs_je_journal_ec {
544 __be32 ec[0];
545};
546
547SIZE_CHECK(logfs_je_journal_ec, 0);
548
549/**
 * struct logfs_je_free_segments - list of free segments with erase count
551 */
552struct logfs_je_free_segments {
553 __be32 segno;
554 __be32 ec;
555};
556
557SIZE_CHECK(logfs_je_free_segments, 8);
558
559/**
560 * struct logfs_seg_alias - list of segment aliases
561 */
562struct logfs_seg_alias {
563 __be32 old_segno;
564 __be32 new_segno;
565};
566
567SIZE_CHECK(logfs_seg_alias, 8);
568
569/**
570 * struct logfs_obj_alias - list of object aliases
571 */
572struct logfs_obj_alias {
573 __be64 ino;
574 __be64 bix;
575 __be64 val;
576 u8 level;
577 u8 pad[5];
578 __be16 child_no;
579};
580
581SIZE_CHECK(logfs_obj_alias, 32);
582
583/**
584 * Compression types.
585 *
586 * COMPR_NONE - uncompressed
587 * COMPR_ZLIB - compressed with zlib
588 */
589enum {
590 COMPR_NONE = 0,
591 COMPR_ZLIB = 1,
592};
593
594/*
595 * Journal entries come in groups of 16. First group contains unique
596 * entries, next groups contain one entry per level
597 *
598 * JE_FIRST - smallest possible journal entry number
599 *
600 * JEG_BASE - base group, containing unique entries
601 * JE_COMMIT - commit entry, validates all previous entries
602 * JE_DYNSB - dynamic superblock, anything that ought to be in the
603 * superblock but cannot because it is read-write data
604 * JE_ANCHOR - anchor aka master inode aka inode file's inode
 * JE_ERASECOUNT - erasecounts for all journal segments
 * JE_SPILLOUT	- unused
 * JE_OBJ_ALIAS	- object aliases
 * JE_AREA	- area description
609 *
610 * JE_LAST - largest possible journal entry number
611 */
612enum {
613 JE_FIRST = 0x01,
614
615 JEG_BASE = 0x00,
616 JE_COMMIT = 0x02,
617 JE_DYNSB = 0x03,
618 JE_ANCHOR = 0x04,
619 JE_ERASECOUNT = 0x05,
620 JE_SPILLOUT = 0x06,
621 JE_OBJ_ALIAS = 0x0d,
622 JE_AREA = 0x0e,
623
624 JE_LAST = 0x0e,
625};
626
627#endif
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
new file mode 100644
index 000000000000..1dbe6e8cccec
--- /dev/null
+++ b/fs/logfs/readwrite.c
@@ -0,0 +1,2246 @@
1/*
2 * fs/logfs/readwrite.c
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 *
9 * Actually contains five sets of very similar functions:
10 * read read blocks from a file
11 * seek_hole find next hole
12 * seek_data find next data block
13 * valid check whether a block still belongs to a file
14 * write write blocks to a file
15 * delete delete a block (for directories and ifile)
16 * rewrite move existing blocks of a file to a new location (gc helper)
17 * truncate truncate a file
18 */
19#include "logfs.h"
20#include <linux/sched.h>
21
22static u64 adjust_bix(u64 bix, level_t level)
23{
24 switch (level) {
25 case 0:
26 return bix;
27 case LEVEL(1):
28 return max_t(u64, bix, I0_BLOCKS);
29 case LEVEL(2):
30 return max_t(u64, bix, I1_BLOCKS);
31 case LEVEL(3):
32 return max_t(u64, bix, I2_BLOCKS);
33 case LEVEL(4):
34 return max_t(u64, bix, I3_BLOCKS);
35 case LEVEL(5):
36 return max_t(u64, bix, I4_BLOCKS);
37 default:
38 WARN_ON(1);
39 return bix;
40 }
41}
42
/* Smallest bix that no longer fits into an indirect tree of @height. */
static inline u64 maxbix(u8 height)
{
	return 1ULL << (LOGFS_BLOCK_BITS * height);
}
47
48/**
49 * The inode address space is cut in two halves. Lower half belongs to data
50 * pages, upper half to indirect blocks. If the high bit (INDIRECT_BIT) is
51 * set, the actual block index (bix) and level can be derived from the page
52 * index.
53 *
54 * The lowest three bits of the block index are set to 0 after packing and
55 * unpacking. Since the lowest n bits (9 for 4KiB blocksize) are ignored
56 * anyway this is harmless.
57 */
58#define ARCH_SHIFT (BITS_PER_LONG - 32)
59#define INDIRECT_BIT (0x80000000UL << ARCH_SHIFT)
60#define LEVEL_SHIFT (28 + ARCH_SHIFT)
61static inline pgoff_t first_indirect_block(void)
62{
63 return INDIRECT_BIT | (1ULL << LEVEL_SHIFT);
64}
65
/*
 * Encode (bix, level) into a page-cache index.  Level 0 maps 1:1;
 * higher levels set INDIRECT_BIT, store the level below the top bit and
 * the bix (shifted right by the bits the level resolves) underneath.
 */
pgoff_t logfs_pack_index(u64 bix, level_t level)
{
	pgoff_t index;

	BUG_ON(bix >= INDIRECT_BIT);
	if (level == 0)
		return bix;

	index = INDIRECT_BIT;
	index |= (__force long)level << LEVEL_SHIFT;
	index |= bix >> ((__force u8)level * LOGFS_BLOCK_BITS);
	return index;
}
79
80void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level)
81{
82 u8 __level;
83
84 if (!(index & INDIRECT_BIT)) {
85 *bix = index;
86 *level = 0;
87 return;
88 }
89
90 __level = (index & ~INDIRECT_BIT) >> LEVEL_SHIFT;
91 *level = LEVEL(__level);
92 *bix = (index << (__level * LOGFS_BLOCK_BITS)) & ~INDIRECT_BIT;
93 *bix = adjust_bix(*bix, *level);
94 return;
95}
96#undef ARCH_SHIFT
97#undef INDIRECT_BIT
98#undef LEVEL_SHIFT
99
100/*
101 * Time is stored as nanoseconds since the epoch.
102 */
/* Decode big-endian nanoseconds-since-epoch into a timespec. */
static struct timespec be64_to_timespec(__be64 betime)
{
	return ns_to_timespec(be64_to_cpu(betime));
}
107
108static __be64 timespec_to_be64(struct timespec tsp)
109{
110 return cpu_to_be64((u64)tsp.tv_sec * NSEC_PER_SEC + tsp.tv_nsec);
111}
112
/*
 * Copy an on-medium inode into the in-memory inode.  Device special
 * files and fifos/sockets keep their rdev in di_data[0]; directories,
 * regular files and symlinks carry embedded data / block pointers there.
 */
static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode*inode)
{
	struct logfs_inode *li = logfs_inode(inode);
	int i;

	inode->i_mode	= be16_to_cpu(di->di_mode);
	li->li_height	= di->di_height;
	li->li_flags	= be32_to_cpu(di->di_flags);
	inode->i_uid	= be32_to_cpu(di->di_uid);
	inode->i_gid	= be32_to_cpu(di->di_gid);
	inode->i_size	= be64_to_cpu(di->di_size);
	logfs_set_blocks(inode, be64_to_cpu(di->di_used_bytes));
	inode->i_atime	= be64_to_timespec(di->di_atime);
	inode->i_ctime	= be64_to_timespec(di->di_ctime);
	inode->i_mtime	= be64_to_timespec(di->di_mtime);
	inode->i_nlink	= be32_to_cpu(di->di_refcount);
	inode->i_generation = be32_to_cpu(di->di_generation);

	switch (inode->i_mode & S_IFMT) {
	case S_IFSOCK:	/* fall through */
	case S_IFBLK:	/* fall through */
	case S_IFCHR:	/* fall through */
	case S_IFIFO:
		inode->i_rdev = be64_to_cpu(di->di_data[0]);
		break;
	case S_IFDIR:	/* fall through */
	case S_IFREG:	/* fall through */
	case S_IFLNK:
		for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
			li->li_data[i] = be64_to_cpu(di->di_data[i]);
		break;
	default:
		BUG();
	}
}
148
/*
 * Fill an on-medium inode from the in-memory inode; exact inverse of
 * logfs_disk_to_inode().  All multi-byte fields are stored big-endian.
 */
static void logfs_inode_to_disk(struct inode *inode, struct logfs_disk_inode*di)
{
	struct logfs_inode *li = logfs_inode(inode);
	int i;

	di->di_mode	= cpu_to_be16(inode->i_mode);
	di->di_height	= li->li_height;
	di->di_pad	= 0;
	di->di_flags	= cpu_to_be32(li->li_flags);
	di->di_uid	= cpu_to_be32(inode->i_uid);
	di->di_gid	= cpu_to_be32(inode->i_gid);
	di->di_size	= cpu_to_be64(i_size_read(inode));
	di->di_used_bytes = cpu_to_be64(li->li_used_bytes);
	di->di_atime	= timespec_to_be64(inode->i_atime);
	di->di_ctime	= timespec_to_be64(inode->i_ctime);
	di->di_mtime	= timespec_to_be64(inode->i_mtime);
	di->di_refcount	= cpu_to_be32(inode->i_nlink);
	di->di_generation = cpu_to_be32(inode->i_generation);

	switch (inode->i_mode & S_IFMT) {
	case S_IFSOCK:	/* fall through */
	case S_IFBLK:	/* fall through */
	case S_IFCHR:	/* fall through */
	case S_IFIFO:
		di->di_data[0] = cpu_to_be64(inode->i_rdev);
		break;
	case S_IFDIR:	/* fall through */
	case S_IFREG:	/* fall through */
	case S_IFLNK:
		for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
			di->di_data[i] = cpu_to_be64(li->li_data[i]);
		break;
	default:
		BUG();
	}
}
185
186static void __logfs_set_blocks(struct inode *inode)
187{
188 struct super_block *sb = inode->i_sb;
189 struct logfs_inode *li = logfs_inode(inode);
190
191 inode->i_blocks = ULONG_MAX;
192 if (li->li_used_bytes >> sb->s_blocksize_bits < ULONG_MAX)
193 inode->i_blocks = ALIGN(li->li_used_bytes, 512) >> 9;
194}
195
/* Record the new used-byte count and refresh inode->i_blocks. */
void logfs_set_blocks(struct inode *inode, u64 bytes)
{
	struct logfs_inode *li = logfs_inode(inode);

	li->li_used_bytes = bytes;
	__logfs_set_blocks(inode);
}
203
204static void prelock_page(struct super_block *sb, struct page *page, int lock)
205{
206 struct logfs_super *super = logfs_super(sb);
207
208 BUG_ON(!PageLocked(page));
209 if (lock) {
210 BUG_ON(PagePreLocked(page));
211 SetPagePreLocked(page);
212 } else {
213 /* We are in GC path. */
214 if (PagePreLocked(page))
215 super->s_lock_count++;
216 else
217 SetPagePreLocked(page);
218 }
219}
220
221static void preunlock_page(struct super_block *sb, struct page *page, int lock)
222{
223 struct logfs_super *super = logfs_super(sb);
224
225 BUG_ON(!PageLocked(page));
226 if (lock)
227 ClearPagePreLocked(page);
228 else {
229 /* We are in GC path. */
230 BUG_ON(!PagePreLocked(page));
231 if (super->s_lock_count)
232 super->s_lock_count--;
233 else
234 ClearPagePreLocked(page);
235 }
236}
237
238/*
239 * Logfs is prone to an AB-BA deadlock where one task tries to acquire
240 * s_write_mutex with a locked page and GC tries to get that page while holding
241 * s_write_mutex.
242 * To solve this issue logfs will ignore the page lock iff the page in question
243 * is waiting for s_write_mutex. We annotate this fact by setting PG_pre_locked
244 * in addition to PG_locked.
245 */
/*
 * Prepare for writing: optionally pre-lock @page (see deadlock comment
 * above), then take s_write_mutex and run a GC pass so the write cannot
 * run out of free segments.  With @lock == 0 (GC path) only the page
 * annotation is done.
 */
static void logfs_get_wblocks(struct super_block *sb, struct page *page,
		int lock)
{
	struct logfs_super *super = logfs_super(sb);

	if (page)
		prelock_page(sb, page, lock);

	if (lock) {
		mutex_lock(&super->s_write_mutex);
		logfs_gc_pass(sb);
		/* FIXME: We also have to check for shadowed space
		 * and mempool fill grade */
	}
}
261
/* Counterpart to logfs_get_wblocks(); the ordering below is critical. */
static void logfs_put_wblocks(struct super_block *sb, struct page *page,
		int lock)
{
	struct logfs_super *super = logfs_super(sb);

	if (page)
		preunlock_page(sb, page, lock);
	/* Order matters - we must clear PG_pre_locked before releasing
	 * s_write_mutex or we could race against another task. */
	if (lock)
		mutex_unlock(&super->s_write_mutex);
}
274
275static struct page *logfs_get_read_page(struct inode *inode, u64 bix,
276 level_t level)
277{
278 return find_or_create_page(inode->i_mapping,
279 logfs_pack_index(bix, level), GFP_NOFS);
280}
281
/* Release a page obtained via logfs_get_read_page(). */
static void logfs_put_read_page(struct page *page)
{
	unlock_page(page);
	page_cache_release(page);
}
287
/*
 * Lock @page for writing.  Instead of sleeping on the page lock we spin
 * (with schedule()): if the current holder has pre-locked the page it is
 * waiting for s_write_mutex - possibly held by us - so using the page
 * without the lock is safe.  The bounded loop guards against livelock.
 */
static void logfs_lock_write_page(struct page *page)
{
	int loop = 0;

	while (unlikely(!trylock_page(page))) {
		if (loop++ > 0x1000) {
			/* Has been observed once so far... */
			printk(KERN_ERR "stack at %p\n", &loop);
			BUG();
		}
		if (PagePreLocked(page)) {
			/* Holder of page lock is waiting for us, it
			 * is safe to use this page. */
			break;
		}
		/* Some other process has this page locked and has
		 * nothing to do with us. Wait for it to finish.
		 */
		schedule();
	}
	BUG_ON(!PageLocked(page));
}
310
311static struct page *logfs_get_write_page(struct inode *inode, u64 bix,
312 level_t level)
313{
314 struct address_space *mapping = inode->i_mapping;
315 pgoff_t index = logfs_pack_index(bix, level);
316 struct page *page;
317 int err;
318
319repeat:
320 page = find_get_page(mapping, index);
321 if (!page) {
322 page = __page_cache_alloc(GFP_NOFS);
323 if (!page)
324 return NULL;
325 err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS);
326 if (unlikely(err)) {
327 page_cache_release(page);
328 if (err == -EEXIST)
329 goto repeat;
330 return NULL;
331 }
332 } else logfs_lock_write_page(page);
333 BUG_ON(!PageLocked(page));
334 return page;
335}
336
/* Drop the page lock unless the page is merely pre-locked (GC path). */
static void logfs_unlock_write_page(struct page *page)
{
	if (!PagePreLocked(page))
		unlock_page(page);
}
342
/* Release a page obtained via logfs_get_write_page(). */
static void logfs_put_write_page(struct page *page)
{
	logfs_unlock_write_page(page);
	page_cache_release(page);
}
348
349static struct page *logfs_get_page(struct inode *inode, u64 bix, level_t level,
350 int rw)
351{
352 if (rw == READ)
353 return logfs_get_read_page(inode, bix, level);
354 else
355 return logfs_get_write_page(inode, bix, level);
356}
357
358static void logfs_put_page(struct page *page, int rw)
359{
360 if (rw == READ)
361 logfs_put_read_page(page);
362 else
363 logfs_put_write_page(page);
364}
365
366static unsigned long __get_bits(u64 val, int skip, int no)
367{
368 u64 ret = val;
369
370 ret >>= skip * no;
371 ret <<= 64 - no;
372 ret >>= 64 - no;
373 return ret;
374}
375
/* Extract the LOGFS_BLOCK_BITS-wide index field number @skip from @val. */
static unsigned long get_bits(u64 val, level_t skip)
{
	return __get_bits(val, (__force int)skip, LOGFS_BLOCK_BITS);
}
380
/* Initialize both btrees of a shadow tree, backed by the sb mempool. */
static inline void init_shadow_tree(struct super_block *sb,
		struct shadow_tree *tree)
{
	struct logfs_super *super = logfs_super(sb);

	btree_init_mempool64(&tree->new, super->s_btree_pool);
	btree_init_mempool64(&tree->old, super->s_btree_pool);
}
389
/* Write back a dirty indirect block through logfs_write_buf(). */
static void indirect_write_block(struct logfs_block *block)
{
	struct page *page;
	struct inode *inode;
	int ret;

	page = block->page;
	inode = page->mapping->host;
	logfs_lock_write_page(page);
	ret = logfs_write_buf(inode, page, 0);
	logfs_unlock_write_page(page);
	/*
	 * This needs some rework. Unless you want your filesystem to run
	 * completely synchronously (you don't), the filesystem will always
	 * report writes as 'successful' before the actual work has been
	 * done. The actual work gets done here and this is where any errors
	 * will show up. And there isn't much we can do about it, really.
	 *
	 * Some attempts to fix the errors (move from bad blocks, retry io,...)
	 * have already been done, so anything left should be either a broken
	 * device or a bug somewhere in logfs itself. Being relatively new,
	 * the odds currently favor a bug, so for now the line below isn't
	 * entirely tasteless.
	 */
	BUG_ON(ret);
}
416
417static void inode_write_block(struct logfs_block *block)
418{
419 struct inode *inode;
420 int ret;
421
422 inode = block->inode;
423 if (inode->i_ino == LOGFS_INO_MASTER)
424 logfs_write_anchor(inode);
425 else {
426 ret = __logfs_write_inode(inode, 0);
427 /* see indirect_write_block comment */
428 BUG_ON(ret);
429 }
430}
431
/* GC level of an inode block; the master inode must never get here. */
static gc_level_t inode_block_level(struct logfs_block *block)
{
	BUG_ON(block->inode->i_ino == LOGFS_INO_MASTER);
	return GC_LEVEL(LOGFS_MAX_LEVELS);
}
437
438static gc_level_t indirect_block_level(struct logfs_block *block)
439{
440 struct page *page;
441 struct inode *inode;
442 u64 bix;
443 level_t level;
444
445 page = block->page;
446 inode = page->mapping->host;
447 logfs_unpack_index(page->index, &bix, &level);
448 return expand_level(inode->i_ino, level);
449}
450
/*
 * This silences a false, yet annoying gcc warning. I hate it when my editor
 * jumps into bitops.h each time I recompile this file.
 * TODO: Complain to gcc folks about this and upgrade compiler.
 */
/* Thin wrapper around find_next_bit(). */
static unsigned long fnb(const unsigned long *addr,
		unsigned long size, unsigned long offset)
{
	return find_next_bit(addr, size, offset);
}
461
/*
 * Pack mode, height and flags into the first 64bit word of an on-medium
 * inode, matching struct logfs_disk_inode (di_mode at offset 0,
 * di_height at 2, di_flags at 4).
 */
static __be64 inode_val0(struct inode *inode)
{
	struct logfs_inode *li = logfs_inode(inode);
	u64 val;

	/*
	 * Explicit shifting generates good code, but must match the format
	 * of the structure. Add some paranoia just in case.
	 */
	BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_mode) != 0);
	BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_height) != 2);
	BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_flags) != 4);

	val =	(u64)inode->i_mode << 48 |
		(u64)li->li_height << 40 |
		(u64)li->li_flags;
	return cpu_to_be64(val);
}
480
481static int inode_write_alias(struct super_block *sb,
482 struct logfs_block *block, write_alias_t *write_one_alias)
483{
484 struct inode *inode = block->inode;
485 struct logfs_inode *li = logfs_inode(inode);
486 unsigned long pos;
487 u64 ino , bix;
488 __be64 val;
489 level_t level;
490 int err;
491
492 for (pos = 0; ; pos++) {
493 pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
494 if (pos >= LOGFS_EMBEDDED_FIELDS + INODE_POINTER_OFS)
495 return 0;
496
497 switch (pos) {
498 case INODE_HEIGHT_OFS:
499 val = inode_val0(inode);
500 break;
501 case INODE_USED_OFS:
502 val = cpu_to_be64(li->li_used_bytes);;
503 break;
504 case INODE_SIZE_OFS:
505 val = cpu_to_be64(i_size_read(inode));
506 break;
507 case INODE_POINTER_OFS ... INODE_POINTER_OFS + LOGFS_EMBEDDED_FIELDS - 1:
508 val = cpu_to_be64(li->li_data[pos - INODE_POINTER_OFS]);
509 break;
510 default:
511 BUG();
512 }
513
514 ino = LOGFS_INO_MASTER;
515 bix = inode->i_ino;
516 level = LEVEL(0);
517 err = write_one_alias(sb, ino, bix, level, pos, val);
518 if (err)
519 return err;
520 }
521}
522
523static int indirect_write_alias(struct super_block *sb,
524 struct logfs_block *block, write_alias_t *write_one_alias)
525{
526 unsigned long pos;
527 struct page *page = block->page;
528 u64 ino , bix;
529 __be64 *child, val;
530 level_t level;
531 int err;
532
533 for (pos = 0; ; pos++) {
534 pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
535 if (pos >= LOGFS_BLOCK_FACTOR)
536 return 0;
537
538 ino = page->mapping->host->i_ino;
539 logfs_unpack_index(page->index, &bix, &level);
540 child = kmap_atomic(page, KM_USER0);
541 val = child[pos];
542 kunmap_atomic(child, KM_USER0);
543 err = write_one_alias(sb, ino, bix, level, pos, val);
544 if (err)
545 return err;
546 }
547}
548
549int logfs_write_obj_aliases_pagecache(struct super_block *sb)
550{
551 struct logfs_super *super = logfs_super(sb);
552 struct logfs_block *block;
553 int err;
554
555 list_for_each_entry(block, &super->s_object_alias, alias_list) {
556 err = block->ops->write_alias(sb, block, write_alias_journal);
557 if (err)
558 return err;
559 }
560 return 0;
561}
562
/*
 * Return a logfs_block to the mempool.  The block must carry no pending
 * GC items; it is unlinked from the object-alias list before freeing.
 */
void __free_block(struct super_block *sb, struct logfs_block *block)
{
	BUG_ON(!list_empty(&block->item_list));
	list_del(&block->alias_list);
	mempool_free(block, logfs_super(sb)->s_block_pool);
}
569
570static void inode_free_block(struct super_block *sb, struct logfs_block *block)
571{
572 struct inode *inode = block->inode;
573
574 logfs_inode(inode)->li_block = NULL;
575 __free_block(sb, block);
576}
577
/* Free a page-attached block; detach it from the page first. */
static void indirect_free_block(struct super_block *sb,
		struct logfs_block *block)
{
	ClearPagePrivate(block->page);
	block->page->private = 0;
	__free_block(sb, block);
}
585
586
/* Operations for blocks that shadow an in-memory inode. */
static struct logfs_block_ops inode_block_ops = {
	.write_block = inode_write_block,
	.block_level = inode_block_level,
	.free_block = inode_free_block,
	.write_alias = inode_write_alias,
};
593
/* Operations for blocks attached to pagecache pages (indirect blocks). */
struct logfs_block_ops indirect_block_ops = {
	.write_block = indirect_write_block,
	.block_level = indirect_block_level,
	.free_block = indirect_free_block,
	.write_alias = indirect_write_alias,
};
600
601struct logfs_block *__alloc_block(struct super_block *sb,
602 u64 ino, u64 bix, level_t level)
603{
604 struct logfs_super *super = logfs_super(sb);
605 struct logfs_block *block;
606
607 block = mempool_alloc(super->s_block_pool, GFP_NOFS);
608 memset(block, 0, sizeof(*block));
609 INIT_LIST_HEAD(&block->alias_list);
610 INIT_LIST_HEAD(&block->item_list);
611 block->sb = sb;
612 block->ino = ino;
613 block->bix = bix;
614 block->level = level;
615 return block;
616}
617
618static void alloc_inode_block(struct inode *inode)
619{
620 struct logfs_inode *li = logfs_inode(inode);
621 struct logfs_block *block;
622
623 if (li->li_block)
624 return;
625
626 block = __alloc_block(inode->i_sb, LOGFS_INO_MASTER, inode->i_ino, 0);
627 block->inode = inode;
628 li->li_block = block;
629 block->ops = &inode_block_ops;
630}
631
632void initialize_block_counters(struct page *page, struct logfs_block *block,
633 __be64 *array, int page_is_empty)
634{
635 u64 ptr;
636 int i, start;
637
638 block->partial = 0;
639 block->full = 0;
640 start = 0;
641 if (page->index < first_indirect_block()) {
642 /* Counters are pointless on level 0 */
643 return;
644 }
645 if (page->index == first_indirect_block()) {
646 /* Skip unused pointers */
647 start = I0_BLOCKS;
648 block->full = I0_BLOCKS;
649 }
650 if (!page_is_empty) {
651 for (i = start; i < LOGFS_BLOCK_FACTOR; i++) {
652 ptr = be64_to_cpu(array[i]);
653 if (ptr)
654 block->partial++;
655 if (ptr & LOGFS_FULLY_POPULATED)
656 block->full++;
657 }
658 }
659}
660
661static void alloc_data_block(struct inode *inode, struct page *page)
662{
663 struct logfs_block *block;
664 u64 bix;
665 level_t level;
666
667 if (PagePrivate(page))
668 return;
669
670 logfs_unpack_index(page->index, &bix, &level);
671 block = __alloc_block(inode->i_sb, inode->i_ino, bix, level);
672 block->page = page;
673 SetPagePrivate(page);
674 page->private = (unsigned long)block;
675 block->ops = &indirect_block_ops;
676}
677
/*
 * Attach a logfs_block to an indirect-block page (no-op if one exists)
 * and initialize its partial/full counters from the page contents.
 */
static void alloc_indirect_block(struct inode *inode, struct page *page,
		int page_is_empty)
{
	struct logfs_block *block;
	__be64 *array;

	if (PagePrivate(page))
		return;

	alloc_data_block(inode, page);

	block = logfs_block(page);
	/* Map only long enough to scan the pointer array. */
	array = kmap_atomic(page, KM_USER0);
	initialize_block_counters(page, block, array, page_is_empty);
	kunmap_atomic(array, KM_USER0);
}
694
/*
 * Store @ptr in slot @index of the indirect block held by @page and
 * adjust the block's partial/full counters for the old->new transition.
 */
static void block_set_pointer(struct page *page, int index, u64 ptr)
{
	struct logfs_block *block = logfs_block(page);
	__be64 *array;
	u64 oldptr;

	BUG_ON(!block);
	array = kmap_atomic(page, KM_USER0);
	oldptr = be64_to_cpu(array[index]);
	array[index] = cpu_to_be64(ptr);
	kunmap_atomic(array, KM_USER0);
	SetPageUptodate(page);

	/* The !! shrinks each flag test to 0/1 before the arithmetic. */
	block->full += !!(ptr & LOGFS_FULLY_POPULATED)
		- !!(oldptr & LOGFS_FULLY_POPULATED);
	block->partial += !!ptr - !!oldptr;
}
712
713static u64 block_get_pointer(struct page *page, int index)
714{
715 __be64 *block;
716 u64 ptr;
717
718 block = kmap_atomic(page, KM_USER0);
719 ptr = be64_to_cpu(block[index]);
720 kunmap_atomic(block, KM_USER0);
721 return ptr;
722}
723
/* "Read" a block that has no on-medium object: fill the page with zeroes. */
static int logfs_read_empty(struct page *page)
{
	zero_user_segment(page, 0, PAGE_CACHE_SIZE);
	return 0;
}
729
730static int logfs_read_direct(struct inode *inode, struct page *page)
731{
732 struct logfs_inode *li = logfs_inode(inode);
733 pgoff_t index = page->index;
734 u64 block;
735
736 block = li->li_data[index];
737 if (!block)
738 return logfs_read_empty(page);
739
740 return logfs_segment_read(inode, page, block, index, 0);
741}
742
/*
 * Read a block by walking the indirect chain from the inode's top level
 * down to @target_level.  @rw_context selects how intermediate pages are
 * obtained/released (read vs. write paths share this helper).
 */
static int logfs_read_loop(struct inode *inode, struct page *page,
		int rw_context)
{
	struct logfs_inode *li = logfs_inode(inode);
	u64 bix, bofs = li->li_data[INDIRECT_INDEX];
	level_t level, target_level;
	int ret;
	struct page *ipage;

	logfs_unpack_index(page->index, &bix, &target_level);
	/* No indirect tree at all - everything reads as zeroes. */
	if (!bofs)
		return logfs_read_empty(page);

	/* Index lies beyond the area the current tree height covers. */
	if (bix >= maxbix(li->li_height))
		return logfs_read_empty(page);

	for (level = LEVEL(li->li_height);
			(__force u8)level > (__force u8)target_level;
			level = SUBLEVEL(level)){
		ipage = logfs_get_page(inode, bix, level, rw_context);
		if (!ipage)
			return -ENOMEM;

		ret = logfs_segment_read(inode, ipage, bofs, bix, level);
		if (ret) {
			logfs_put_read_page(ipage);
			return ret;
		}

		/* Follow the pointer for our index one level down. */
		bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level)));
		logfs_put_page(ipage, rw_context);
		if (!bofs)
			return logfs_read_empty(page);
	}

	return logfs_segment_read(inode, page, bofs, bix, 0);
}
780
781static int logfs_read_block(struct inode *inode, struct page *page,
782 int rw_context)
783{
784 pgoff_t index = page->index;
785
786 if (index < I0_BLOCKS)
787 return logfs_read_direct(inode, page);
788 return logfs_read_loop(inode, page, rw_context);
789}
790
/*
 * Walk the indirect chain for @bix.  Returns 1 if a block exists at level
 * 0, 0 if any pointer on the way down is missing, negative on read error.
 */
static int logfs_exist_loop(struct inode *inode, u64 bix)
{
	struct logfs_inode *li = logfs_inode(inode);
	u64 bofs = li->li_data[INDIRECT_INDEX];
	level_t level;
	int ret;
	struct page *ipage;

	if (!bofs)
		return 0;
	if (bix >= maxbix(li->li_height))
		return 0;

	for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
		ipage = logfs_get_read_page(inode, bix, level);
		if (!ipage)
			return -ENOMEM;

		ret = logfs_segment_read(inode, ipage, bofs, bix, level);
		if (ret) {
			logfs_put_read_page(ipage);
			return ret;
		}

		/* Follow the pointer for our index one level down. */
		bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level)));
		logfs_put_read_page(ipage);
		if (!bofs)
			return 0;
	}

	return 1;
}
823
824int logfs_exist_block(struct inode *inode, u64 bix)
825{
826 struct logfs_inode *li = logfs_inode(inode);
827
828 if (bix < I0_BLOCKS)
829 return !!li->li_data[bix];
830 return logfs_exist_loop(inode, bix);
831}
832
833static u64 seek_holedata_direct(struct inode *inode, u64 bix, int data)
834{
835 struct logfs_inode *li = logfs_inode(inode);
836
837 for (; bix < I0_BLOCKS; bix++)
838 if (data ^ (li->li_data[bix] == 0))
839 return bix;
840 return I0_BLOCKS;
841}
842
/*
 * Walk the indirect tree from @bix looking for the next data block
 * (@data=1) or hole (@data=0).  At each level, skip sideways over slots
 * that cannot contain the target before descending.  Returns the found
 * (or best-effort, on read error) block index.
 */
static u64 seek_holedata_loop(struct inode *inode, u64 bix, int data)
{
	struct logfs_inode *li = logfs_inode(inode);
	__be64 *rblock;
	u64 increment, bofs = li->li_data[INDIRECT_INDEX];
	level_t level;
	int ret, slot;
	struct page *page;

	BUG_ON(!bofs);

	for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
		/* Number of level-0 blocks covered by one slot at this level. */
		increment = 1 << (LOGFS_BLOCK_BITS * ((__force u8)level-1));
		page = logfs_get_read_page(inode, bix, level);
		if (!page)
			return bix;

		ret = logfs_segment_read(inode, page, bofs, bix, level);
		if (ret) {
			logfs_put_read_page(page);
			return bix;
		}

		slot = get_bits(bix, SUBLEVEL(level));
		rblock = kmap_atomic(page, KM_USER0);
		while (slot < LOGFS_BLOCK_FACTOR) {
			/* Data search: stop at any non-null pointer. */
			if (data && (rblock[slot] != 0))
				break;
			/* Hole search: stop unless the subtree is full. */
			if (!data && !(be64_to_cpu(rblock[slot]) & LOGFS_FULLY_POPULATED))
				break;
			slot++;
			bix += increment;
			bix &= ~(increment - 1);
		}
		if (slot >= LOGFS_BLOCK_FACTOR) {
			/* Ran off the end of this indirect block. */
			kunmap_atomic(rblock, KM_USER0);
			logfs_put_read_page(page);
			return bix;
		}
		bofs = be64_to_cpu(rblock[slot]);
		kunmap_atomic(rblock, KM_USER0);
		logfs_put_read_page(page);
		if (!bofs) {
			/* A null pointer is a hole; impossible in data search. */
			BUG_ON(data);
			return bix;
		}
	}
	return bix;
}
892
/**
 * logfs_seek_hole - find next hole starting at a given block index
 * @inode: inode to search in
 * @bix: block index to start searching
 *
 * Returns next hole. If the file doesn't contain any further holes, the
 * block address next to eof is returned instead.
 */
u64 logfs_seek_hole(struct inode *inode, u64 bix)
{
	struct logfs_inode *li = logfs_inode(inode);

	if (bix < I0_BLOCKS) {
		bix = seek_holedata_direct(inode, bix, 0);
		if (bix < I0_BLOCKS)
			return bix;
	}

	if (!li->li_data[INDIRECT_INDEX])
		return bix;	/* no tree - everything past here is hole */
	else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED)
		bix = maxbix(li->li_height);	/* tree is full - skip it */
	else {
		bix = seek_holedata_loop(inode, bix, 0);
		if (bix < maxbix(li->li_height))
			return bix;
		/* Should not happen anymore. But if some port writes semi-
		 * corrupt images (as this one used to) we might run into it.
		 */
		WARN_ON_ONCE(bix == maxbix(li->li_height));
	}

	return bix;
}
927
928static u64 __logfs_seek_data(struct inode *inode, u64 bix)
929{
930 struct logfs_inode *li = logfs_inode(inode);
931
932 if (bix < I0_BLOCKS) {
933 bix = seek_holedata_direct(inode, bix, 1);
934 if (bix < I0_BLOCKS)
935 return bix;
936 }
937
938 if (bix < maxbix(li->li_height)) {
939 if (!li->li_data[INDIRECT_INDEX])
940 bix = maxbix(li->li_height);
941 else
942 return seek_holedata_loop(inode, bix, 1);
943 }
944
945 return bix;
946}
947
948/**
949 * logfs_seek_data - find next data block after a given block index
950 * @inode: inode to search in
951 * @bix: block index to start searching
952 *
953 * Returns next data block. If the file doesn't contain any further data
954 * blocks, the last block in the file is returned instead.
955 */
956u64 logfs_seek_data(struct inode *inode, u64 bix)
957{
958 struct super_block *sb = inode->i_sb;
959 u64 ret, end;
960
961 ret = __logfs_seek_data(inode, bix);
962 end = i_size_read(inode) >> sb->s_blocksize_bits;
963 if (ret >= end)
964 ret = max(bix, end);
965 return ret;
966}
967
968static int logfs_is_valid_direct(struct logfs_inode *li, u64 bix, u64 ofs)
969{
970 return pure_ofs(li->li_data[bix]) == ofs;
971}
972
/*
 * Walk the indirect chain for @bix, checking at every level whether the
 * pointer equals @ofs.  Returns 1 if @ofs is referenced anywhere on the
 * path, 0 otherwise (including read errors, treated as invalid).
 */
static int __logfs_is_valid_loop(struct inode *inode, u64 bix,
		u64 ofs, u64 bofs)
{
	struct logfs_inode *li = logfs_inode(inode);
	level_t level;
	int ret;
	struct page *page;

	for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)){
		page = logfs_get_write_page(inode, bix, level);
		BUG_ON(!page);

		ret = logfs_segment_read(inode, page, bofs, bix, level);
		if (ret) {
			logfs_put_write_page(page);
			return 0;
		}

		bofs = block_get_pointer(page, get_bits(bix, SUBLEVEL(level)));
		logfs_put_write_page(page);
		if (!bofs)
			return 0;

		if (pure_ofs(bofs) == ofs)
			return 1;
	}
	return 0;
}
1001
1002static int logfs_is_valid_loop(struct inode *inode, u64 bix, u64 ofs)
1003{
1004 struct logfs_inode *li = logfs_inode(inode);
1005 u64 bofs = li->li_data[INDIRECT_INDEX];
1006
1007 if (!bofs)
1008 return 0;
1009
1010 if (bix >= maxbix(li->li_height))
1011 return 0;
1012
1013 if (pure_ofs(bofs) == ofs)
1014 return 1;
1015
1016 return __logfs_is_valid_loop(inode, bix, ofs, bofs);
1017}
1018
1019static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs)
1020{
1021 struct logfs_inode *li = logfs_inode(inode);
1022
1023 if ((inode->i_nlink == 0) && atomic_read(&inode->i_count) == 1)
1024 return 0;
1025
1026 if (bix < I0_BLOCKS)
1027 return logfs_is_valid_direct(li, bix, ofs);
1028 return logfs_is_valid_loop(inode, bix, ofs);
1029}
1030
/**
 * logfs_is_valid_block - check whether this block is still valid
 *
 * @sb	- superblock
 * @ofs	- block physical offset
 * @ino	- block inode number
 * @bix	- block index
 * @gc_level - block level
 *
 * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will
 * become invalid once the journal is written.
 */
int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix,
		gc_level_t gc_level)
{
	struct logfs_super *super = logfs_super(sb);
	struct inode *inode;
	int ret, cookie;

	/* Umount closes a segment with free blocks remaining. Those
	 * blocks are by definition invalid. */
	if (ino == -1)
		return 0;

	/* Inode numbers must fit in a pointer-sized integer here. */
	LOGFS_BUG_ON((u64)(u_long)ino != ino, sb);

	inode = logfs_safe_iget(sb, ino, &cookie);
	if (IS_ERR(inode))
		goto invalid;

	ret = __logfs_is_valid_block(inode, bix, ofs);
	logfs_safe_iput(inode, cookie);
	if (ret)
		return ret;

invalid:
	/* Block is nominally invalid, but may still sit in the shadow tree,
	 * waiting for a journal commit.
	 */
	if (btree_lookup64(&super->s_shadow_tree.old, ofs))
		return 2;
	return 0;
}
1074
1075int logfs_readpage_nolock(struct page *page)
1076{
1077 struct inode *inode = page->mapping->host;
1078 int ret = -EIO;
1079
1080 ret = logfs_read_block(inode, page, READ);
1081
1082 if (ret) {
1083 ClearPageUptodate(page);
1084 SetPageError(page);
1085 } else {
1086 SetPageUptodate(page);
1087 ClearPageError(page);
1088 }
1089 flush_dcache_page(page);
1090
1091 return ret;
1092}
1093
1094static int logfs_reserve_bytes(struct inode *inode, int bytes)
1095{
1096 struct logfs_super *super = logfs_super(inode->i_sb);
1097 u64 available = super->s_free_bytes + super->s_dirty_free_bytes
1098 - super->s_dirty_used_bytes - super->s_dirty_pages;
1099
1100 if (!bytes)
1101 return 0;
1102
1103 if (available < bytes)
1104 return -ENOSPC;
1105
1106 if (available < bytes + super->s_root_reserve &&
1107 !capable(CAP_SYS_RESOURCE))
1108 return -ENOSPC;
1109
1110 return 0;
1111}
1112
/*
 * Reserve worst-case space for writing @page later (the write itself may
 * touch every level of the tree, hence 6 * LOGFS_MAX_OBJECTSIZE).
 * Idempotent: a page that already holds a reservation returns 0.
 */
int get_page_reserve(struct inode *inode, struct page *page)
{
	struct logfs_super *super = logfs_super(inode->i_sb);
	int ret;

	if (logfs_block(page) && logfs_block(page)->reserved_bytes)
		return 0;

	/* Reservation bookkeeping happens under the write lock. */
	logfs_get_wblocks(inode->i_sb, page, WF_LOCK);
	ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE);
	if (!ret) {
		alloc_data_block(inode, page);
		logfs_block(page)->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE;
		super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE;
	}
	logfs_put_wblocks(inode->i_sb, page, WF_LOCK);
	return ret;
}
1131
/*
 * We are protected by write lock. Push victims up to superblock level
 * and release transaction when appropriate.
 */
/* FIXME: This is currently called from the wrong spots. */
static void logfs_handle_transaction(struct inode *inode,
		struct logfs_transaction *ta)
{
	struct logfs_super *super = logfs_super(inode->i_sb);

	if (!ta)
		return;
	logfs_inode(inode)->li_block->ta = NULL;

	if (inode->i_ino != LOGFS_INO_MASTER) {
		BUG(); /* FIXME: Yes, this needs more thought */
		/* just remember the transaction until inode is written */
		//BUG_ON(logfs_inode(inode)->li_transaction);
		//logfs_inode(inode)->li_transaction = ta;
		return;
	}

	switch (ta->state) {
	case CREATE_1: /* fall through */
	case UNLINK_1:
		/* First half: remember the inode as a potential victim. */
		BUG_ON(super->s_victim_ino);
		super->s_victim_ino = ta->ino;
		break;
	case CREATE_2: /* fall through */
	case UNLINK_2:
		/* Second half: the operation completed, clear the victim. */
		BUG_ON(super->s_victim_ino != ta->ino);
		super->s_victim_ino = 0;
		/* transaction ends here - free it */
		kfree(ta);
		break;
	case CROSS_RENAME_1:
		/* Remember the directory entry being rewritten. */
		BUG_ON(super->s_rename_dir);
		BUG_ON(super->s_rename_pos);
		super->s_rename_dir = ta->dir;
		super->s_rename_pos = ta->pos;
		break;
	case CROSS_RENAME_2:
		BUG_ON(super->s_rename_dir != ta->dir);
		BUG_ON(super->s_rename_pos != ta->pos);
		super->s_rename_dir = 0;
		super->s_rename_pos = 0;
		kfree(ta);
		break;
	case TARGET_RENAME_1:
		/* Rename over existing target: track dir, pos and victim. */
		BUG_ON(super->s_rename_dir);
		BUG_ON(super->s_rename_pos);
		BUG_ON(super->s_victim_ino);
		super->s_rename_dir = ta->dir;
		super->s_rename_pos = ta->pos;
		super->s_victim_ino = ta->ino;
		break;
	case TARGET_RENAME_2:
		BUG_ON(super->s_rename_dir != ta->dir);
		BUG_ON(super->s_rename_pos != ta->pos);
		BUG_ON(super->s_victim_ino != ta->ino);
		super->s_rename_dir = 0;
		super->s_rename_pos = 0;
		break;
	case TARGET_RENAME_3:
		/* Final step: victim inode is gone, transaction ends. */
		BUG_ON(super->s_rename_dir);
		BUG_ON(super->s_rename_pos);
		BUG_ON(super->s_victim_ino != ta->ino);
		super->s_victim_ino = 0;
		kfree(ta);
		break;
	default:
		BUG();
	}
}
1206
/*
 * Not strictly a reservation, but rather a check that we still have enough
 * space to satisfy the write.
 */
static int logfs_reserve_blocks(struct inode *inode, int blocks)
{
	return logfs_reserve_bytes(inode, blocks * LOGFS_MAX_OBJECTSIZE);
}
1215
/* State threaded through the recursive write path. */
struct write_control {
	u64 ofs;	/* on-medium offset of the block, 0 = not allocated */
	long flags;	/* WF_WRITE / WF_DELETE / WF_LOCK */
};
1220
1221static struct logfs_shadow *alloc_shadow(struct inode *inode, u64 bix,
1222 level_t level, u64 old_ofs)
1223{
1224 struct logfs_super *super = logfs_super(inode->i_sb);
1225 struct logfs_shadow *shadow;
1226
1227 shadow = mempool_alloc(super->s_shadow_pool, GFP_NOFS);
1228 memset(shadow, 0, sizeof(*shadow));
1229 shadow->ino = inode->i_ino;
1230 shadow->bix = bix;
1231 shadow->gc_level = expand_level(inode->i_ino, level);
1232 shadow->old_ofs = old_ofs & ~LOGFS_FULLY_POPULATED;
1233 return shadow;
1234}
1235
1236static void free_shadow(struct inode *inode, struct logfs_shadow *shadow)
1237{
1238 struct logfs_super *super = logfs_super(inode->i_sb);
1239
1240 mempool_free(shadow, super->s_shadow_pool);
1241}
1242
/**
 * fill_shadow_tree - Propagate shadow tree changes due to a write
 * @inode:	Inode owning the page
 * @page:	Struct page that was written
 * @shadow:	Shadow for the current write
 *
 * Writes in logfs can result in two semi-valid objects.  The old object
 * is still valid as long as it can be reached by following pointers on
 * the medium.  Only when writes propagate all the way up to the journal
 * has the new object safely replaced the old one.
 *
 * To handle this problem, a struct logfs_shadow is used to represent
 * every single write.  It is attached to the indirect block, which is
 * marked dirty.  When the indirect block is written, its shadows are
 * handed up to the next indirect block (or inode).  Ultimately they
 * will reach the master inode and be freed upon journal commit.
 *
 * This function handles a single step in the propagation.  It adds the
 * shadow for the current write to the tree, along with any shadows in
 * the page's tree, in case it was an indirect block.  If a page is
 * written, the inode parameter is left NULL, if an inode is written,
 * the page parameter is left NULL.
 */
static void fill_shadow_tree(struct inode *inode, struct page *page,
		struct logfs_shadow *shadow)
{
	struct logfs_super *super = logfs_super(inode->i_sb);
	struct logfs_block *block = logfs_block(page);
	struct shadow_tree *tree = &super->s_shadow_tree;

	if (PagePrivate(page)) {
		/* The block's aliases are obsolete now that it is written. */
		if (block->alias_map)
			super->s_no_object_aliases -= bitmap_weight(
					block->alias_map, LOGFS_BLOCK_FACTOR);
		logfs_handle_transaction(inode, block->ta);
		block->ops->free_block(inode->i_sb, block);
	}
	if (shadow) {
		/* Rewrites go into the "old" tree, fresh writes into "new". */
		if (shadow->old_ofs)
			btree_insert64(&tree->old, shadow->old_ofs, shadow,
					GFP_NOFS);
		else
			btree_insert64(&tree->new, shadow->new_ofs, shadow,
					GFP_NOFS);

		super->s_dirty_used_bytes += shadow->new_len;
		super->s_dirty_free_bytes += shadow->old_len;
	}
}
1292
1293static void logfs_set_alias(struct super_block *sb, struct logfs_block *block,
1294 long child_no)
1295{
1296 struct logfs_super *super = logfs_super(sb);
1297
1298 if (block->inode && block->inode->i_ino == LOGFS_INO_MASTER) {
1299 /* Aliases in the master inode are pointless. */
1300 return;
1301 }
1302
1303 if (!test_bit(child_no, block->alias_map)) {
1304 set_bit(child_no, block->alias_map);
1305 super->s_no_object_aliases++;
1306 }
1307 list_move_tail(&block->alias_list, &super->s_object_alias);
1308}
1309
/*
 * Object aliases can and often do change the size and occupied space of a
 * file.  So not only do we have to change the pointers, we also have to
 * change inode->i_size and li->li_used_bytes.  Which is done by setting
 * another two object aliases for the inode itself.
 */
static void set_iused(struct inode *inode, struct logfs_shadow *shadow)
{
	struct logfs_inode *li = logfs_inode(inode);

	/* Nothing changed in space accounting - nothing to do. */
	if (shadow->new_len == shadow->old_len)
		return;

	alloc_inode_block(inode);
	li->li_used_bytes += shadow->new_len - shadow->old_len;
	__logfs_set_blocks(inode);
	logfs_set_alias(inode->i_sb, li->li_block, INODE_USED_OFS);
	logfs_set_alias(inode->i_sb, li->li_block, INODE_SIZE_OFS);
}
1329
/*
 * Write (and/or delete) a single block at the bottom of a recursion
 * step.  On success wc->ofs is updated to the block's new offset, with
 * LOGFS_FULLY_POPULATED set when an indirect block became full.
 */
static int logfs_write_i0(struct inode *inode, struct page *page,
		struct write_control *wc)
{
	struct logfs_shadow *shadow;
	u64 bix;
	level_t level;
	int full, err = 0;

	logfs_unpack_index(page->index, &bix, &level);
	/* Block does not exist yet - make sure space can be allocated. */
	if (wc->ofs == 0)
		if (logfs_reserve_blocks(inode, 1))
			return -ENOSPC;

	shadow = alloc_shadow(inode, bix, level, wc->ofs);
	if (wc->flags & WF_WRITE)
		err = logfs_segment_write(inode, page, shadow);
	if (wc->flags & WF_DELETE)
		logfs_segment_delete(inode, shadow);
	if (err) {
		free_shadow(inode, shadow);
		return err;
	}

	set_iused(inode, shadow);
	full = 1;
	if (level != 0) {
		/* Indirect blocks carry fullness counters. */
		alloc_indirect_block(inode, page, 0);
		full = logfs_block(page)->full == LOGFS_BLOCK_FACTOR;
	}
	fill_shadow_tree(inode, page, shadow);
	wc->ofs = shadow->new_ofs;
	if (wc->ofs && full)
		wc->ofs |= LOGFS_FULLY_POPULATED;
	return 0;
}
1365
1366static int logfs_write_direct(struct inode *inode, struct page *page,
1367 long flags)
1368{
1369 struct logfs_inode *li = logfs_inode(inode);
1370 struct write_control wc = {
1371 .ofs = li->li_data[page->index],
1372 .flags = flags,
1373 };
1374 int err;
1375
1376 alloc_inode_block(inode);
1377
1378 err = logfs_write_i0(inode, page, &wc);
1379 if (err)
1380 return err;
1381
1382 li->li_data[page->index] = wc.ofs;
1383 logfs_set_alias(inode->i_sb, li->li_block,
1384 page->index + INODE_POINTER_OFS);
1385 return 0;
1386}
1387
1388static int ptr_change(u64 ofs, struct page *page)
1389{
1390 struct logfs_block *block = logfs_block(page);
1391 int empty0, empty1, full0, full1;
1392
1393 empty0 = ofs == 0;
1394 empty1 = block->partial == 0;
1395 if (empty0 != empty1)
1396 return 1;
1397
1398 /* The !! is necessary to shrink result to int */
1399 full0 = !!(ofs & LOGFS_FULLY_POPULATED);
1400 full1 = block->full == LOGFS_BLOCK_FACTOR;
1401 if (full0 != full1)
1402 return 1;
1403 return 0;
1404}
1405
/*
 * One recursion step of a write: load (or create) the indirect block at
 * @level covering @bix, recurse (or write the child directly), store the
 * child's new offset and write the indirect block itself if its parent
 * pointer would change - otherwise just record an alias.
 */
static int __logfs_write_rec(struct inode *inode, struct page *page,
		struct write_control *this_wc,
		pgoff_t bix, level_t target_level, level_t level)
{
	int ret, page_empty = 0;
	int child_no = get_bits(bix, SUBLEVEL(level));
	struct page *ipage;
	struct write_control child_wc = {
		.flags = this_wc->flags,
	};

	ipage = logfs_get_write_page(inode, bix, level);
	if (!ipage)
		return -ENOMEM;

	if (this_wc->ofs) {
		/* Indirect block exists on the medium - read it in. */
		ret = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level);
		if (ret)
			goto out;
	} else if (!PageUptodate(ipage)) {
		/* Brand-new indirect block - start from zeroes. */
		page_empty = 1;
		logfs_read_empty(ipage);
	}

	child_wc.ofs = block_get_pointer(ipage, child_no);

	if ((__force u8)level-1 > (__force u8)target_level)
		ret = __logfs_write_rec(inode, page, &child_wc, bix,
				target_level, SUBLEVEL(level));
	else
		ret = logfs_write_i0(inode, page, &child_wc);

	if (ret)
		goto out;

	alloc_indirect_block(inode, ipage, page_empty);
	block_set_pointer(ipage, child_no, child_wc.ofs);
	/* FIXME: first condition seems superfluous */
	if (child_wc.ofs || logfs_block(ipage)->partial)
		this_wc->flags |= WF_WRITE;
	/* the condition on this_wc->ofs ensures that we won't consume extra
	 * space for indirect blocks in the future, which we cannot reserve */
	if (!this_wc->ofs || ptr_change(this_wc->ofs, ipage))
		ret = logfs_write_i0(inode, ipage, this_wc);
	else
		logfs_set_alias(inode->i_sb, logfs_block(ipage), child_no);
out:
	logfs_put_write_page(ipage);
	return ret;
}
1456
/*
 * Top of the recursive write path: start from the inode's indirect
 * pointer and update it if the write moved the tree root.
 */
static int logfs_write_rec(struct inode *inode, struct page *page,
		pgoff_t bix, level_t target_level, long flags)
{
	struct logfs_inode *li = logfs_inode(inode);
	struct write_control wc = {
		.ofs = li->li_data[INDIRECT_INDEX],
		.flags = flags,
	};
	int ret;

	alloc_inode_block(inode);

	if (li->li_height > (__force u8)target_level)
		ret = __logfs_write_rec(inode, page, &wc, bix, target_level,
				LEVEL(li->li_height));
	else
		ret = logfs_write_i0(inode, page, &wc);
	if (ret)
		return ret;

	if (li->li_data[INDIRECT_INDEX] != wc.ofs) {
		/* Tree root moved - record the new offset as an alias. */
		li->li_data[INDIRECT_INDEX] = wc.ofs;
		logfs_set_alias(inode->i_sb, li->li_block,
				INDIRECT_INDEX + INODE_POINTER_OFS);
	}
	return ret;
}
1484
/* Attach a transaction to the inode's block, allocating the block first. */
void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta)
{
	alloc_inode_block(inode);
	logfs_inode(inode)->li_block->ta = ta;
}
1490
/* Drop any transaction attached to the inode's block.
 * NOTE(review): @ta is not compared against block->ta - presumably every
 * caller passes the transaction it previously added; verify callers. */
void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta)
{
	struct logfs_block *block = logfs_inode(inode)->li_block;

	if (block && block->ta)
		block->ta = NULL;
}
1498
/*
 * Grow the inode's indirect tree until it is at least @level high and
 * covers @bix: repeatedly insert a new root whose slot 0 points to the
 * old root.
 */
static int grow_inode(struct inode *inode, u64 bix, level_t level)
{
	struct logfs_inode *li = logfs_inode(inode);
	u8 height = (__force u8)level;
	struct page *page;
	struct write_control wc = {
		.flags = WF_WRITE,
	};
	int err;

	/* Trees never exceed 5 levels. */
	BUG_ON(height > 5 || li->li_height > 5);
	while (height > li->li_height || bix >= maxbix(li->li_height)) {
		/* NOTE(review): I0_BLOCKS + 1 appears to be a scratch index
		 * for the new root page - confirm against
		 * logfs_get_write_page() semantics. */
		page = logfs_get_write_page(inode, I0_BLOCKS + 1,
				LEVEL(li->li_height + 1));
		if (!page)
			return -ENOMEM;
		logfs_read_empty(page);
		alloc_indirect_block(inode, page, 1);
		/* Old root becomes child 0 of the new root. */
		block_set_pointer(page, 0, li->li_data[INDIRECT_INDEX]);
		err = logfs_write_i0(inode, page, &wc);
		logfs_put_write_page(page);
		if (err)
			return err;
		li->li_data[INDIRECT_INDEX] = wc.ofs;
		wc.ofs = 0;
		li->li_height++;
		logfs_set_alias(inode->i_sb, li->li_block, INODE_HEIGHT_OFS);
	}
	return 0;
}
1529
/*
 * Write one page, growing the tree first if necessary.  Any per-page
 * space reservation is consumed here.  Caller holds the write lock.
 */
static int __logfs_write_buf(struct inode *inode, struct page *page, long flags)
{
	struct logfs_super *super = logfs_super(inode->i_sb);
	pgoff_t index = page->index;
	u64 bix;
	level_t level;
	int err;

	flags |= WF_WRITE | WF_DELETE;
	inode->i_ctime = inode->i_mtime = CURRENT_TIME;

	logfs_unpack_index(index, &bix, &level);
	/* Release the reservation taken by get_page_reserve(). */
	if (logfs_block(page) && logfs_block(page)->reserved_bytes)
		super->s_dirty_pages -= logfs_block(page)->reserved_bytes;

	if (index < I0_BLOCKS)
		return logfs_write_direct(inode, page, flags);

	bix = adjust_bix(bix, level);
	err = grow_inode(inode, bix, level);
	if (err)
		return err;
	return logfs_write_rec(inode, page, bix, level, flags);
}
1554
1555int logfs_write_buf(struct inode *inode, struct page *page, long flags)
1556{
1557 struct super_block *sb = inode->i_sb;
1558 int ret;
1559
1560 logfs_get_wblocks(sb, page, flags & WF_LOCK);
1561 ret = __logfs_write_buf(inode, page, flags);
1562 logfs_put_wblocks(sb, page, flags & WF_LOCK);
1563 return ret;
1564}
1565
1566static int __logfs_delete(struct inode *inode, struct page *page)
1567{
1568 long flags = WF_DELETE;
1569
1570 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1571
1572 if (page->index < I0_BLOCKS)
1573 return logfs_write_direct(inode, page, flags);
1574 return logfs_write_rec(inode, page, page->index, 0, flags);
1575}
1576
/*
 * Delete the block at @index of @inode under the write lock.
 * NOTE(review): @shadow_tree is currently unused here - confirm callers.
 */
int logfs_delete(struct inode *inode, pgoff_t index,
		struct shadow_tree *shadow_tree)
{
	struct super_block *sb = inode->i_sb;
	struct page *page;
	int ret;

	page = logfs_get_read_page(inode, index, 0);
	if (!page)
		return -ENOMEM;

	logfs_get_wblocks(sb, page, 1);
	ret = __logfs_delete(inode, page);
	logfs_put_wblocks(sb, page, 1);

	logfs_put_read_page(page);

	return ret;
}
1596
/* Rewrite cannot mark the inode dirty but has to write it immediately.
 * Used by GC to move a still-valid block to a new segment. */
int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
		gc_level_t gc_level, long flags)
{
	level_t level = shrink_level(gc_level);
	struct page *page;
	int err;

	page = logfs_get_write_page(inode, bix, level);
	if (!page)
		return -ENOMEM;

	/* Read the old copy, then write it back through the normal path. */
	err = logfs_segment_read(inode, page, ofs, bix, level);
	if (!err) {
		if (level != 0)
			alloc_indirect_block(inode, page, 0);
		err = logfs_write_buf(inode, page, flags);
	}
	logfs_put_write_page(page);
	return err;
}
1618
/*
 * truncate_data_block - zero the tail of a partially truncated data block
 *
 * If the new file size @size cuts through this page, the bytes past the
 * new end-of-file are zeroed and the block is rewritten.  Blocks wholly
 * before or after the cut-off are left alone (returns 0).
 */
static int truncate_data_block(struct inode *inode, struct page *page,
		u64 ofs, struct logfs_shadow *shadow, u64 size)
{
	loff_t pageofs = page->index << inode->i_sb->s_blocksize_bits;
	u64 bix;
	level_t level;
	int err;

	/* Does truncation happen within this page? */
	if (size <= pageofs || size - pageofs >= PAGE_SIZE)
		return 0;

	logfs_unpack_index(page->index, &bix, &level);
	BUG_ON(level != 0);	/* only data blocks reach this function */

	err = logfs_segment_read(inode, page, ofs, bix, level);
	if (err)
		return err;

	/* zero from the new EOF to the end of the page */
	zero_user_segment(page, size - pageofs, PAGE_CACHE_SIZE);
	return logfs_segment_write(inode, page, shadow);
}
1641
/*
 * logfs_truncate_i0 - truncate a single level-0 (data) block
 *
 * Rewrites the partial block at the truncation point (if any), deletes
 * the old on-medium object and returns the new offset through @wc->ofs.
 */
static int logfs_truncate_i0(struct inode *inode, struct page *page,
		struct write_control *wc, u64 size)
{
	struct logfs_shadow *shadow;
	u64 bix;
	level_t level;
	int err = 0;

	logfs_unpack_index(page->index, &bix, &level);
	BUG_ON(level != 0);
	shadow = alloc_shadow(inode, bix, level, wc->ofs);

	err = truncate_data_block(inode, page, wc->ofs, shadow, size);
	if (err) {
		free_shadow(inode, shadow);
		return err;
	}

	logfs_segment_delete(inode, shadow);
	set_iused(inode, shadow);
	fill_shadow_tree(inode, page, shadow);
	wc->ofs = shadow->new_ofs;	/* zero unless the block was rewritten */
	return 0;
}
1666
/*
 * logfs_truncate_direct - truncate the embedded (direct) blocks
 *
 * Walks the direct pointers in the inode from the back and truncates
 * every block at or behind the new size, stopping at the first block
 * that lies entirely below it.
 */
static int logfs_truncate_direct(struct inode *inode, u64 size)
{
	struct logfs_inode *li = logfs_inode(inode);
	struct write_control wc;
	struct page *page;
	int e;
	int err;

	alloc_inode_block(inode);

	for (e = I0_BLOCKS - 1; e >= 0; e--) {
		/* block e lies entirely below the new size - done */
		if (size > (e+1) * LOGFS_BLOCKSIZE)
			break;

		wc.ofs = li->li_data[e];
		if (!wc.ofs)
			continue;	/* hole */

		page = logfs_get_write_page(inode, e, 0);
		if (!page)
			return -ENOMEM;
		err = logfs_segment_read(inode, page, wc.ofs, e, 0);
		if (err) {
			logfs_put_write_page(page);
			return err;
		}
		err = logfs_truncate_i0(inode, page, &wc, size);
		logfs_put_write_page(page);
		if (err)
			return err;

		li->li_data[e] = wc.ofs;
	}
	return 0;
}
1702
/* FIXME: these need to become per-sb once we support different blocksizes */

/* Data blocks covered by one pointer at each indirection level */
static u64 __logfs_step[] = {
	1,
	I1_BLOCKS,
	I2_BLOCKS,
	I3_BLOCKS,
};

/* First block index handled at each indirection level */
static u64 __logfs_start_index[] = {
	I0_BLOCKS,
	I1_BLOCKS,
	I2_BLOCKS,
	I3_BLOCKS
};

/* Blocks spanned by a single child pointer at @level */
static inline u64 logfs_step(level_t level)
{
	return __logfs_step[(__force u8)level];
}

/* Bytes addressable by a tree of height @level */
static inline u64 logfs_factor(u8 level)
{
	return __logfs_step[level] * LOGFS_BLOCKSIZE;
}

static inline u64 logfs_start_index(level_t level)
{
	return __logfs_start_index[(__force u8)level];
}

/*
 * Like logfs_unpack_index(), but normalizes small indices to a bix of 0
 * so that indirect blocks compare correctly by position.
 */
static void logfs_unpack_raw_index(pgoff_t index, u64 *bix, level_t *level)
{
	logfs_unpack_index(index, bix, level);
	if (*bix <= logfs_start_index(SUBLEVEL(*level)))
		*bix = 0;
}
1739
1740static int __logfs_truncate_rec(struct inode *inode, struct page *ipage,
1741 struct write_control *this_wc, u64 size)
1742{
1743 int truncate_happened = 0;
1744 int e, err = 0;
1745 u64 bix, child_bix, next_bix;
1746 level_t level;
1747 struct page *page;
1748 struct write_control child_wc = { /* FIXME: flags */ };
1749
1750 logfs_unpack_raw_index(ipage->index, &bix, &level);
1751 err = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level);
1752 if (err)
1753 return err;
1754
1755 for (e = LOGFS_BLOCK_FACTOR - 1; e >= 0; e--) {
1756 child_bix = bix + e * logfs_step(SUBLEVEL(level));
1757 next_bix = child_bix + logfs_step(SUBLEVEL(level));
1758 if (size > next_bix * LOGFS_BLOCKSIZE)
1759 break;
1760
1761 child_wc.ofs = pure_ofs(block_get_pointer(ipage, e));
1762 if (!child_wc.ofs)
1763 continue;
1764
1765 page = logfs_get_write_page(inode, child_bix, SUBLEVEL(level));
1766 if (!page)
1767 return -ENOMEM;
1768
1769 if ((__force u8)level > 1)
1770 err = __logfs_truncate_rec(inode, page, &child_wc, size);
1771 else
1772 err = logfs_truncate_i0(inode, page, &child_wc, size);
1773 logfs_put_write_page(page);
1774 if (err)
1775 return err;
1776
1777 truncate_happened = 1;
1778 alloc_indirect_block(inode, ipage, 0);
1779 block_set_pointer(ipage, e, child_wc.ofs);
1780 }
1781
1782 if (!truncate_happened) {
1783 printk("ineffectual truncate (%lx, %lx, %llx)\n", inode->i_ino, ipage->index, size);
1784 return 0;
1785 }
1786
1787 this_wc->flags = WF_DELETE;
1788 if (logfs_block(ipage)->partial)
1789 this_wc->flags |= WF_WRITE;
1790
1791 return logfs_write_i0(inode, ipage, this_wc);
1792}
1793
/*
 * logfs_truncate_rec - truncate everything behind the indirect pointer
 *
 * No-op for files without an indirect block; otherwise recurses from the
 * top of the tree and stores the (possibly changed) top-level offset
 * back into the inode.
 */
static int logfs_truncate_rec(struct inode *inode, u64 size)
{
	struct logfs_inode *li = logfs_inode(inode);
	struct write_control wc = {
		.ofs = li->li_data[INDIRECT_INDEX],
	};
	struct page *page;
	int err;

	alloc_inode_block(inode);

	if (!wc.ofs)
		return 0;

	page = logfs_get_write_page(inode, 0, LEVEL(li->li_height));
	if (!page)
		return -ENOMEM;

	err = __logfs_truncate_rec(inode, page, &wc, size);
	logfs_put_write_page(page);
	if (err)
		return err;

	/* only touch the inode when the pointer actually changed */
	if (li->li_data[INDIRECT_INDEX] != wc.ofs)
		li->li_data[INDIRECT_INDEX] = wc.ofs;
	return 0;
}
1821
1822static int __logfs_truncate(struct inode *inode, u64 size)
1823{
1824 int ret;
1825
1826 if (size >= logfs_factor(logfs_inode(inode)->li_height))
1827 return 0;
1828
1829 ret = logfs_truncate_rec(inode, size);
1830 if (ret)
1831 return ret;
1832
1833 return logfs_truncate_direct(inode, size);
1834}
1835
/*
 * logfs_truncate - truncate a file to @size bytes
 *
 * Performs the on-medium truncation and the inode write under the
 * wblock lock, then trims the page cache via vmtruncate().
 */
int logfs_truncate(struct inode *inode, u64 size)
{
	struct super_block *sb = inode->i_sb;
	int err;

	logfs_get_wblocks(sb, NULL, 1);
	err = __logfs_truncate(inode, size);
	if (!err)
		err = __logfs_write_inode(inode, 0);
	logfs_put_wblocks(sb, NULL, 1);

	if (!err)
		err = vmtruncate(inode, size);

	/* I don't trust error recovery yet. */
	WARN_ON(err);
	return err;
}
1854
/*
 * move_page_to_inode - re-parent a logfs_block from its page to the inode
 *
 * When the page backing an inode is dropped, its logfs_block (carrying
 * pending aliases) must survive attached to the in-memory inode.
 */
static void move_page_to_inode(struct inode *inode, struct page *page)
{
	struct logfs_inode *li = logfs_inode(inode);
	struct logfs_block *block = logfs_block(page);

	if (!block)
		return;

	log_blockmove("move_page_to_inode(%llx, %llx, %x)\n",
			block->ino, block->bix, block->level);
	BUG_ON(li->li_block);	/* inode must not own a block already */
	block->ops = &inode_block_ops;
	block->inode = inode;
	li->li_block = block;

	/* sever the page's reference to the block */
	block->page = NULL;
	page->private = 0;
	ClearPagePrivate(page);
}
1874
/*
 * move_inode_to_page - re-parent a logfs_block from the inode to a page
 *
 * Inverse of move_page_to_inode(): attaches the inode's block (and its
 * pending aliases) to the ifile page that now holds the inode data.
 */
static void move_inode_to_page(struct page *page, struct inode *inode)
{
	struct logfs_inode *li = logfs_inode(inode);
	struct logfs_block *block = li->li_block;

	if (!block)
		return;

	log_blockmove("move_inode_to_page(%llx, %llx, %x)\n",
			block->ino, block->bix, block->level);
	BUG_ON(PagePrivate(page));	/* page must not own a block already */
	block->ops = &indirect_block_ops;
	block->page = page;
	page->private = (unsigned long)block;
	SetPagePrivate(page);

	/* sever the inode's reference to the block */
	block->inode = NULL;
	li->li_block = NULL;
}
1894
/*
 * logfs_read_inode - read an inode from the master inode (ifile)
 *
 * Returns -ENODATA if the inode does not exist on the medium, 0 on
 * success, or a negative errno from the read path.
 */
int logfs_read_inode(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	struct logfs_super *super = logfs_super(sb);
	struct inode *master_inode = super->s_master_inode;
	struct page *page;
	struct logfs_disk_inode *di;
	u64 ino = inode->i_ino;

	/* beyond the ifile's end, or a hole: the inode does not exist */
	if (ino << sb->s_blocksize_bits > i_size_read(master_inode))
		return -ENODATA;
	if (!logfs_exist_block(master_inode, ino))
		return -ENODATA;

	page = read_cache_page(master_inode->i_mapping, ino,
			(filler_t *)logfs_readpage, NULL);
	if (IS_ERR(page))
		return PTR_ERR(page);

	di = kmap_atomic(page, KM_USER0);
	logfs_disk_to_inode(di, inode);
	kunmap_atomic(di, KM_USER0);
	/* any aliases on the page now belong to the in-memory inode */
	move_page_to_inode(inode, page);
	page_cache_release(page);
	return 0;
}
1921
/*
 * inode_to_page - serialize an in-memory inode into its ifile page
 *
 * Returns the write page holding the on-disk inode, or NULL on
 * allocation failure.  Caller must logfs_put_write_page(page);
 */
static struct page *inode_to_page(struct inode *inode)
{
	struct inode *master_inode = logfs_super(inode->i_sb)->s_master_inode;
	struct logfs_disk_inode *di;
	struct page *page;

	BUG_ON(inode->i_ino == LOGFS_INO_MASTER);

	page = logfs_get_write_page(master_inode, inode->i_ino, 0);
	if (!page)
		return NULL;

	di = kmap_atomic(page, KM_USER0);
	logfs_inode_to_disk(inode, di);
	kunmap_atomic(di, KM_USER0);
	/* aliases move back from the inode to the page */
	move_inode_to_page(page, inode);
	return page;
}
1941
/* Cheaper version of write_inode.  All changes are concealed in
 * aliases, which are moved back.  No write to the medium happens.
 */
void logfs_clear_inode(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	struct logfs_inode *li = logfs_inode(inode);
	struct logfs_block *block = li->li_block;
	struct page *page;

	/* Only deleted files may be dirty at this point */
	BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink);
	if (!block)
		return;
	if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) {
		/* filesystem is going down - just drop the block */
		block->ops->free_block(inode->i_sb, block);
		return;
	}

	/* reserved inodes are never stored in the ifile */
	BUG_ON(inode->i_ino < LOGFS_RESERVED_INOS);
	page = inode_to_page(inode);
	BUG_ON(!page); /* FIXME: Use emergency page */
	logfs_put_write_page(page);
}
1966
/*
 * do_write_inode - write an inode through the ifile
 *
 * Grows the ifile if necessary, serializes the inode into its page and
 * writes that page out.  Returns 0 or a negative errno.
 */
static int do_write_inode(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	struct inode *master_inode = logfs_super(sb)->s_master_inode;
	loff_t size = (inode->i_ino + 1) << inode->i_sb->s_blocksize_bits;
	struct page *page;
	int err;

	BUG_ON(inode->i_ino == LOGFS_INO_MASTER);
	/* FIXME: lock inode */

	/* make sure the ifile covers this inode's slot */
	if (i_size_read(master_inode) < size)
		i_size_write(master_inode, size);

	/* TODO: Tell vfs this inode is clean now */

	page = inode_to_page(inode);
	if (!page)
		return -ENOMEM;

	/* FIXME: transaction is part of logfs_block now. Is that enough? */
	err = logfs_write_buf(master_inode, page, 0);
	logfs_put_write_page(page);
	return err;
}
1992
/*
 * logfs_mod_segment_entry - read or modify one segment file entry
 * @sb: superblock
 * @segno: segment number of the entry
 * @write: nonzero if @change_se modifies the entry
 * @change_se: callback applied to the entry
 * @arg: opaque argument passed through to @change_se
 *
 * Locates the segment file page holding the entry for @segno and applies
 * @change_se to it.  For writes, the block is prepared for alias
 * tracking and the child entry is marked.
 */
static void logfs_mod_segment_entry(struct super_block *sb, u32 segno,
		int write,
		void (*change_se)(struct logfs_segment_entry *, long),
		long arg)
{
	struct logfs_super *super = logfs_super(sb);
	struct inode *inode;
	struct page *page;
	struct logfs_segment_entry *se;
	pgoff_t page_no;
	int child_no;

	/* entries are 8 bytes each (>>3), blocksize/8 entries per page */
	page_no = segno >> (sb->s_blocksize_bits - 3);
	child_no = segno & ((sb->s_blocksize >> 3) - 1);

	inode = super->s_segfile_inode;
	page = logfs_get_write_page(inode, page_no, 0);
	BUG_ON(!page); /* FIXME: We need some reserve page for this case */
	if (!PageUptodate(page))
		logfs_read_block(inode, page, WRITE);

	if (write)
		alloc_indirect_block(inode, page, 0);
	se = kmap_atomic(page, KM_USER0);
	change_se(se + child_no, arg);
	if (write) {
		logfs_set_alias(sb, logfs_block(page), child_no);
		/* valid byte count may never exceed the segment size */
		BUG_ON((int)be32_to_cpu(se[child_no].valid) > super->s_segsize);
	}
	kunmap_atomic(se, KM_USER0);

	logfs_put_write_page(page);
}
2026
/* change_se callback: copy the entry into the target passed via @_target */
static void __get_segment_entry(struct logfs_segment_entry *se, long _target)
{
	struct logfs_segment_entry *target = (void *)_target;

	*target = *se;
}

/* Read the segment entry for @segno into @se */
void logfs_get_segment_entry(struct super_block *sb, u32 segno,
		struct logfs_segment_entry *se)
{
	logfs_mod_segment_entry(sb, segno, 0, __get_segment_entry, (long)se);
}

/* change_se callback: add @increment to the valid-bytes count */
static void __set_segment_used(struct logfs_segment_entry *se, long increment)
{
	u32 valid;

	valid = be32_to_cpu(se->valid);
	valid += increment;
	se->valid = cpu_to_be32(valid);
}

/* Adjust the valid-bytes count of the segment containing @ofs */
void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment)
{
	struct logfs_super *super = logfs_super(sb);
	u32 segno = ofs >> super->s_segshift;

	if (!increment)
		return;

	logfs_mod_segment_entry(sb, segno, 1, __set_segment_used, increment);
}

/* change_se callback: store the combined erase count / gc level word */
static void __set_segment_erased(struct logfs_segment_entry *se, long ec_level)
{
	se->ec_level = cpu_to_be32(ec_level);
}

/* Record erase count and gc level for a freshly erased segment */
void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec,
		gc_level_t gc_level)
{
	/* low 4 bits carry the gc level, the rest the erase count */
	u32 ec_level = ec << 4 | (__force u8)gc_level;

	logfs_mod_segment_entry(sb, segno, 1, __set_segment_erased, ec_level);
}

/* change_se callback: mark the segment as reserved */
static void __set_segment_reserved(struct logfs_segment_entry *se, long ignore)
{
	se->valid = cpu_to_be32(RESERVED);
}

/* Mark segment @segno as reserved */
void logfs_set_segment_reserved(struct super_block *sb, u32 segno)
{
	logfs_mod_segment_entry(sb, segno, 1, __set_segment_reserved, 0);
}

/* change_se callback: clear the reservation, keeping the erase count */
static void __set_segment_unreserved(struct logfs_segment_entry *se,
		long ec_level)
{
	se->valid = 0;
	se->ec_level = cpu_to_be32(ec_level);
}

/* Return segment @segno to normal use with erase count @ec */
void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec)
{
	u32 ec_level = ec << 4;

	logfs_mod_segment_entry(sb, segno, 1, __set_segment_unreserved,
			ec_level);
}
2097
2098int __logfs_write_inode(struct inode *inode, long flags)
2099{
2100 struct super_block *sb = inode->i_sb;
2101 int ret;
2102
2103 logfs_get_wblocks(sb, NULL, flags & WF_LOCK);
2104 ret = do_write_inode(inode);
2105 logfs_put_wblocks(sb, NULL, flags & WF_LOCK);
2106 return ret;
2107}
2108
/*
 * do_delete_inode - remove an inode's entry from the ifile
 *
 * Moves the inode's alias block onto the ifile page and deletes that
 * page under the wblock lock.  Returns 0 or a negative errno.
 */
static int do_delete_inode(struct inode *inode)
{
	struct super_block *sb = inode->i_sb;
	struct inode *master_inode = logfs_super(sb)->s_master_inode;
	struct page *page;
	int ret;

	page = logfs_get_write_page(master_inode, inode->i_ino, 0);
	if (!page)
		return -ENOMEM;

	move_inode_to_page(page, inode);

	logfs_get_wblocks(sb, page, 1);
	ret = __logfs_delete(master_inode, page);
	logfs_put_wblocks(sb, page, 1);

	logfs_put_write_page(page);
	return ret;
}
2129
2130/*
2131 * ZOMBIE inodes have already been deleted before and should remain dead,
2132 * if it weren't for valid checking. No need to kill them again here.
2133 */
2134void logfs_delete_inode(struct inode *inode)
2135{
2136 struct logfs_inode *li = logfs_inode(inode);
2137
2138 if (!(li->li_flags & LOGFS_IF_ZOMBIE)) {
2139 li->li_flags |= LOGFS_IF_ZOMBIE;
2140 if (i_size_read(inode) > 0)
2141 logfs_truncate(inode, 0);
2142 do_delete_inode(inode);
2143 }
2144 truncate_inode_pages(&inode->i_data, 0);
2145 clear_inode(inode);
2146}
2147
/*
 * btree_write_block - write back a block whose aliases live in the btree
 *
 * Reads the block's page back in (which re-attaches the aliases), writes
 * it out and drops the inode reference again.  Any failure here is fatal.
 */
void btree_write_block(struct logfs_block *block)
{
	struct inode *inode;
	struct page *page;
	int err, cookie;

	inode = logfs_safe_iget(block->sb, block->ino, &cookie);
	page = logfs_get_write_page(inode, block->bix, block->level);

	err = logfs_readpage_nolock(page);
	BUG_ON(err);
	BUG_ON(!PagePrivate(page));
	/* the read must have re-attached exactly this block to the page */
	BUG_ON(logfs_block(page) != block);
	err = __logfs_write_buf(inode, page, 0);
	BUG_ON(err);
	/* after the write the page must no longer own a block */
	BUG_ON(PagePrivate(page) || page->private);

	logfs_put_write_page(page);
	logfs_safe_iput(inode, cookie);
}
2168
2169/**
2170 * logfs_inode_write - write inode or dentry objects
2171 *
2172 * @inode: parent inode (ifile or directory)
2173 * @buf: object to write (inode or dentry)
2174 * @n: object size
2175 * @_pos: object number (file position in blocks/objects)
2176 * @flags: write flags
2177 * @lock: 0 if write lock is already taken, 1 otherwise
2178 * @shadow_tree: shadow below this inode
2179 *
2180 * FIXME: All caller of this put a 200-300 byte variable on the stack,
2181 * only to call here and do a memcpy from that stack variable. A good
2182 * example of wasted performance and stack space.
2183 */
2184int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
2185 loff_t bix, long flags, struct shadow_tree *shadow_tree)
2186{
2187 loff_t pos = bix << inode->i_sb->s_blocksize_bits;
2188 int err;
2189 struct page *page;
2190 void *pagebuf;
2191
2192 BUG_ON(pos & (LOGFS_BLOCKSIZE-1));
2193 BUG_ON(count > LOGFS_BLOCKSIZE);
2194 page = logfs_get_write_page(inode, bix, 0);
2195 if (!page)
2196 return -ENOMEM;
2197
2198 pagebuf = kmap_atomic(page, KM_USER0);
2199 memcpy(pagebuf, buf, count);
2200 flush_dcache_page(page);
2201 kunmap_atomic(pagebuf, KM_USER0);
2202
2203 if (i_size_read(inode) < pos + LOGFS_BLOCKSIZE)
2204 i_size_write(inode, pos + LOGFS_BLOCKSIZE);
2205
2206 err = logfs_write_buf(inode, page, flags);
2207 logfs_put_write_page(page);
2208 return err;
2209}
2210
/* Open the segment file meta inode and cache it in the super block. */
int logfs_open_segfile(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct inode *inode;

	inode = logfs_read_meta_inode(sb, LOGFS_INO_SEGFILE);
	if (IS_ERR(inode))
		return PTR_ERR(inode);
	super->s_segfile_inode = inode;
	return 0;
}
2222
2223int logfs_init_rw(struct super_block *sb)
2224{
2225 struct logfs_super *super = logfs_super(sb);
2226 int min_fill = 3 * super->s_no_blocks;
2227
2228 INIT_LIST_HEAD(&super->s_object_alias);
2229 mutex_init(&super->s_write_mutex);
2230 super->s_block_pool = mempool_create_kmalloc_pool(min_fill,
2231 sizeof(struct logfs_block));
2232 super->s_shadow_pool = mempool_create_kmalloc_pool(min_fill,
2233 sizeof(struct logfs_shadow));
2234 return 0;
2235}
2236
/* Tear down the state created by logfs_init_rw(). */
void logfs_cleanup_rw(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);

	destroy_meta_inode(super->s_segfile_inode);
	/* pools may be NULL if logfs_init_rw() failed half-way */
	if (super->s_block_pool)
		mempool_destroy(super->s_block_pool);
	if (super->s_shadow_pool)
		mempool_destroy(super->s_shadow_pool);
}
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
new file mode 100644
index 000000000000..5f58b74516ca
--- /dev/null
+++ b/fs/logfs/segment.c
@@ -0,0 +1,924 @@
1/*
2 * fs/logfs/segment.c - Handling the Object Store
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Object store or ostore makes up the complete device with exception of
9 * the superblock and journal areas. Apart from its own metadata it stores
10 * three kinds of objects: inodes, dentries and blocks, both data and indirect.
11 */
12#include "logfs.h"
13
14static int logfs_mark_segment_bad(struct super_block *sb, u32 segno)
15{
16 struct logfs_super *super = logfs_super(sb);
17 struct btree_head32 *head = &super->s_reserved_segments;
18 int err;
19
20 err = btree_insert32(head, segno, (void *)1, GFP_NOFS);
21 if (err)
22 return err;
23 logfs_super(sb)->s_bad_segments++;
24 /* FIXME: write to journal */
25 return 0;
26}
27
/*
 * logfs_erase_segment - erase one segment on the medium
 *
 * Bumps the global erase counter and delegates to the device backend.
 */
int logfs_erase_segment(struct super_block *sb, u32 segno)
{
	struct logfs_super *super = logfs_super(sb);

	super->s_gec++;

	return super->s_devops->erase(sb, (u64)segno << super->s_segshift,
			super->s_segsize);
}
37
/*
 * logfs_get_free_bytes - allocate @bytes in the area's open segment
 *
 * Opens the area if necessary and returns the device offset of the
 * allocation.  Overrunning the segment is a bug - callers must size
 * their requests correctly.
 */
static s64 logfs_get_free_bytes(struct logfs_area *area, size_t bytes)
{
	s32 ofs;

	logfs_open_area(area, bytes);

	ofs = area->a_used_bytes;
	area->a_used_bytes += bytes;
	BUG_ON(area->a_used_bytes >= logfs_super(area->a_sb)->s_segsize);

	return dev_ofs(area->a_sb, area->a_segno, ofs);
}
50
51static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
52 int use_filler)
53{
54 struct logfs_super *super = logfs_super(sb);
55 struct address_space *mapping = super->s_mapping_inode->i_mapping;
56 filler_t *filler = super->s_devops->readpage;
57 struct page *page;
58
59 BUG_ON(mapping_gfp_mask(mapping) & __GFP_FS);
60 if (use_filler)
61 page = read_cache_page(mapping, index, filler, sb);
62 else {
63 page = find_or_create_page(mapping, index, GFP_NOFS);
64 unlock_page(page);
65 }
66 return page;
67}
68
69void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
70 int use_filler)
71{
72 pgoff_t index = ofs >> PAGE_SHIFT;
73 struct page *page;
74 long offset = ofs & (PAGE_SIZE-1);
75 long copylen;
76
77 /* Only logfs_wbuf_recover may use len==0 */
78 BUG_ON(!len && !use_filler);
79 do {
80 copylen = min((ulong)len, PAGE_SIZE - offset);
81
82 page = get_mapping_page(area->a_sb, index, use_filler);
83 SetPageUptodate(page);
84 BUG_ON(!page); /* FIXME: reserve a pool */
85 memcpy(page_address(page) + offset, buf, copylen);
86 SetPagePrivate(page);
87 page_cache_release(page);
88
89 buf += copylen;
90 len -= copylen;
91 offset = 0;
92 index++;
93 } while (len);
94}
95
/*
 * bdev_writeseg will write full pages. Memset the tail to prevent data leaks.
 */
static void pad_wbuf(struct logfs_area *area, int final)
{
	struct super_block *sb = area->a_sb;
	struct logfs_super *super = logfs_super(sb);
	struct page *page;
	u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
	pgoff_t index = ofs >> PAGE_SHIFT;
	long offset = ofs & (PAGE_SIZE-1);
	u32 len = PAGE_SIZE - offset;

	if (len == PAGE_SIZE) {
		/* The math in this function can surely use some love */
		len = 0;	/* write position is page aligned: no tail */
	}
	if (len) {
		BUG_ON(area->a_used_bytes >= super->s_segsize);

		/* pad the remainder of the current page with 0xff */
		page = get_mapping_page(area->a_sb, index, 0);
		BUG_ON(!page); /* FIXME: reserve a pool */
		memset(page_address(page) + offset, 0xff, len);
		SetPagePrivate(page);
		page_cache_release(page);
	}

	if (!final)
		return;

	/* final flush: pad out every remaining page of the segment */
	area->a_used_bytes += len;
	for ( ; area->a_used_bytes < super->s_segsize;
			area->a_used_bytes += PAGE_SIZE) {
		/* Memset another page */
		index++;
		page = get_mapping_page(area->a_sb, index, 0);
		BUG_ON(!page); /* FIXME: reserve a pool */
		memset(page_address(page), 0xff, PAGE_SIZE);
		SetPagePrivate(page);
		page_cache_release(page);
	}
}
138
/*
 * We have to be careful with the alias tree. Since lookup is done by bix,
 * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with
 * indirect blocks. So always use it through accessor functions.
 */
static void *alias_tree_lookup(struct super_block *sb, u64 ino, u64 bix,
		level_t level)
{
	struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree;
	pgoff_t index = logfs_pack_index(bix, level);

	return btree_lookup128(head, ino, index);
}

/* Counterpart to alias_tree_lookup(): insert a block into the alias tree */
static int alias_tree_insert(struct super_block *sb, u64 ino, u64 bix,
		level_t level, void *val)
{
	struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree;
	pgoff_t index = logfs_pack_index(bix, level);

	return btree_insert128(head, ino, index, val, GFP_NOFS);
}
161
/* write_alias hook for btree-backed blocks: journal each stored alias */
static int btree_write_alias(struct super_block *sb, struct logfs_block *block,
		write_alias_t *write_one_alias)
{
	struct object_alias_item *item;
	int err;

	list_for_each_entry(item, &block->item_list, list) {
		err = write_alias_journal(sb, block->ino, block->bix,
				block->level, item->child_no, item->val);
		if (err)
			return err;
	}
	return 0;
}

/* block_level hook: derive the gc level from inode number and level */
static gc_level_t btree_block_level(struct logfs_block *block)
{
	return expand_level(block->ino, block->level);
}

/* Operations for blocks whose aliases live only in the alias btree */
static struct logfs_block_ops btree_block_ops = {
	.write_block = btree_write_block,
	.block_level = btree_block_level,
	.free_block = __free_block,
	.write_alias = btree_write_alias,
};
188
189int logfs_load_object_aliases(struct super_block *sb,
190 struct logfs_obj_alias *oa, int count)
191{
192 struct logfs_super *super = logfs_super(sb);
193 struct logfs_block *block;
194 struct object_alias_item *item;
195 u64 ino, bix;
196 level_t level;
197 int i, err;
198
199 super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS;
200 count /= sizeof(*oa);
201 for (i = 0; i < count; i++) {
202 item = mempool_alloc(super->s_alias_pool, GFP_NOFS);
203 if (!item)
204 return -ENOMEM;
205 memset(item, 0, sizeof(*item));
206
207 super->s_no_object_aliases++;
208 item->val = oa[i].val;
209 item->child_no = be16_to_cpu(oa[i].child_no);
210
211 ino = be64_to_cpu(oa[i].ino);
212 bix = be64_to_cpu(oa[i].bix);
213 level = LEVEL(oa[i].level);
214
215 log_aliases("logfs_load_object_aliases(%llx, %llx, %x, %x) %llx\n",
216 ino, bix, level, item->child_no,
217 be64_to_cpu(item->val));
218 block = alias_tree_lookup(sb, ino, bix, level);
219 if (!block) {
220 block = __alloc_block(sb, ino, bix, level);
221 block->ops = &btree_block_ops;
222 err = alias_tree_insert(sb, ino, bix, level, block);
223 BUG_ON(err); /* mempool empty */
224 }
225 if (test_and_set_bit(item->child_no, block->alias_map)) {
226 printk(KERN_ERR"LogFS: Alias collision detected\n");
227 return -EIO;
228 }
229 list_move_tail(&block->alias_list, &super->s_object_alias);
230 list_add(&item->list, &block->item_list);
231 }
232 return 0;
233}
234
/*
 * kill_alias - btree visitor: free a block and all its alias items
 *
 * Only the first argument is used; the remaining parameters exist to
 * satisfy the btree visitor signature.
 */
static void kill_alias(void *_block, unsigned long ignore0,
		u64 ignore1, u64 ignore2, size_t ignore3)
{
	struct logfs_block *block = _block;
	struct super_block *sb = block->sb;
	struct logfs_super *super = logfs_super(sb);
	struct object_alias_item *item;

	while (!list_empty(&block->item_list)) {
		item = list_entry(block->item_list.next, typeof(*item), list);
		list_del(&item->list);
		mempool_free(item, super->s_alias_pool);
	}
	block->ops->free_block(sb, block);
}
250
251static int obj_type(struct inode *inode, level_t level)
252{
253 if (level == 0) {
254 if (S_ISDIR(inode->i_mode))
255 return OBJ_DENTRY;
256 if (inode->i_ino == LOGFS_INO_MASTER)
257 return OBJ_INODE;
258 }
259 return OBJ_BLOCK;
260}
261
/* On-medium payload length for each object type */
static int obj_len(struct super_block *sb, int obj_type)
{
	switch (obj_type) {
	case OBJ_DENTRY:
		return sizeof(struct logfs_disk_dentry);
	case OBJ_INODE:
		return sizeof(struct logfs_disk_inode);
	case OBJ_BLOCK:
		return sb->s_blocksize;
	default:
		BUG();	/* unknown object type - corrupted caller state */
	}
}
275
/*
 * __logfs_segment_write - write one object plus header to its area
 * @inode: owning inode
 * @buf: object payload
 * @shadow: shadow entry; new_ofs and new_len are filled in here
 * @type: object type (OBJ_*)
 * @len: payload length as written (possibly compressed)
 * @compr: compression type recorded in the object header
 */
static int __logfs_segment_write(struct inode *inode, void *buf,
		struct logfs_shadow *shadow, int type, int len, int compr)
{
	struct logfs_area *area;
	struct super_block *sb = inode->i_sb;
	s64 ofs;
	struct logfs_object_header h;
	int acc_len;

	/* space accounting uses the full object length except at level 0 */
	if (shadow->gc_level == 0)
		acc_len = len;
	else
		acc_len = obj_len(sb, type);

	area = get_area(sb, shadow->gc_level);
	ofs = logfs_get_free_bytes(area, len + LOGFS_OBJECT_HEADERSIZE);
	LOGFS_BUG_ON(ofs <= 0, sb);
	/*
	 * Order is important.  logfs_get_free_bytes(), by modifying the
	 * segment file, may modify the content of the very page we're about
	 * to write now.  Which is fine, as long as the calculated crc and
	 * written data still match.  So do the modifications _before_
	 * calculating the crc.
	 */

	h.len = cpu_to_be16(len);
	h.type = type;
	h.compr = compr;
	h.ino = cpu_to_be64(inode->i_ino);
	h.bix = cpu_to_be64(shadow->bix);
	h.crc = logfs_crc32(&h, sizeof(h) - 4, 4);	/* crc excludes data_crc */
	h.data_crc = logfs_crc32(buf, len, 0);

	logfs_buf_write(area, ofs, &h, sizeof(h));
	logfs_buf_write(area, ofs + LOGFS_OBJECT_HEADERSIZE, buf, len);

	shadow->new_ofs = ofs;
	shadow->new_len = acc_len + LOGFS_OBJECT_HEADERSIZE;

	return 0;
}
317
/*
 * logfs_segment_write_compress - try to write an object compressed
 *
 * Compresses @buf into the shared journal compression buffer (guarded by
 * s_journal_mutex) and writes the compressed form if compression
 * succeeded, the raw form otherwise.
 */
static s64 logfs_segment_write_compress(struct inode *inode, void *buf,
		struct logfs_shadow *shadow, int type, int len)
{
	struct super_block *sb = inode->i_sb;
	void *compressor_buf = logfs_super(sb)->s_compressed_je;
	ssize_t compr_len;
	int ret;

	mutex_lock(&logfs_super(sb)->s_journal_mutex);
	compr_len = logfs_compress(buf, compressor_buf, len, len);

	if (compr_len >= 0) {
		ret = __logfs_segment_write(inode, compressor_buf, shadow,
				type, compr_len, COMPR_ZLIB);
	} else {
		/* incompressible data is stored verbatim */
		ret = __logfs_segment_write(inode, buf, shadow, type, len,
				COMPR_NONE);
	}
	mutex_unlock(&logfs_super(sb)->s_journal_mutex);
	return ret;
}
339
/**
 * logfs_segment_write - write data block to object store
 * @inode: inode containing data
 * @page: page holding the data to be written
 * @shadow: shadow entry; receives the new offset and length
 *
 * Returns an errno or zero.
 */
int logfs_segment_write(struct inode *inode, struct page *page,
		struct logfs_shadow *shadow)
{
	struct super_block *sb = inode->i_sb;
	struct logfs_super *super = logfs_super(sb);
	int do_compress, type, len;
	int ret;
	void *buf;

	BUG_ON(logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
	do_compress = logfs_inode(inode)->li_flags & LOGFS_IF_COMPRESSED;
	if (shadow->gc_level != 0) {
		/* temporarily disable compression for indirect blocks */
		do_compress = 0;
	}

	type = obj_type(inode, shrink_level(shadow->gc_level));
	len = obj_len(sb, type);
	buf = kmap(page);
	if (do_compress)
		ret = logfs_segment_write_compress(inode, buf, shadow, type,
				len);
	else
		ret = __logfs_segment_write(inode, buf, shadow, type, len,
				COMPR_NONE);
	kunmap(page);

	log_segment("logfs_segment_write(%llx, %llx, %x) %llx->%llx %x->%x\n",
			shadow->ino, shadow->bix, shadow->gc_level,
			shadow->old_ofs, shadow->new_ofs,
			shadow->old_len, shadow->new_len);
	/* this BUG_ON did catch a locking bug. useful */
	BUG_ON(!(shadow->new_ofs & (super->s_segsize - 1)));
	return ret;
}
381
/*
 * wbuf_read - read @len bytes at device offset @ofs into @buf
 *
 * Reads through the device mapping inode's page cache, one page at a
 * time.  Returns 0 or a negative errno.
 */
int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf)
{
	pgoff_t index = ofs >> PAGE_SHIFT;
	struct page *page;
	long offset = ofs & (PAGE_SIZE-1);
	long copylen;

	while (len) {
		copylen = min((ulong)len, PAGE_SIZE - offset);

		page = get_mapping_page(sb, index, 1);
		if (IS_ERR(page))
			return PTR_ERR(page);
		memcpy(buf, page_address(page) + offset, copylen);
		page_cache_release(page);

		buf += copylen;
		len -= copylen;
		offset = 0;	/* only the first page may start mid-page */
		index++;
	}
	return 0;
}
405
/*
 * The "position" of indirect blocks is ambiguous. It can be the position
 * of any data block somewhere behind this indirect block. So we need to
 * normalize the positions through logfs_block_mask() before comparing.
 * Returns nonzero if the normalized positions differ.
 */
static int check_pos(struct super_block *sb, u64 pos1, u64 pos2, level_t level)
{
	return (pos1 & logfs_block_mask(sb, level)) !=
		(pos2 & logfs_block_mask(sb, level));
}
416
#if 0
/* Currently unused: read and crc-check the segment header at @ofs */
static int read_seg_header(struct super_block *sb, u64 ofs,
		struct logfs_segment_header *sh)
{
	__be32 crc;
	int err;

	err = wbuf_read(sb, ofs, sizeof(*sh), sh);
	if (err)
		return err;
	crc = logfs_crc32(sh, sizeof(*sh), 4);
	if (crc != sh->crc) {
		printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, "
				"got %x\n", ofs, be32_to_cpu(sh->crc),
				be32_to_cpu(crc));
		return -EIO;
	}
	return 0;
}
#endif
437
/*
 * read_obj_header - read and crc-check the object header at @ofs
 *
 * Returns 0, -EIO on crc mismatch, or a negative errno from the read.
 */
static int read_obj_header(struct super_block *sb, u64 ofs,
		struct logfs_object_header *oh)
{
	__be32 crc;
	int err;

	err = wbuf_read(sb, ofs, sizeof(*oh), oh);
	if (err)
		return err;
	/* crc covers the header minus the trailing data_crc field */
	crc = logfs_crc32(oh, sizeof(*oh) - 4, 4);
	if (crc != oh->crc) {
		printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, "
				"got %x\n", ofs, be32_to_cpu(oh->crc),
				be32_to_cpu(crc));
		return -EIO;
	}
	return 0;
}
456
/*
 * move_btree_to_page - re-attach btree-held aliases to a freshly read page
 *
 * If the alias btree holds a block for this page, its stored alias
 * values are folded into @data and the block becomes the page's private
 * block again.
 */
static void move_btree_to_page(struct inode *inode, struct page *page,
		__be64 *data)
{
	struct super_block *sb = inode->i_sb;
	struct logfs_super *super = logfs_super(sb);
	struct btree_head128 *head = &super->s_object_alias_tree;
	struct logfs_block *block;
	struct object_alias_item *item, *next;

	if (!(super->s_flags & LOGFS_SB_FLAG_OBJ_ALIAS))
		return;

	block = btree_remove128(head, inode->i_ino, page->index);
	if (!block)
		return;

	log_blockmove("move_btree_to_page(%llx, %llx, %x)\n",
			block->ino, block->bix, block->level);
	/* replay each stored alias value into the block data */
	list_for_each_entry_safe(item, next, &block->item_list, list) {
		data[item->child_no] = item->val;
		list_del(&item->list);
		mempool_free(item, super->s_alias_pool);
	}
	block->page = page;
	SetPagePrivate(page);
	page->private = (unsigned long)block;
	block->ops = &indirect_block_ops;
	initialize_block_counters(page, block, data, 0);
}
486
/*
 * Thin wrapper around find_next_bit().
 *
 * This silences a false, yet annoying gcc warning. I hate it when my editor
 * jumps into bitops.h each time I recompile this file.
 * TODO: Complain to gcc folks about this and upgrade compiler.
 */
static unsigned long fnb(const unsigned long *addr,
		unsigned long size, unsigned long offset)
{
	return find_next_bit(addr, size, offset);
}
497
/*
 * Detach a logfs_block from its page, recording every aliased (dirty)
 * child pointer in the object alias btree so the page itself can be
 * released.  Inverse of move_btree_to_page().
 */
void move_page_to_btree(struct page *page)
{
	struct logfs_block *block = logfs_block(page);
	struct super_block *sb = block->sb;
	struct logfs_super *super = logfs_super(sb);
	struct object_alias_item *item;
	unsigned long pos;
	__be64 *child;
	int err;

	/* During shutdown aliases are simply dropped, not preserved */
	if (super->s_flags & LOGFS_SB_FLAG_SHUTDOWN) {
		block->ops->free_block(sb, block);
		return;
	}
	log_blockmove("move_page_to_btree(%llx, %llx, %x)\n",
			block->ino, block->bix, block->level);
	super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS;

	/* Walk every set bit in the alias map, saving that child's value */
	for (pos = 0; ; pos++) {
		pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
		if (pos >= LOGFS_BLOCK_FACTOR)
			break;

		item = mempool_alloc(super->s_alias_pool, GFP_NOFS);
		BUG_ON(!item); /* mempool empty */
		memset(item, 0, sizeof(*item));

		child = kmap_atomic(page, KM_USER0);
		item->val = child[pos];
		kunmap_atomic(child, KM_USER0);
		item->child_no = pos;
		list_add(&item->list, &block->item_list);
	}
	/* Sever the page<->block link and index the block by (ino, bix) */
	block->page = NULL;
	ClearPagePrivate(page);
	page->private = 0;
	block->ops = &btree_block_ops;
	err = alias_tree_insert(block->sb, block->ino, block->bix, block->level,
			block);
	BUG_ON(err); /* mempool empty */
	ClearPageUptodate(page);
}
540
541static int __logfs_segment_read(struct inode *inode, void *buf,
542 u64 ofs, u64 bix, level_t level)
543{
544 struct super_block *sb = inode->i_sb;
545 void *compressor_buf = logfs_super(sb)->s_compressed_je;
546 struct logfs_object_header oh;
547 __be32 crc;
548 u16 len;
549 int err, block_len;
550
551 block_len = obj_len(sb, obj_type(inode, level));
552 err = read_obj_header(sb, ofs, &oh);
553 if (err)
554 goto out_err;
555
556 err = -EIO;
557 if (be64_to_cpu(oh.ino) != inode->i_ino
558 || check_pos(sb, be64_to_cpu(oh.bix), bix, level)) {
559 printk(KERN_ERR"LOGFS: (ino, bix) don't match at %llx: "
560 "expected (%lx, %llx), got (%llx, %llx)\n",
561 ofs, inode->i_ino, bix,
562 be64_to_cpu(oh.ino), be64_to_cpu(oh.bix));
563 goto out_err;
564 }
565
566 len = be16_to_cpu(oh.len);
567
568 switch (oh.compr) {
569 case COMPR_NONE:
570 err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, buf);
571 if (err)
572 goto out_err;
573 crc = logfs_crc32(buf, len, 0);
574 if (crc != oh.data_crc) {
575 printk(KERN_ERR"LOGFS: uncompressed data crc error at "
576 "%llx: expected %x, got %x\n", ofs,
577 be32_to_cpu(oh.data_crc),
578 be32_to_cpu(crc));
579 goto out_err;
580 }
581 break;
582 case COMPR_ZLIB:
583 mutex_lock(&logfs_super(sb)->s_journal_mutex);
584 err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len,
585 compressor_buf);
586 if (err) {
587 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
588 goto out_err;
589 }
590 crc = logfs_crc32(compressor_buf, len, 0);
591 if (crc != oh.data_crc) {
592 printk(KERN_ERR"LOGFS: compressed data crc error at "
593 "%llx: expected %x, got %x\n", ofs,
594 be32_to_cpu(oh.data_crc),
595 be32_to_cpu(crc));
596 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
597 goto out_err;
598 }
599 err = logfs_uncompress(compressor_buf, buf, len, block_len);
600 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
601 if (err) {
602 printk(KERN_ERR"LOGFS: uncompress error at %llx\n", ofs);
603 goto out_err;
604 }
605 break;
606 default:
607 LOGFS_BUG(sb);
608 err = -EIO;
609 goto out_err;
610 }
611 return 0;
612
613out_err:
614 logfs_set_ro(sb);
615 printk(KERN_ERR"LOGFS: device is read-only now\n");
616 LOGFS_BUG(sb);
617 return err;
618}
619
/**
 * logfs_segment_read - read data block from object store
 * @inode: inode containing data
 * @page: page to read the block into (kmapped for the duration)
 * @ofs: physical data offset
 * @bix: block index
 * @level: block level
 *
 * Returns 0 on success or a negative errno.
 */
int logfs_segment_read(struct inode *inode, struct page *page,
		u64 ofs, u64 bix, level_t level)
{
	int err;
	void *buf;

	if (PageUptodate(page))
		return 0;

	/* Strip the flag bit from the device offset */
	ofs &= ~LOGFS_FULLY_POPULATED;

	buf = kmap(page);
	err = __logfs_segment_read(inode, buf, ofs, bix, level);
	if (!err) {
		/* Fold pending object aliases into the fresh data */
		move_btree_to_page(inode, page, buf);
		SetPageUptodate(page);
	}
	kunmap(page);
	log_segment("logfs_segment_read(%lx, %llx, %x) %llx (%d)\n",
			inode->i_ino, bix, level, ofs, err);
	return err;
}
652
/*
 * Account the deletion of the object at shadow->old_ofs: re-read its
 * header to determine the on-medium length and record it in
 * shadow->old_len.  Nothing is written to the device here.
 */
int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow)
{
	struct super_block *sb = inode->i_sb;
	struct logfs_object_header h;
	u16 len;
	int err;

	BUG_ON(logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
	BUG_ON(shadow->old_ofs & LOGFS_FULLY_POPULATED);
	if (!shadow->old_ofs)
		return 0;

	log_segment("logfs_segment_delete(%llx, %llx, %x) %llx->%llx %x->%x\n",
			shadow->ino, shadow->bix, shadow->gc_level,
			shadow->old_ofs, shadow->new_ofs,
			shadow->old_len, shadow->new_len);
	/* Sanity-check that the on-medium header matches the shadow */
	err = read_obj_header(sb, shadow->old_ofs, &h);
	LOGFS_BUG_ON(err, sb);
	LOGFS_BUG_ON(be64_to_cpu(h.ino) != inode->i_ino, sb);
	LOGFS_BUG_ON(check_pos(sb, shadow->bix, be64_to_cpu(h.bix),
				shrink_level(shadow->gc_level)), sb);

	/* level 0 objects carry their true length; others are fixed-size */
	if (shadow->gc_level == 0)
		len = be16_to_cpu(h.len);
	else
		len = obj_len(sb, h.type);
	shadow->old_len = len + sizeof(h);
	return 0;
}
682
/*
 * Drop all page-cache pages caching segment @segno of the mapping
 * inode.
 *
 * NOTE(review): ClearPagePrivate() is called without checking
 * PagePrivate() and without detaching an attached logfs_block or
 * dropping a private-state reference; verify against the refcounting
 * rules used by move_btree_to_page()/move_page_to_btree().
 */
static void freeseg(struct super_block *sb, u32 segno)
{
	struct logfs_super *super = logfs_super(sb);
	struct address_space *mapping = super->s_mapping_inode->i_mapping;
	struct page *page;
	u64 ofs, start, end;

	start = dev_ofs(sb, segno, 0);
	end = dev_ofs(sb, segno + 1, 0);
	for (ofs = start; ofs < end; ofs += PAGE_SIZE) {
		page = find_get_page(mapping, ofs >> PAGE_SHIFT);
		if (!page)
			continue;
		ClearPagePrivate(page);
		page_cache_release(page);
	}
}
700
/*
 * Make sure @area has an open segment with room for @bytes more.  If
 * the current segment is too full it is padded, flushed and replaced
 * by a freshly erased segment; segments that fail to erase are marked
 * bad and skipped.
 *
 * Returns 1 if an old segment was closed in the process, 0 otherwise,
 * so callers can tell whether buffered data reached the device.
 */
int logfs_open_area(struct logfs_area *area, size_t bytes)
{
	struct super_block *sb = area->a_sb;
	struct logfs_super *super = logfs_super(sb);
	int err, closed = 0;

	if (area->a_is_open && area->a_used_bytes + bytes <= super->s_segsize)
		return 0;

	if (area->a_is_open) {
		u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
		u32 len = super->s_segsize - area->a_written_bytes;

		log_gc("logfs_close_area(%x)\n", area->a_segno);
		/* Pad to the segment end so the whole tail can go out */
		pad_wbuf(area, 1);
		super->s_devops->writeseg(area->a_sb, ofs, len);
		freeseg(sb, area->a_segno);
		closed = 1;
	}

	area->a_used_bytes = 0;
	area->a_written_bytes = 0;
again:
	area->a_ops->get_free_segment(area);
	area->a_ops->get_erase_count(area);

	log_gc("logfs_open_area(%x, %x)\n", area->a_segno, area->a_level);
	err = area->a_ops->erase_segment(area);
	if (err) {
		printk(KERN_WARNING "LogFS: Error erasing segment %x\n",
				area->a_segno);
		logfs_mark_segment_bad(sb, area->a_segno);
		goto again;
	}
	area->a_is_open = 1;
	return closed;
}
738
739void logfs_sync_area(struct logfs_area *area)
740{
741 struct super_block *sb = area->a_sb;
742 struct logfs_super *super = logfs_super(sb);
743 u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
744 u32 len = (area->a_used_bytes - area->a_written_bytes);
745
746 if (super->s_writesize)
747 len &= ~(super->s_writesize - 1);
748 if (len == 0)
749 return;
750 pad_wbuf(area, 0);
751 super->s_devops->writeseg(sb, ofs, len);
752 area->a_written_bytes += len;
753}
754
755void logfs_sync_segments(struct super_block *sb)
756{
757 struct logfs_super *super = logfs_super(sb);
758 int i;
759
760 for_each_area(i)
761 logfs_sync_area(super->s_area[i]);
762}
763
764/*
765 * Pick a free segment to be used for this area. Effectively takes a
766 * candidate from the free list (not really a candidate anymore).
767 */
768static void ostore_get_free_segment(struct logfs_area *area)
769{
770 struct super_block *sb = area->a_sb;
771 struct logfs_super *super = logfs_super(sb);
772
773 if (super->s_free_list.count == 0) {
774 printk(KERN_ERR"LOGFS: ran out of free segments\n");
775 LOGFS_BUG(sb);
776 }
777
778 area->a_segno = get_best_cand(sb, &super->s_free_list, NULL);
779}
780
/*
 * Read the erase count of the area's current segment from the segment
 * file and pre-increment it for the upcoming erase.
 */
static void ostore_get_erase_count(struct logfs_area *area)
{
	struct logfs_segment_entry se;
	u32 ec_level;

	logfs_get_segment_entry(area->a_sb, area->a_segno, &se);
	/* Free-list candidates must never be bad or reserved segments */
	BUG_ON(se.ec_level == cpu_to_be32(BADSEG) ||
			se.valid == cpu_to_be32(RESERVED));

	ec_level = be32_to_cpu(se.ec_level);
	/* low 4 bits of ec_level presumably hold the level — the same
	 * >> 4 unpacking is used in write_one_sb() */
	area->a_erase_count = (ec_level >> 4) + 1;
}
793
/*
 * Erase the area's segment and buffer a fresh segment header at its
 * start (written via logfs_buf_write(), not yet synced to the device).
 * Returns 0 or the erase error.
 */
static int ostore_erase_segment(struct logfs_area *area)
{
	struct super_block *sb = area->a_sb;
	struct logfs_segment_header sh;
	u64 ofs;
	int err;

	err = logfs_erase_segment(sb, area->a_segno);
	if (err)
		return err;

	sh.pad = 0;
	sh.type = SEG_OSTORE;
	sh.level = (__force u8)area->a_level;
	sh.segno = cpu_to_be32(area->a_segno);
	sh.ec = cpu_to_be32(area->a_erase_count);
	sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
	/* crc over the header, presumably starting past the crc field */
	sh.crc = logfs_crc32(&sh, sizeof(sh), 4);

	logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count,
			area->a_level);

	ofs = dev_ofs(sb, area->a_segno, 0);
	area->a_used_bytes = sizeof(sh);
	logfs_buf_write(area, ofs, &sh, sizeof(sh));
	return 0;
}
821
/* Segment-handling callbacks for object-store areas */
static const struct logfs_area_ops ostore_area_ops = {
	.get_free_segment = ostore_get_free_segment,
	.get_erase_count = ostore_get_erase_count,
	.erase_segment = ostore_erase_segment,
};
827
828static void free_area(struct logfs_area *area)
829{
830 if (area)
831 freeseg(area->a_sb, area->a_segno);
832 kfree(area);
833}
834
835static struct logfs_area *alloc_area(struct super_block *sb)
836{
837 struct logfs_area *area;
838
839 area = kzalloc(sizeof(*area), GFP_KERNEL);
840 if (!area)
841 return NULL;
842
843 area->a_sb = sb;
844 return area;
845}
846
/* Mapping pages are managed by logfs itself; invalidation must never
 * happen. */
static void map_invalidatepage(struct page *page, unsigned long l)
{
	BUG();
}
851
/* Refuse to release mapping pages; logfs keeps them cached. */
static int map_releasepage(struct page *page, gfp_t g)
{
	/* Don't release these pages */
	return 0;
}
857
/* Address-space ops for the internal device-mapping inode */
static const struct address_space_operations mapping_aops = {
	.invalidatepage = map_invalidatepage,
	.releasepage = map_releasepage,
	.set_page_dirty = __set_page_dirty_nobuffers,
};
863
/*
 * Create the internal meta inode whose page cache mirrors the device
 * contents.  Returns 0 or a negative errno.
 */
int logfs_init_mapping(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct address_space *mapping;
	struct inode *inode;

	inode = logfs_new_meta_inode(sb, LOGFS_INO_MAPPING);
	if (IS_ERR(inode))
		return PTR_ERR(inode);
	super->s_mapping_inode = inode;
	mapping = inode->i_mapping;
	mapping->a_ops = &mapping_aops;
	/* Would it be possible to use __GFP_HIGHMEM as well? */
	mapping_set_gfp_mask(mapping, GFP_NOFS);
	return 0;
}
880
/*
 * Allocate the alias mempool, the journal area, one ostore area per
 * level and the object alias btree.  On failure everything allocated
 * so far is torn down.  Returns 0 or -ENOMEM.
 */
int logfs_init_areas(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i = -1; /* so the error path frees nothing on early failure */

	super->s_alias_pool = mempool_create_kmalloc_pool(600,
			sizeof(struct object_alias_item));
	if (!super->s_alias_pool)
		return -ENOMEM;

	super->s_journal_area = alloc_area(sb);
	if (!super->s_journal_area)
		goto err;

	for_each_area(i) {
		super->s_area[i] = alloc_area(sb);
		if (!super->s_area[i])
			goto err;
		super->s_area[i]->a_level = GC_LEVEL(i);
		super->s_area[i]->a_ops = &ostore_area_ops;
	}
	btree_init_mempool128(&super->s_object_alias_tree,
			super->s_btree_pool);
	return 0;

err:
	/* free areas allocated before the failure, then the rest */
	for (i--; i >= 0; i--)
		free_area(super->s_area[i]);
	free_area(super->s_journal_area);
	mempool_destroy(super->s_alias_pool);
	return -ENOMEM;
}
913
/* Undo logfs_init_areas() and drop all remaining object aliases. */
void logfs_cleanup_areas(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int i;

	btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias);
	for_each_area(i)
		free_area(super->s_area[i]);
	free_area(super->s_journal_area);
	destroy_meta_inode(super->s_mapping_inode);
}
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
new file mode 100644
index 000000000000..d128a2c1c8d1
--- /dev/null
+++ b/fs/logfs/super.c
@@ -0,0 +1,634 @@
1/*
2 * fs/logfs/super.c
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Generally contains mount/umount code and also serves as a dump area for
9 * any functions that don't fit elsewhere and neither justify a file of their
10 * own.
11 */
12#include "logfs.h"
13#include <linux/bio.h>
14#include <linux/mtd/mtd.h>
15#include <linux/statfs.h>
16#include <linux/buffer_head.h>
17
/* Serializes use of the single shared emergency page below. */
static DEFINE_MUTEX(emergency_mutex);
/* Last-resort page handed out when the page cache cannot supply one. */
static struct page *emergency_page;
20
/*
 * Read page @index from @mapping, falling back to a single statically
 * allocated emergency page when the page cache cannot supply one, so
 * reads can still make progress under extreme memory pressure.  A
 * returned emergency page holds emergency_mutex until
 * emergency_read_end().
 *
 * NOTE(review): read_cache_page() returns ERR_PTR() rather than NULL
 * on failure, so the `if (page)` test below may pass error pointers
 * straight through instead of taking the emergency path — confirm
 * against the kernel version this targets.
 */
struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index)
{
	filler_t *filler = (filler_t *)mapping->a_ops->readpage;
	struct page *page;
	int err;

	page = read_cache_page(mapping, index, filler, NULL);
	if (page)
		return page;

	/* No more pages available, switch to emergency page */
	printk(KERN_INFO"Logfs: Using emergency page\n");
	mutex_lock(&emergency_mutex);
	err = filler(NULL, emergency_page);
	if (err) {
		mutex_unlock(&emergency_mutex);
		printk(KERN_EMERG"Logfs: Error reading emergency page\n");
		return ERR_PTR(err);
	}
	return emergency_page;
}
42
43void emergency_read_end(struct page *page)
44{
45 if (page == emergency_page)
46 mutex_unlock(&emergency_mutex);
47 else
48 page_cache_release(page);
49}
50
51static void dump_segfile(struct super_block *sb)
52{
53 struct logfs_super *super = logfs_super(sb);
54 struct logfs_segment_entry se;
55 u32 segno;
56
57 for (segno = 0; segno < super->s_no_segs; segno++) {
58 logfs_get_segment_entry(sb, segno, &se);
59 printk("%3x: %6x %8x", segno, be32_to_cpu(se.ec_level),
60 be32_to_cpu(se.valid));
61 if (++segno < super->s_no_segs) {
62 logfs_get_segment_entry(sb, segno, &se);
63 printk(" %6x %8x", be32_to_cpu(se.ec_level),
64 be32_to_cpu(se.valid));
65 }
66 if (++segno < super->s_no_segs) {
67 logfs_get_segment_entry(sb, segno, &se);
68 printk(" %6x %8x", be32_to_cpu(se.ec_level),
69 be32_to_cpu(se.valid));
70 }
71 if (++segno < super->s_no_segs) {
72 logfs_get_segment_entry(sb, segno, &se);
73 printk(" %6x %8x", be32_to_cpu(se.ec_level),
74 be32_to_cpu(se.valid));
75 }
76 printk("\n");
77 }
78}
79
/*
 * logfs_crash_dump - dump debug information to device
 *
 * The LogFS superblock only occupies part of a segment. This function will
 * write as much debug information as it can gather into the spare space.
 *
 * NOTE(review): currently only prints the segment file to the console;
 * writing into the superblock's spare space is not implemented yet.
 */
void logfs_crash_dump(struct super_block *sb)
{
	dump_segfile(sb);
}
90
/*
 * TODO: move to lib/string.c
 */
/**
 * memchr_inv - Find a character in an area of memory.
 * @s: The memory area
 * @c: The byte to search for
 * @n: The size of the area.
 *
 * returns the address of the first character other than @c, or %NULL
 * if the whole buffer contains just @c.
 */
void *memchr_inv(const void *s, int c, size_t n)
{
	const unsigned char *cursor = s;
	const unsigned char *stop = cursor + n;
	const unsigned char byte = (unsigned char)c;

	while (cursor < stop) {
		if (*cursor != byte)
			return (void *)cursor;
		cursor++;
	}
	return NULL;
}
112
/*
 * FIXME: There should be a reserve for root, similar to ext2.
 */
/* Fill in statfs(2) data for a logfs mount. */
int logfs_statfs(struct dentry *dentry, struct kstatfs *stats)
{
	struct super_block *sb = dentry->d_sb;
	struct logfs_super *super = logfs_super(sb);

	stats->f_type = LOGFS_MAGIC_U32;
	stats->f_bsize = sb->s_blocksize;
	/* NOTE(review): the extra >> 3 under-reports capacity by 8x —
	 * confirm whether this is a deliberate safety factor */
	stats->f_blocks = super->s_size >> LOGFS_BLOCK_BITS >> 3;
	stats->f_bfree = super->s_free_bytes >> sb->s_blocksize_bits;
	stats->f_bavail = super->s_free_bytes >> sb->s_blocksize_bits;
	/* no static inode limit */
	stats->f_files = 0;
	stats->f_ffree = 0;
	stats->f_namelen = LOGFS_MAX_NAMELEN;
	return 0;
}
131
/* sget() set callback: attach the preallocated logfs_super to a newly
 * created super_block. */
static int logfs_sb_set(struct super_block *sb, void *_super)
{
	struct logfs_super *super = _super;

	sb->s_fs_info = super;
	sb->s_mtd = super->s_mtd;
	sb->s_bdev = super->s_bdev;
	return 0;
}
141
142static int logfs_sb_test(struct super_block *sb, void *_super)
143{
144 struct logfs_super *super = _super;
145 struct mtd_info *mtd = super->s_mtd;
146
147 if (mtd && sb->s_mtd == mtd)
148 return 1;
149 if (super->s_bdev && sb->s_bdev == super->s_bdev)
150 return 1;
151 return 0;
152}
153
/*
 * Fill in a segment header and compute its CRC (which presumably skips
 * the crc field itself at the start of the struct).
 */
static void set_segment_header(struct logfs_segment_header *sh, u8 type,
		u8 level, u32 segno, u32 ec)
{
	sh->pad = 0;
	sh->type = type;
	sh->level = level;
	sh->segno = cpu_to_be32(segno);
	sh->ec = cpu_to_be32(ec);
	/* NOTE(review): gec is seeded with the segment number — confirm */
	sh->gec = cpu_to_be64(segno);
	sh->crc = logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4);
}
165
/*
 * Serialize the in-memory superblock into its on-disk representation
 * for segment @segno with erase count @ec.
 */
static void logfs_write_ds(struct super_block *sb, struct logfs_disk_super *ds,
		u32 segno, u32 ec)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_segment_header *sh = &ds->ds_sh;
	int i;

	memset(ds, 0, sizeof(*ds));
	set_segment_header(sh, SEG_SUPER, 0, segno, ec);

	ds->ds_ifile_levels = super->s_ifile_levels;
	ds->ds_iblock_levels = super->s_iblock_levels;
	ds->ds_data_levels = super->s_data_levels; /* XXX: Remove */
	ds->ds_segment_shift = super->s_segshift;
	ds->ds_block_shift = sb->s_blocksize_bits;
	ds->ds_write_shift = super->s_writeshift;
	ds->ds_filesystem_size = cpu_to_be64(super->s_size);
	ds->ds_segment_size = cpu_to_be32(super->s_segsize);
	ds->ds_bad_seg_reserve = cpu_to_be32(super->s_bad_seg_reserve);
	ds->ds_feature_incompat = cpu_to_be64(super->s_feature_incompat);
	ds->ds_feature_ro_compat= cpu_to_be64(super->s_feature_ro_compat);
	ds->ds_feature_compat = cpu_to_be64(super->s_feature_compat);
	ds->ds_feature_flags = cpu_to_be64(super->s_feature_flags);
	ds->ds_root_reserve = cpu_to_be64(super->s_root_reserve);
	ds->ds_speed_reserve = cpu_to_be64(super->s_speed_reserve);
	journal_for_each(i)
		ds->ds_journal_seg[i] = cpu_to_be32(super->s_journal_seg[i]);
	ds->ds_magic = cpu_to_be64(LOGFS_MAGIC);
	/* crc starts past the segment header and the ds crc/pad bytes,
	 * mirroring the check in logfs_check_ds() */
	ds->ds_crc = logfs_crc32(ds, sizeof(*ds),
			LOGFS_SEGMENT_HEADERSIZE + 12);
}
197
/*
 * Rewrite one superblock copy.  @find_sb locates the page caching the
 * copy (first or last on the medium) and returns its device offset
 * through the second argument.  Returns 0 or a negative errno.
 */
static int write_one_sb(struct super_block *sb,
		struct page *(*find_sb)(struct super_block *sb, u64 *ofs))
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_disk_super *ds;
	struct logfs_segment_entry se;
	struct page *page;
	u64 ofs;
	u32 ec, segno;
	int err;

	page = find_sb(sb, &ofs);
	if (!page)
		return -EIO;
	ds = page_address(page);
	segno = seg_no(sb, ofs);
	logfs_get_segment_entry(sb, segno, &se);
	/* ec_level presumably packs the erase count above 4 level bits */
	ec = be32_to_cpu(se.ec_level) >> 4;
	ec++;
	logfs_set_segment_erased(sb, segno, ec, 0);
	logfs_write_ds(sb, ds, segno, ec);
	err = super->s_devops->write_sb(sb, page);
	page_cache_release(page);
	return err;
}
223
224int logfs_write_sb(struct super_block *sb)
225{
226 struct logfs_super *super = logfs_super(sb);
227 int err;
228
229 /* First superblock */
230 err = write_one_sb(sb, super->s_devops->find_first_sb);
231 if (err)
232 return err;
233
234 /* Last superblock */
235 err = write_one_sb(sb, super->s_devops->find_last_sb);
236 if (err)
237 return err;
238 return 0;
239}
240
241static int ds_cmp(const void *ds0, const void *ds1)
242{
243 size_t len = sizeof(struct logfs_disk_super);
244
245 /* We know the segment headers differ, so ignore them */
246 len -= LOGFS_SEGMENT_HEADERSIZE;
247 ds0 += LOGFS_SEGMENT_HEADERSIZE;
248 ds1 += LOGFS_SEGMENT_HEADERSIZE;
249 return memcmp(ds0, ds1, len);
250}
251
/*
 * Ensure both superblock copies are valid and identical, rewriting
 * whichever copy is stale or corrupt.  Mount-time checks already
 * guaranteed at least one copy is valid.
 */
static int logfs_recover_sb(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct logfs_disk_super _ds0, *ds0 = &_ds0;
	struct logfs_disk_super _ds1, *ds1 = &_ds1;
	int err, valid0, valid1;

	/* read first superblock */
	err = wbuf_read(sb, super->s_sb_ofs[0], sizeof(*ds0), ds0);
	if (err)
		return err;
	/* read last superblock */
	err = wbuf_read(sb, super->s_sb_ofs[1], sizeof(*ds1), ds1);
	if (err)
		return err;
	valid0 = logfs_check_ds(ds0) == 0;
	valid1 = logfs_check_ds(ds1) == 0;

	if (!valid0 && valid1) {
		printk(KERN_INFO"First superblock is invalid - fixing.\n");
		return write_one_sb(sb, super->s_devops->find_first_sb);
	}
	if (valid0 && !valid1) {
		printk(KERN_INFO"Last superblock is invalid - fixing.\n");
		return write_one_sb(sb, super->s_devops->find_last_sb);
	}
	if (valid0 && valid1 && ds_cmp(ds0, ds1)) {
		printk(KERN_INFO"Superblocks don't match - fixing.\n");
		/* the first copy is treated as authoritative */
		return write_one_sb(sb, super->s_devops->find_last_sb);
	}
	/* If neither is valid now, something's wrong. Didn't we properly
	 * check them before?!? */
	BUG_ON(!valid0 && !valid1);
	return 0;
}
287
/*
 * Bring a mounted filesystem into a writable state: repair superblock
 * copies, check areas for trailing unaccounted data, open the segment
 * file, run one GC pass and finally replay the journal.
 */
static int logfs_make_writeable(struct super_block *sb)
{
	int err;

	/* Repair any broken superblock copies */
	err = logfs_recover_sb(sb);
	if (err)
		goto out;

	/* Check areas for trailing unaccounted data */
	err = logfs_check_areas(sb);
	if (err)
		goto out;

	err = logfs_open_segfile(sb);
	if (err)
		goto out;

	/* Do one GC pass before any data gets dirtied */
	logfs_gc_pass(sb);

	/* after all initializations are done, replay the journal
	 * for rw-mounts, if necessary */
	err = logfs_replay_journal(sb);
out:
	return err;
}
317
318static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
319{
320 struct inode *rootdir;
321 int err;
322
323 /* root dir */
324 rootdir = logfs_iget(sb, LOGFS_INO_ROOT);
325 if (IS_ERR(rootdir))
326 goto fail;
327
328 sb->s_root = d_alloc_root(rootdir);
329 if (!sb->s_root)
330 goto fail;
331
332 /* FIXME: check for read-only mounts */
333 err = logfs_make_writeable(sb);
334 if (err)
335 goto fail2;
336
337 log_super("LogFS: Finished mounting\n");
338 simple_set_mnt(mnt, sb);
339 return 0;
340
341fail2:
342 iput(rootdir);
343fail:
344 iput(logfs_super(sb)->s_master_inode);
345 return -EIO;
346}
347
348int logfs_check_ds(struct logfs_disk_super *ds)
349{
350 struct logfs_segment_header *sh = &ds->ds_sh;
351
352 if (ds->ds_magic != cpu_to_be64(LOGFS_MAGIC))
353 return -EINVAL;
354 if (sh->crc != logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4))
355 return -EINVAL;
356 if (ds->ds_crc != logfs_crc32(ds, sizeof(*ds),
357 LOGFS_SEGMENT_HEADERSIZE + 12))
358 return -EINVAL;
359 return 0;
360}
361
362static struct page *find_super_block(struct super_block *sb)
363{
364 struct logfs_super *super = logfs_super(sb);
365 struct page *first, *last;
366
367 first = super->s_devops->find_first_sb(sb, &super->s_sb_ofs[0]);
368 if (!first || IS_ERR(first))
369 return NULL;
370 last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]);
371 if (!last || IS_ERR(first)) {
372 page_cache_release(first);
373 return NULL;
374 }
375
376 if (!logfs_check_ds(page_address(first))) {
377 page_cache_release(last);
378 return first;
379 }
380
381 /* First one didn't work, try the second superblock */
382 if (!logfs_check_ds(page_address(last))) {
383 page_cache_release(first);
384 return last;
385 }
386
387 /* Neither worked, sorry folks */
388 page_cache_release(first);
389 page_cache_release(last);
390 return NULL;
391}
392
/*
 * Parse the on-disk superblock into the in-memory logfs_super and the
 * VFS super_block.  Returns 0, or -EIO if no valid copy was found.
 */
static int __logfs_read_sb(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	struct page *page;
	struct logfs_disk_super *ds;
	int i;

	page = find_super_block(sb);
	if (!page)
		return -EIO;

	ds = page_address(page);
	super->s_size = be64_to_cpu(ds->ds_filesystem_size);
	super->s_root_reserve = be64_to_cpu(ds->ds_root_reserve);
	super->s_speed_reserve = be64_to_cpu(ds->ds_speed_reserve);
	super->s_bad_seg_reserve = be32_to_cpu(ds->ds_bad_seg_reserve);
	/* geometry is stored as shifts; derive sizes and masks */
	super->s_segsize = 1 << ds->ds_segment_shift;
	super->s_segmask = (1 << ds->ds_segment_shift) - 1;
	super->s_segshift = ds->ds_segment_shift;
	sb->s_blocksize = 1 << ds->ds_block_shift;
	sb->s_blocksize_bits = ds->ds_block_shift;
	super->s_writesize = 1 << ds->ds_write_shift;
	super->s_writeshift = ds->ds_write_shift;
	super->s_no_segs = super->s_size >> super->s_segshift;
	super->s_no_blocks = super->s_segsize >> sb->s_blocksize_bits;
	super->s_feature_incompat = be64_to_cpu(ds->ds_feature_incompat);
	super->s_feature_ro_compat = be64_to_cpu(ds->ds_feature_ro_compat);
	super->s_feature_compat = be64_to_cpu(ds->ds_feature_compat);
	super->s_feature_flags = be64_to_cpu(ds->ds_feature_flags);

	journal_for_each(i)
		super->s_journal_seg[i] = be32_to_cpu(ds->ds_journal_seg[i]);

	super->s_ifile_levels = ds->ds_ifile_levels;
	super->s_iblock_levels = ds->ds_iblock_levels;
	super->s_data_levels = ds->ds_data_levels;
	super->s_total_levels = super->s_ifile_levels + super->s_iblock_levels
		+ super->s_data_levels;
	page_cache_release(page);
	return 0;
}
434
/*
 * Read the superblock and bring up all in-memory machinery: btree
 * mempool, shadow trees, mapping inode, read/write paths, areas, GC
 * and journal.
 *
 * NOTE(review): on mid-sequence failure the partially initialized
 * state is returned as-is; cleanup relies on the caller's
 * deactivate_super() -> logfs_kill_sb() path coping with it.
 */
static int logfs_read_sb(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);
	int ret;

	super->s_btree_pool = mempool_create(32, btree_alloc, btree_free, NULL);
	if (!super->s_btree_pool)
		return -ENOMEM;

	btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool);
	btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool);

	ret = logfs_init_mapping(sb);
	if (ret)
		return ret;

	/* parse the on-disk superblock */
	ret = __logfs_read_sb(sb);
	if (ret)
		return ret;

	mutex_init(&super->s_dirop_mutex);
	mutex_init(&super->s_object_alias_mutex);
	INIT_LIST_HEAD(&super->s_freeing_list);

	ret = logfs_init_rw(sb);
	if (ret)
		return ret;

	ret = logfs_init_areas(sb);
	if (ret)
		return ret;

	ret = logfs_init_gc(sb);
	if (ret)
		return ret;

	ret = logfs_init_journal(sb);
	if (ret)
		return ret;

	return 0;
}
477
/* Unmount: flush everything out, shut down writes, then tear down all
 * subsystems in reverse order of logfs_read_sb() initialization. */
static void logfs_kill_sb(struct super_block *sb)
{
	struct logfs_super *super = logfs_super(sb);

	log_super("LogFS: Start unmounting\n");
	/* Alias entries slow down mount, so evict as many as possible */
	sync_filesystem(sb);
	logfs_write_anchor(super->s_master_inode);

	/*
	 * From this point on alias entries are simply dropped - and any
	 * writes to the object store are considered bugs.
	 */
	super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN;
	log_super("LogFS: Now in shutdown\n");
	generic_shutdown_super(sb);

	/* all dirty accounting must have been settled by the sync above */
	BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes);

	logfs_cleanup_gc(sb);
	logfs_cleanup_journal(sb);
	logfs_cleanup_areas(sb);
	logfs_cleanup_rw(sb);
	super->s_devops->put_device(sb);
	mempool_destroy(super->s_btree_pool);
	mempool_destroy(super->s_alias_pool);
	kfree(super);
	log_super("LogFS: Finished unmounting\n");
}
507
/*
 * Common mount path for both mtd- and block-device-backed mounts.
 * Allocates the logfs_super, finds or creates the VFS super_block and
 * runs the full mount sequence.
 */
int logfs_get_sb_device(struct file_system_type *type, int flags,
		struct mtd_info *mtd, struct block_device *bdev,
		const struct logfs_device_ops *devops, struct vfsmount *mnt)
{
	struct logfs_super *super;
	struct super_block *sb;
	int err = -ENOMEM;
	static int mount_count;

	log_super("LogFS: Start mount %x\n", mount_count++);
	super = kzalloc(sizeof(*super), GFP_KERNEL);
	if (!super)
		goto err0;

	super->s_mtd = mtd;
	super->s_bdev = bdev;
	err = -EINVAL;
	sb = sget(type, logfs_sb_test, logfs_sb_set, super);
	if (IS_ERR(sb))
		goto err0;

	if (sb->s_root) {
		/* Device is already in use; our fresh super is redundant */
		err = 0;
		simple_set_mnt(mnt, sb);
		goto err0;
	}

	super->s_devops = devops;

	/*
	 * sb->s_maxbytes is limited to 8TB. On 32bit systems, the page cache
	 * only covers 16TB and the upper 8TB are used for indirect blocks.
	 * On 64bit system we could bump up the limit, but that would make
	 * the filesystem incompatible with 32bit systems.
	 */
	sb->s_maxbytes = (1ull << 43) - 1;
	sb->s_op = &logfs_super_operations;
	sb->s_flags = flags | MS_NOATIME;

	err = logfs_read_sb(sb);
	if (err)
		goto err1;

	sb->s_flags |= MS_ACTIVE;
	err = logfs_get_sb_final(sb, mnt);
	if (err)
		goto err1;
	return 0;

err1:
	/* sget() succeeded: sb (and the attached super) die via deactivate */
	up_write(&sb->s_umount);
	deactivate_super(sb);
	return err;
err0:
	kfree(super);
	//devops->put_device(sb);
	return err;
}
567
568static int logfs_get_sb(struct file_system_type *type, int flags,
569 const char *devname, void *data, struct vfsmount *mnt)
570{
571 ulong mtdnr;
572
573 if (!devname)
574 return logfs_get_sb_bdev(type, flags, devname, mnt);
575 if (strncmp(devname, "mtd", 3))
576 return logfs_get_sb_bdev(type, flags, devname, mnt);
577
578 {
579 char *garbage;
580 mtdnr = simple_strtoul(devname+3, &garbage, 0);
581 if (*garbage)
582 return -EINVAL;
583 }
584
585 return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
586}
587
/* Filesystem registration; every mount requires a backing device. */
static struct file_system_type logfs_fs_type = {
	.owner = THIS_MODULE,
	.name = "logfs",
	.get_sb = logfs_get_sb,
	.kill_sb = logfs_kill_sb,
	.fs_flags = FS_REQUIRES_DEV,

};
596
597static int __init logfs_init(void)
598{
599 int ret;
600
601 emergency_page = alloc_pages(GFP_KERNEL, 0);
602 if (!emergency_page)
603 return -ENOMEM;
604
605 ret = logfs_compr_init();
606 if (ret)
607 goto out1;
608
609 ret = logfs_init_inode_cache();
610 if (ret)
611 goto out2;
612
613 return register_filesystem(&logfs_fs_type);
614out2:
615 logfs_compr_exit();
616out1:
617 __free_pages(emergency_page, 0);
618 return ret;
619}
620
/* Module unload: release everything logfs_init() set up, in reverse. */
static void __exit logfs_exit(void)
{
	unregister_filesystem(&logfs_fs_type);
	logfs_destroy_inode_cache();
	logfs_compr_exit();
	__free_pages(emergency_page, 0);
}
628
629module_init(logfs_init);
630module_exit(logfs_exit);
631
632MODULE_LICENSE("GPL v2");
633MODULE_AUTHOR("Joern Engel <joern@logfs.org>");
634MODULE_DESCRIPTION("scalable flash filesystem");
diff --git a/include/linux/btree-128.h b/include/linux/btree-128.h
new file mode 100644
index 000000000000..0b3414c4c928
--- /dev/null
+++ b/include/linux/btree-128.h
@@ -0,0 +1,109 @@
extern struct btree_geo btree_geo128;

/*
 * Type-safe wrapper API for btrees keyed by 128-bit values, i.e. a pair
 * of u64s (k1 = high part, k2 = low part).  Every function marshals the
 * u64 pair into the unsigned-long-array format the core code expects.
 *
 * NOTE(review): the (unsigned long *)&key casts assume btree_geo128
 * stores keys as the raw in-memory u64 pair -- confirm against
 * btree_geo128 in lib/btree.c.
 */
struct btree_head128 { struct btree_head h; };

/* Initialise a 128-bit tree that uses a caller-supplied mempool. */
static inline void btree_init_mempool128(struct btree_head128 *head,
					 mempool_t *mempool)
{
	btree_init_mempool(&head->h, mempool);
}

/* Initialise with a private mempool; returns 0 or -ENOMEM. */
static inline int btree_init128(struct btree_head128 *head)
{
	return btree_init(&head->h);
}

/* Destroy the private mempool created by btree_init128(). */
static inline void btree_destroy128(struct btree_head128 *head)
{
	btree_destroy(&head->h);
}

/* Look up (k1, k2); returns the stored value or NULL. */
static inline void *btree_lookup128(struct btree_head128 *head, u64 k1, u64 k2)
{
	u64 key[2] = {k1, k2};
	return btree_lookup(&head->h, &btree_geo128, (unsigned long *)&key);
}

/*
 * Find the entry with the largest key smaller than (*k1, *k2); on a hit
 * the key pair is updated in place and the value returned, else NULL.
 */
static inline void *btree_get_prev128(struct btree_head128 *head,
				      u64 *k1, u64 *k2)
{
	u64 key[2] = {*k1, *k2};
	void *val;

	val = btree_get_prev(&head->h, &btree_geo128,
			     (unsigned long *)&key);
	*k1 = key[0];
	*k2 = key[1];
	return val;
}

/* Insert @val under (k1, k2); the key must not already exist. */
static inline int btree_insert128(struct btree_head128 *head, u64 k1, u64 k2,
				  void *val, gfp_t gfp)
{
	u64 key[2] = {k1, k2};
	return btree_insert(&head->h, &btree_geo128,
			    (unsigned long *)&key, val, gfp);
}

/* Replace the value stored under (k1, k2); -ENOENT if absent. */
static inline int btree_update128(struct btree_head128 *head, u64 k1, u64 k2,
				  void *val)
{
	u64 key[2] = {k1, k2};
	return btree_update(&head->h, &btree_geo128,
			    (unsigned long *)&key, val);
}

/* Remove (k1, k2); returns the removed value or NULL. */
static inline void *btree_remove128(struct btree_head128 *head, u64 k1, u64 k2)
{
	u64 key[2] = {k1, k2};
	return btree_remove(&head->h, &btree_geo128, (unsigned long *)&key);
}

/* Last (largest-key) entry; fills *k1/*k2 only on a hit. */
static inline void *btree_last128(struct btree_head128 *head, u64 *k1, u64 *k2)
{
	u64 key[2];
	void *val;

	val = btree_last(&head->h, &btree_geo128, (unsigned long *)&key[0]);
	if (val) {
		*k1 = key[0];
		*k2 = key[1];
	}

	return val;
}

/* Move all entries of @victim into @target; keys must not overlap. */
static inline int btree_merge128(struct btree_head128 *target,
				 struct btree_head128 *victim,
				 gfp_t gfp)
{
	return btree_merge(&target->h, &victim->h, &btree_geo128, gfp);
}

/* Core-format visitor shim (defined in lib/btree.c); internal use. */
void visitor128(void *elem, unsigned long opaque, unsigned long *__key,
		size_t index, void *__func);

typedef void (*visitor128_t)(void *elem, unsigned long opaque,
			     u64 key1, u64 key2, size_t index);

/* Visit every entry; returns the number of entries visited. */
static inline size_t btree_visitor128(struct btree_head128 *head,
				      unsigned long opaque,
				      visitor128_t func2)
{
	return btree_visitor(&head->h, &btree_geo128, opaque,
			     visitor128, func2);
}

/* Like btree_visitor128() but empties the tree while visiting. */
static inline size_t btree_grim_visitor128(struct btree_head128 *head,
					   unsigned long opaque,
					   visitor128_t func2)
{
	return btree_grim_visitor(&head->h, &btree_geo128, opaque,
				  visitor128, func2);
}

/* Iterate from the largest key downwards; safe against removal of @val. */
#define btree_for_each_safe128(head, k1, k2, val)	\
	for (val = btree_last128(head, &k1, &k2);	\
	     val;					\
	     val = btree_get_prev128(head, &k1, &k2))

diff --git a/include/linux/btree-type.h b/include/linux/btree-type.h
new file mode 100644
index 000000000000..9a1147ef8563
--- /dev/null
+++ b/include/linux/btree-type.h
@@ -0,0 +1,147 @@
/*
 * Template header -- included several times from linux/btree.h, each
 * time with BTREE_TYPE_SUFFIX, BTREE_TYPE_BITS, BTREE_TYPE_GEO and
 * BTREE_KEYTYPE defined.  Each inclusion generates a type-safe wrapper
 * API (e.g. btree_lookup32()) around the generic unsigned-long-array
 * interface of lib/btree.c.  All template macros are #undef'ed at the
 * bottom so the header can be included again.
 */
#define __BTREE_TP(pfx, type, sfx) pfx ## type ## sfx
#define _BTREE_TP(pfx, type, sfx) __BTREE_TP(pfx, type, sfx)
#define BTREE_TP(pfx) _BTREE_TP(pfx, BTREE_TYPE_SUFFIX,)
#define BTREE_FN(name) BTREE_TP(btree_ ## name)
#define BTREE_TYPE_HEAD BTREE_TP(struct btree_head)
#define VISITOR_FN BTREE_TP(visitor)
#define VISITOR_FN_T _BTREE_TP(visitor, BTREE_TYPE_SUFFIX, _t)

/* e.g. "struct btree_head32" -- the generic head, renamed for type safety */
BTREE_TYPE_HEAD {
	struct btree_head h;
};

static inline void BTREE_FN(init_mempool)(BTREE_TYPE_HEAD *head,
					  mempool_t *mempool)
{
	btree_init_mempool(&head->h, mempool);
}

static inline int BTREE_FN(init)(BTREE_TYPE_HEAD *head)
{
	return btree_init(&head->h);
}

static inline void BTREE_FN(destroy)(BTREE_TYPE_HEAD *head)
{
	btree_destroy(&head->h);
}

static inline int BTREE_FN(merge)(BTREE_TYPE_HEAD *target,
				  BTREE_TYPE_HEAD *victim,
				  gfp_t gfp)
{
	return btree_merge(&target->h, &victim->h, BTREE_TYPE_GEO, gfp);
}

/*
 * Key marshalling: when the key type is narrower than unsigned long,
 * widen it through a temporary; otherwise the caller's storage already
 * has the right layout and is passed through a cast.
 */
#if (BITS_PER_LONG > BTREE_TYPE_BITS)
static inline void *BTREE_FN(lookup)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key)
{
	unsigned long _key = key;
	return btree_lookup(&head->h, BTREE_TYPE_GEO, &_key);
}

static inline int BTREE_FN(insert)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key,
				   void *val, gfp_t gfp)
{
	unsigned long _key = key;
	return btree_insert(&head->h, BTREE_TYPE_GEO, &_key, val, gfp);
}

static inline int BTREE_FN(update)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key,
				   void *val)
{
	unsigned long _key = key;
	return btree_update(&head->h, BTREE_TYPE_GEO, &_key, val);
}

static inline void *BTREE_FN(remove)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key)
{
	unsigned long _key = key;
	return btree_remove(&head->h, BTREE_TYPE_GEO, &_key);
}

static inline void *BTREE_FN(last)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE *key)
{
	unsigned long _key;
	void *val = btree_last(&head->h, BTREE_TYPE_GEO, &_key);
	if (val)
		*key = _key;
	return val;
}

static inline void *BTREE_FN(get_prev)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE *key)
{
	unsigned long _key = *key;
	void *val = btree_get_prev(&head->h, BTREE_TYPE_GEO, &_key);
	if (val)
		*key = _key;
	return val;
}
#else
static inline void *BTREE_FN(lookup)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key)
{
	return btree_lookup(&head->h, BTREE_TYPE_GEO, (unsigned long *)&key);
}

static inline int BTREE_FN(insert)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key,
		   void *val, gfp_t gfp)
{
	return btree_insert(&head->h, BTREE_TYPE_GEO, (unsigned long *)&key,
			    val, gfp);
}

static inline int BTREE_FN(update)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key,
		   void *val)
{
	return btree_update(&head->h, BTREE_TYPE_GEO, (unsigned long *)&key, val);
}

static inline void *BTREE_FN(remove)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key)
{
	return btree_remove(&head->h, BTREE_TYPE_GEO, (unsigned long *)&key);
}

static inline void *BTREE_FN(last)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE *key)
{
	return btree_last(&head->h, BTREE_TYPE_GEO, (unsigned long *)key);
}

static inline void *BTREE_FN(get_prev)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE *key)
{
	return btree_get_prev(&head->h, BTREE_TYPE_GEO, (unsigned long *)key);
}
#endif

/* Per-width visitor shim, defined in lib/btree.c. */
void VISITOR_FN(void *elem, unsigned long opaque, unsigned long *key,
		size_t index, void *__func);

typedef void (*VISITOR_FN_T)(void *elem, unsigned long opaque,
			     BTREE_KEYTYPE key, size_t index);

static inline size_t BTREE_FN(visitor)(BTREE_TYPE_HEAD *head,
				       unsigned long opaque,
				       VISITOR_FN_T func2)
{
	/* NOTE(review): visitorl is passed for every key width although
	 * VISITOR_FN (visitor32/visitor64/...) is declared above -- verify
	 * against the visitor implementations in lib/btree.c. */
	return btree_visitor(&head->h, BTREE_TYPE_GEO, opaque,
			     visitorl, func2);
}

static inline size_t BTREE_FN(grim_visitor)(BTREE_TYPE_HEAD *head,
					    unsigned long opaque,
					    VISITOR_FN_T func2)
{
	/* NOTE(review): same visitorl question as in the visitor above. */
	return btree_grim_visitor(&head->h, BTREE_TYPE_GEO, opaque,
				  visitorl, func2);
}

#undef VISITOR_FN
#undef VISITOR_FN_T
#undef __BTREE_TP
#undef _BTREE_TP
#undef BTREE_TP
#undef BTREE_FN
#undef BTREE_TYPE_HEAD
#undef BTREE_TYPE_SUFFIX
#undef BTREE_TYPE_GEO
#undef BTREE_KEYTYPE
#undef BTREE_TYPE_BITS
diff --git a/include/linux/btree.h b/include/linux/btree.h
new file mode 100644
index 000000000000..65b5bb058324
--- /dev/null
+++ b/include/linux/btree.h
@@ -0,0 +1,243 @@
1#ifndef BTREE_H
2#define BTREE_H
3
4#include <linux/kernel.h>
5#include <linux/mempool.h>
6
7/**
8 * DOC: B+Tree basics
9 *
10 * A B+Tree is a data structure for looking up arbitrary (currently allowing
11 * unsigned long, u32, u64 and 2 * u64) keys into pointers. The data structure
12 * is described at http://en.wikipedia.org/wiki/B-tree, we currently do not
13 * use binary search to find the key on lookups.
14 *
15 * Each B+Tree consists of a head, that contains bookkeeping information and
16 * a variable number (starting with zero) nodes. Each node contains the keys
17 * and pointers to sub-nodes, or, for leaf nodes, the keys and values for the
18 * tree entries.
19 *
20 * Each node in this implementation has the following layout:
21 * [key1, key2, ..., keyN] [val1, val2, ..., valN]
22 *
23 * Each key here is an array of unsigned longs, geo->no_longs in total. The
24 * number of keys and values (N) is geo->no_pairs.
25 */
26
27/**
28 * struct btree_head - btree head
29 *
30 * @node: the first node in the tree
31 * @mempool: mempool used for node allocations
 32 * @height: current height of the tree
33 */
34struct btree_head {
35 unsigned long *node;
36 mempool_t *mempool;
37 int height;
38};
39
40/* btree geometry */
41struct btree_geo;
42
43/**
44 * btree_alloc - allocate function for the mempool
45 * @gfp_mask: gfp mask for the allocation
46 * @pool_data: unused
47 */
48void *btree_alloc(gfp_t gfp_mask, void *pool_data);
49
50/**
51 * btree_free - free function for the mempool
52 * @element: the element to free
53 * @pool_data: unused
54 */
55void btree_free(void *element, void *pool_data);
56
57/**
58 * btree_init_mempool - initialise a btree with given mempool
59 *
60 * @head: the btree head to initialise
61 * @mempool: the mempool to use
62 *
63 * When this function is used, there is no need to destroy
64 * the mempool.
65 */
66void btree_init_mempool(struct btree_head *head, mempool_t *mempool);
67
68/**
69 * btree_init - initialise a btree
70 *
71 * @head: the btree head to initialise
72 *
73 * This function allocates the memory pool that the
74 * btree needs. Returns zero or a negative error code
75 * (-%ENOMEM) when memory allocation fails.
76 *
77 */
78int __must_check btree_init(struct btree_head *head);
79
80/**
81 * btree_destroy - destroy mempool
82 *
83 * @head: the btree head to destroy
84 *
85 * This function destroys the internal memory pool, use only
86 * when using btree_init(), not with btree_init_mempool().
87 */
88void btree_destroy(struct btree_head *head);
89
90/**
91 * btree_lookup - look up a key in the btree
92 *
93 * @head: the btree to look in
94 * @geo: the btree geometry
95 * @key: the key to look up
96 *
97 * This function returns the value for the given key, or %NULL.
98 */
99void *btree_lookup(struct btree_head *head, struct btree_geo *geo,
100 unsigned long *key);
101
102/**
103 * btree_insert - insert an entry into the btree
104 *
105 * @head: the btree to add to
106 * @geo: the btree geometry
107 * @key: the key to add (must not already be present)
108 * @val: the value to add (must not be %NULL)
109 * @gfp: allocation flags for node allocations
110 *
111 * This function returns 0 if the item could be added, or an
112 * error code if it failed (may fail due to memory pressure).
113 */
114int __must_check btree_insert(struct btree_head *head, struct btree_geo *geo,
115 unsigned long *key, void *val, gfp_t gfp);
116/**
117 * btree_update - update an entry in the btree
118 *
119 * @head: the btree to update
120 * @geo: the btree geometry
121 * @key: the key to update
122 * @val: the value to change it to (must not be %NULL)
123 *
124 * This function returns 0 if the update was successful, or
125 * -%ENOENT if the key could not be found.
126 */
127int btree_update(struct btree_head *head, struct btree_geo *geo,
128 unsigned long *key, void *val);
129/**
130 * btree_remove - remove an entry from the btree
131 *
132 * @head: the btree to update
133 * @geo: the btree geometry
134 * @key: the key to remove
135 *
136 * This function returns the removed entry, or %NULL if the key
137 * could not be found.
138 */
139void *btree_remove(struct btree_head *head, struct btree_geo *geo,
140 unsigned long *key);
141
142/**
143 * btree_merge - merge two btrees
144 *
145 * @target: the tree that gets all the entries
146 * @victim: the tree that gets merged into @target
147 * @geo: the btree geometry
148 * @gfp: allocation flags
149 *
150 * The two trees @target and @victim may not contain the same keys,
151 * that is a bug and triggers a BUG(). This function returns zero
152 * if the trees were merged successfully, and may return a failure
153 * when memory allocation fails, in which case both trees might have
154 * been partially merged, i.e. some entries have been moved from
155 * @victim to @target.
156 */
157int btree_merge(struct btree_head *target, struct btree_head *victim,
158 struct btree_geo *geo, gfp_t gfp);
159
160/**
161 * btree_last - get last entry in btree
162 *
163 * @head: btree head
164 * @geo: btree geometry
165 * @key: last key
166 *
167 * Returns the last entry in the btree, and sets @key to the key
168 * of that entry; returns NULL if the tree is empty, in that case
169 * key is not changed.
170 */
171void *btree_last(struct btree_head *head, struct btree_geo *geo,
172 unsigned long *key);
173
174/**
175 * btree_get_prev - get previous entry
176 *
177 * @head: btree head
178 * @geo: btree geometry
179 * @key: pointer to key
180 *
181 * The function returns the next item right before the value pointed to by
182 * @key, and updates @key with its key, or returns %NULL when there is no
183 * entry with a key smaller than the given key.
184 */
185void *btree_get_prev(struct btree_head *head, struct btree_geo *geo,
186 unsigned long *key);
187
188
189/* internal use, use btree_visitor{l,32,64,128} */
190size_t btree_visitor(struct btree_head *head, struct btree_geo *geo,
191 unsigned long opaque,
192 void (*func)(void *elem, unsigned long opaque,
193 unsigned long *key, size_t index,
194 void *func2),
195 void *func2);
196
197/* internal use, use btree_grim_visitor{l,32,64,128} */
198size_t btree_grim_visitor(struct btree_head *head, struct btree_geo *geo,
199 unsigned long opaque,
200 void (*func)(void *elem, unsigned long opaque,
201 unsigned long *key,
202 size_t index, void *func2),
203 void *func2);
204
205
206#include <linux/btree-128.h>
207
208extern struct btree_geo btree_geo32;
209#define BTREE_TYPE_SUFFIX l
210#define BTREE_TYPE_BITS BITS_PER_LONG
211#define BTREE_TYPE_GEO &btree_geo32
212#define BTREE_KEYTYPE unsigned long
213#include <linux/btree-type.h>
214
215#define btree_for_each_safel(head, key, val) \
216 for (val = btree_lastl(head, &key); \
217 val; \
218 val = btree_get_prevl(head, &key))
219
220#define BTREE_TYPE_SUFFIX 32
221#define BTREE_TYPE_BITS 32
222#define BTREE_TYPE_GEO &btree_geo32
223#define BTREE_KEYTYPE u32
224#include <linux/btree-type.h>
225
226#define btree_for_each_safe32(head, key, val) \
227 for (val = btree_last32(head, &key); \
228 val; \
229 val = btree_get_prev32(head, &key))
230
231extern struct btree_geo btree_geo64;
232#define BTREE_TYPE_SUFFIX 64
233#define BTREE_TYPE_BITS 64
234#define BTREE_TYPE_GEO &btree_geo64
235#define BTREE_KEYTYPE u64
236#include <linux/btree-type.h>
237
238#define btree_for_each_safe64(head, key, val) \
239 for (val = btree_last64(head, &key); \
240 val; \
241 val = btree_get_prev64(head, &key))
242
243#endif
diff --git a/lib/Kconfig b/lib/Kconfig
index bb1326d3839c..277fbfb233b9 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -156,6 +156,9 @@ config TEXTSEARCH_BM
156config TEXTSEARCH_FSM 156config TEXTSEARCH_FSM
157 tristate 157 tristate
158 158
159config BTREE
160 boolean
161
159config HAS_IOMEM 162config HAS_IOMEM
160 boolean 163 boolean
161 depends on !NO_IOMEM 164 depends on !NO_IOMEM
diff --git a/lib/Makefile b/lib/Makefile
index 2e78277eff9d..cff82612e98b 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -41,6 +41,7 @@ lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
41obj-$(CONFIG_GENERIC_FIND_LAST_BIT) += find_last_bit.o 41obj-$(CONFIG_GENERIC_FIND_LAST_BIT) += find_last_bit.o
42obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o 42obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
43obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o 43obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
44obj-$(CONFIG_BTREE) += btree.o
44obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o 45obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
45obj-$(CONFIG_DEBUG_LIST) += list_debug.o 46obj-$(CONFIG_DEBUG_LIST) += list_debug.o
46obj-$(CONFIG_DEBUG_OBJECTS) += debugobjects.o 47obj-$(CONFIG_DEBUG_OBJECTS) += debugobjects.o
diff --git a/lib/btree.c b/lib/btree.c
new file mode 100644
index 000000000000..41859a820218
--- /dev/null
+++ b/lib/btree.c
@@ -0,0 +1,797 @@
1/*
2 * lib/btree.c - Simple In-memory B+Tree
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2007-2008 Joern Engel <joern@logfs.org>
7 * Bits and pieces stolen from Peter Zijlstra's code, which is
8 * Copyright 2007, Red Hat Inc. Peter Zijlstra <pzijlstr@redhat.com>
9 * GPLv2
10 *
11 * see http://programming.kicks-ass.net/kernel-patches/vma_lookup/btree.patch
12 *
13 * A relatively simple B+Tree implementation. I have written it as a learning
 14 * exercise to understand how B+Trees work. Turned out to be useful as well.
15 *
16 * B+Trees can be used similar to Linux radix trees (which don't have anything
17 * in common with textbook radix trees, beware). Prerequisite for them working
18 * well is that access to a random tree node is much faster than a large number
19 * of operations within each node.
20 *
21 * Disks have fulfilled the prerequisite for a long time. More recently DRAM
22 * has gained similar properties, as memory access times, when measured in cpu
23 * cycles, have increased. Cacheline sizes have increased as well, which also
24 * helps B+Trees.
25 *
26 * Compared to radix trees, B+Trees are more efficient when dealing with a
27 * sparsely populated address space. Between 25% and 50% of the memory is
28 * occupied with valid pointers. When densely populated, radix trees contain
29 * ~98% pointers - hard to beat. Very sparse radix trees contain only ~2%
30 * pointers.
31 *
32 * This particular implementation stores pointers identified by a long value.
33 * Storing NULL pointers is illegal, lookup will return NULL when no entry
34 * was found.
35 *
 36 * A trick is used that is not commonly found in textbooks. The lowest
37 * values are to the right, not to the left. All used slots within a node
38 * are on the left, all unused slots contain NUL values. Most operations
39 * simply loop once over all slots and terminate on the first NUL.
40 */
41
42#include <linux/btree.h>
43#include <linux/cache.h>
44#include <linux/kernel.h>
45#include <linux/slab.h>
46#include <linux/module.h>
47
#define MAX(a, b) ((a) > (b) ? (a) : (b))
/* Size of one tree node: at least 128 bytes, or one L1 cacheline. */
#define NODESIZE MAX(L1_CACHE_BYTES, 128)

/*
 * Per-key-width geometry: @keylen is the number of longs per key,
 * @no_pairs the number of key/value slots per node and @no_longs the
 * size of the key area in longs (keylen * no_pairs).
 */
struct btree_geo {
	int keylen;
	int no_pairs;
	int no_longs;
};

/* Keys that fit a single unsigned long (also backs the u32 variant). */
struct btree_geo btree_geo32 = {
	.keylen = 1,
	.no_pairs = NODESIZE / sizeof(long) / 2,
	.no_longs = NODESIZE / sizeof(long) / 2,
};
EXPORT_SYMBOL_GPL(btree_geo32);

/* Number of longs needed to hold one u64 (2 on 32-bit, 1 on 64-bit). */
#define LONG_PER_U64 (64 / BITS_PER_LONG)
struct btree_geo btree_geo64 = {
	.keylen = LONG_PER_U64,
	.no_pairs = NODESIZE / sizeof(long) / (1 + LONG_PER_U64),
	.no_longs = LONG_PER_U64 * (NODESIZE / sizeof(long) / (1 + LONG_PER_U64)),
};
EXPORT_SYMBOL_GPL(btree_geo64);

/* 128-bit keys: a pair of u64s, see linux/btree-128.h. */
struct btree_geo btree_geo128 = {
	.keylen = 2 * LONG_PER_U64,
	.no_pairs = NODESIZE / sizeof(long) / (1 + 2 * LONG_PER_U64),
	.no_longs = 2 * LONG_PER_U64 * (NODESIZE / sizeof(long) / (1 + 2 * LONG_PER_U64)),
};
EXPORT_SYMBOL_GPL(btree_geo128);

/* Slab cache backing node allocations (set up elsewhere, not in this chunk). */
static struct kmem_cache *btree_cachep;
80
/* mempool allocation callback: nodes come from the btree slab cache. */
void *btree_alloc(gfp_t gfp_mask, void *pool_data)
{
	return kmem_cache_alloc(btree_cachep, gfp_mask);
}
EXPORT_SYMBOL_GPL(btree_alloc);

/* mempool free callback, matching btree_alloc(). */
void btree_free(void *element, void *pool_data)
{
	kmem_cache_free(btree_cachep, element);
}
EXPORT_SYMBOL_GPL(btree_free);
92
93static unsigned long *btree_node_alloc(struct btree_head *head, gfp_t gfp)
94{
95 unsigned long *node;
96
97 node = mempool_alloc(head->mempool, gfp);
98 memset(node, 0, NODESIZE);
99 return node;
100}
101
/*
 * Lexicographically compare two @n-word keys, most significant word
 * first.  Returns -1, 0 or 1, memcmp-style.
 */
static int longcmp(const unsigned long *l1, const unsigned long *l2, size_t n)
{
	size_t idx = 0;

	while (idx < n) {
		if (l1[idx] != l2[idx])
			return (l1[idx] < l2[idx]) ? -1 : 1;
		idx++;
	}
	return 0;
}
114
/* Copy an @n-word key from @src to @dest; returns @dest (memcpy-style). */
static unsigned long *longcpy(unsigned long *dest, const unsigned long *src,
		size_t n)
{
	unsigned long *out = dest;

	while (n--)
		*out++ = *src++;
	return dest;
}
124
/* Fill the first @n words of @s with @c; returns @s (memset-style). */
static unsigned long *longset(unsigned long *s, unsigned long c, size_t n)
{
	unsigned long *out = s;

	while (n--)
		*out++ = c;
	return s;
}
133
/*
 * Decrement a multi-word key by one (most significant word at index 0).
 * Walks up from the least significant word, subtracting one per word;
 * a word that was zero wraps around and the borrow propagates to the
 * next word.  An all-zero key wraps to all-ones.
 */
static void dec_key(struct btree_geo *geo, unsigned long *key)
{
	unsigned long val;
	int i;

	for (i = geo->keylen - 1; i >= 0; i--) {
		val = key[i];
		key[i] = val - 1;
		if (val)
			break;
	}
}
146
/* Address of the n-th key in @node (each key is geo->keylen longs). */
static unsigned long *bkey(struct btree_geo *geo, unsigned long *node, int n)
{
	return &node[n * geo->keylen];
}

/* n-th value of @node; values live after the geo->no_longs key area. */
static void *bval(struct btree_geo *geo, unsigned long *node, int n)
{
	return (void *)node[geo->no_longs + n];
}

/* Store @key into the n-th key slot of @node. */
static void setkey(struct btree_geo *geo, unsigned long *node, int n,
		unsigned long *key)
{
	longcpy(bkey(geo, node, n), key, geo->keylen);
}

/* Store @val into the n-th value slot of @node. */
static void setval(struct btree_geo *geo, unsigned long *node, int n,
		void *val)
{
	node[geo->no_longs + n] = (unsigned long) val;
}

/* Zero key and value of slot @n; a NUL value marks the slot unused. */
static void clearpair(struct btree_geo *geo, unsigned long *node, int n)
{
	longset(bkey(geo, node, n), 0, geo->keylen);
	node[geo->no_longs + n] = 0;
}
174
/* Reset a head to the empty-tree state (no root node, height 0). */
static inline void __btree_init(struct btree_head *head)
{
	head->node = NULL;
	head->height = 0;
}

/* Initialise @head using a caller-owned mempool for node allocations. */
void btree_init_mempool(struct btree_head *head, mempool_t *mempool)
{
	__btree_init(head);
	head->mempool = mempool;
}
EXPORT_SYMBOL_GPL(btree_init_mempool);

/*
 * Initialise @head with a private mempool (min_nr 0, so nodes come
 * straight from btree_alloc()/the slab cache).  Returns 0 or -ENOMEM.
 */
int btree_init(struct btree_head *head)
{
	__btree_init(head);
	head->mempool = mempool_create(0, btree_alloc, btree_free, NULL);
	if (!head->mempool)
		return -ENOMEM;
	return 0;
}
EXPORT_SYMBOL_GPL(btree_init);

/*
 * Free the private mempool; use only after btree_init(), not after
 * btree_init_mempool() (the caller owns that pool).
 */
void btree_destroy(struct btree_head *head)
{
	mempool_destroy(head->mempool);
	head->mempool = NULL;
}
EXPORT_SYMBOL_GPL(btree_destroy);
204
205void *btree_last(struct btree_head *head, struct btree_geo *geo,
206 unsigned long *key)
207{
208 int height = head->height;
209 unsigned long *node = head->node;
210
211 if (height == 0)
212 return NULL;
213
214 for ( ; height > 1; height--)
215 node = bval(geo, node, 0);
216
217 longcpy(key, bkey(geo, node, 0), geo->keylen);
218 return bval(geo, node, 0);
219}
220EXPORT_SYMBOL_GPL(btree_last);
221
/* Compare the key in slot @pos of @node with @key (memcmp-style result). */
static int keycmp(struct btree_geo *geo, unsigned long *node, int pos,
		unsigned long *key)
{
	return longcmp(bkey(geo, node, pos), key, geo->keylen);
}

/* Return 1 iff every word of @key is zero (the "no key" sentinel). */
static int keyzero(struct btree_geo *geo, unsigned long *key)
{
	int i;

	for (i = 0; i < geo->keylen; i++)
		if (key[i])
			return 0;

	return 1;
}
238
239void *btree_lookup(struct btree_head *head, struct btree_geo *geo,
240 unsigned long *key)
241{
242 int i, height = head->height;
243 unsigned long *node = head->node;
244
245 if (height == 0)
246 return NULL;
247
248 for ( ; height > 1; height--) {
249 for (i = 0; i < geo->no_pairs; i++)
250 if (keycmp(geo, node, i, key) <= 0)
251 break;
252 if (i == geo->no_pairs)
253 return NULL;
254 node = bval(geo, node, i);
255 if (!node)
256 return NULL;
257 }
258
259 if (!node)
260 return NULL;
261
262 for (i = 0; i < geo->no_pairs; i++)
263 if (keycmp(geo, node, i, key) == 0)
264 return bval(geo, node, i);
265 return NULL;
266}
267EXPORT_SYMBOL_GPL(btree_lookup);
268
269int btree_update(struct btree_head *head, struct btree_geo *geo,
270 unsigned long *key, void *val)
271{
272 int i, height = head->height;
273 unsigned long *node = head->node;
274
275 if (height == 0)
276 return -ENOENT;
277
278 for ( ; height > 1; height--) {
279 for (i = 0; i < geo->no_pairs; i++)
280 if (keycmp(geo, node, i, key) <= 0)
281 break;
282 if (i == geo->no_pairs)
283 return -ENOENT;
284 node = bval(geo, node, i);
285 if (!node)
286 return -ENOENT;
287 }
288
289 if (!node)
290 return -ENOENT;
291
292 for (i = 0; i < geo->no_pairs; i++)
293 if (keycmp(geo, node, i, key) == 0) {
294 setval(geo, node, i, val);
295 return 0;
296 }
297 return -ENOENT;
298}
299EXPORT_SYMBOL_GPL(btree_update);
300
301/*
302 * Usually this function is quite similar to normal lookup. But the key of
303 * a parent node may be smaller than the smallest key of all its siblings.
304 * In such a case we cannot just return NULL, as we have only proven that no
305 * key smaller than __key, but larger than this parent key exists.
306 * So we set __key to the parent key and retry. We have to use the smallest
307 * such parent key, which is the last parent key we encountered.
308 */
309void *btree_get_prev(struct btree_head *head, struct btree_geo *geo,
310 unsigned long *__key)
311{
312 int i, height;
313 unsigned long *node, *oldnode;
314 unsigned long *retry_key = NULL, key[geo->keylen];
315
316 if (keyzero(geo, __key))
317 return NULL;
318
319 if (head->height == 0)
320 return NULL;
321retry:
322 longcpy(key, __key, geo->keylen);
323 dec_key(geo, key);
324
325 node = head->node;
326 for (height = head->height ; height > 1; height--) {
327 for (i = 0; i < geo->no_pairs; i++)
328 if (keycmp(geo, node, i, key) <= 0)
329 break;
330 if (i == geo->no_pairs)
331 goto miss;
332 oldnode = node;
333 node = bval(geo, node, i);
334 if (!node)
335 goto miss;
336 retry_key = bkey(geo, oldnode, i);
337 }
338
339 if (!node)
340 goto miss;
341
342 for (i = 0; i < geo->no_pairs; i++) {
343 if (keycmp(geo, node, i, key) <= 0) {
344 if (bval(geo, node, i)) {
345 longcpy(__key, bkey(geo, node, i), geo->keylen);
346 return bval(geo, node, i);
347 } else
348 goto miss;
349 }
350 }
351miss:
352 if (retry_key) {
353 __key = retry_key;
354 retry_key = NULL;
355 goto retry;
356 }
357 return NULL;
358}
359
/*
 * Index of the first slot in @node whose key is <= @key (keys are in
 * descending order); geo->no_pairs when every key is larger.
 */
static int getpos(struct btree_geo *geo, unsigned long *node,
		unsigned long *key)
{
	int i;

	for (i = 0; i < geo->no_pairs; i++) {
		if (keycmp(geo, node, i, key) <= 0)
			break;
	}
	return i;
}

/* Number of used slots: scan from @start for the first NUL value. */
static int getfill(struct btree_geo *geo, unsigned long *node, int start)
{
	int i;

	for (i = start; i < geo->no_pairs; i++)
		if (!bval(geo, node, i))
			break;
	return i;
}
381
/*
 * locate the correct leaf node in the btree
 *
 * Descends from the root to @level, following at each node the first
 * slot whose key is <= @key.  When @key is larger than every key in a
 * node, the right-most separator key is widened to @key on the way
 * down, so this function may modify the tree.
 */
static unsigned long *find_level(struct btree_head *head, struct btree_geo *geo,
		unsigned long *key, int level)
{
	unsigned long *node = head->node;
	int i, height;

	for (height = head->height; height > level; height--) {
		for (i = 0; i < geo->no_pairs; i++)
			if (keycmp(geo, node, i, key) <= 0)
				break;

		if ((i == geo->no_pairs) || !bval(geo, node, i)) {
			/* right-most key is too large, update it */
			/* FIXME: If the right-most key on higher levels is
			 * always zero, this wouldn't be necessary. */
			i--;
			setkey(geo, node, i, key);
		}
		BUG_ON(i < 0);
		node = bval(geo, node, i);
	}
	BUG_ON(!node);
	return node;
}
409
/*
 * Add one level on top of the tree: a fresh root whose single child is
 * the old root (if any).  Returns 0 or -ENOMEM.
 */
static int btree_grow(struct btree_head *head, struct btree_geo *geo,
		gfp_t gfp)
{
	unsigned long *node;
	int fill;

	node = btree_node_alloc(head, gfp);
	if (!node)
		return -ENOMEM;
	if (head->node) {
		/* New root's key is the smallest key of the old root. */
		fill = getfill(geo, head->node, 0);
		setkey(geo, node, 0, bkey(geo, head->node, fill - 1));
		setval(geo, node, 0, head->node);
	}
	head->node = node;
	head->height++;
	return 0;
}

/*
 * Drop a root that has only a single child, promoting that child to
 * root.  No-op for trees of height <= 1.
 */
static void btree_shrink(struct btree_head *head, struct btree_geo *geo)
{
	unsigned long *node;
	int fill;

	if (head->height <= 1)
		return;

	node = head->node;
	fill = getfill(geo, node, 0);
	BUG_ON(fill > 1);
	head->node = bval(geo, node, 0);
	head->height--;
	mempool_free(node, head->mempool);
}
444
/*
 * Insert @key/@val into the node at @level (1 == leaf).  Grows the tree
 * when @level exceeds the current height, and recursively splits full
 * nodes -- after a split the insert is retried from the top.  Returns 0
 * or -ENOMEM.
 */
static int btree_insert_level(struct btree_head *head, struct btree_geo *geo,
		unsigned long *key, void *val, int level,
		gfp_t gfp)
{
	unsigned long *node;
	int i, pos, fill, err;

	BUG_ON(!val);
	if (head->height < level) {
		err = btree_grow(head, geo, gfp);
		if (err)
			return err;
	}

retry:
	node = find_level(head, geo, key, level);
	pos = getpos(geo, node, key);
	fill = getfill(geo, node, pos);
	/* two identical keys are not allowed */
	BUG_ON(pos < fill && keycmp(geo, node, pos, key) == 0);

	if (fill == geo->no_pairs) {
		/* need to split node */
		unsigned long *new;

		new = btree_node_alloc(head, gfp);
		if (!new)
			return -ENOMEM;
		/* Link the new sibling into the parent first ... */
		err = btree_insert_level(head, geo,
				bkey(geo, node, fill / 2 - 1),
				new, level + 1, gfp);
		if (err) {
			mempool_free(new, head->mempool);
			return err;
		}
		/* ... then move the first half of the pairs into it. */
		for (i = 0; i < fill / 2; i++) {
			setkey(geo, new, i, bkey(geo, node, i));
			setval(geo, new, i, bval(geo, node, i));
			setkey(geo, node, i, bkey(geo, node, i + fill / 2));
			setval(geo, node, i, bval(geo, node, i + fill / 2));
			clearpair(geo, node, i + fill / 2);
		}
		if (fill & 1) {
			/* Odd fill: one leftover pair slides down as well. */
			setkey(geo, node, i, bkey(geo, node, fill - 1));
			setval(geo, node, i, bval(geo, node, fill - 1));
			clearpair(geo, node, fill - 1);
		}
		goto retry;
	}
	BUG_ON(fill >= geo->no_pairs);

	/* shift and insert */
	for (i = fill; i > pos; i--) {
		setkey(geo, node, i, bkey(geo, node, i - 1));
		setval(geo, node, i, bval(geo, node, i - 1));
	}
	setkey(geo, node, pos, key);
	setval(geo, node, pos, val);

	return 0;
}
506
/* Public insert: entries always live at the leaf level (level 1). */
int btree_insert(struct btree_head *head, struct btree_geo *geo,
		unsigned long *key, void *val, gfp_t gfp)
{
	return btree_insert_level(head, geo, key, val, 1, gfp);
}
EXPORT_SYMBOL_GPL(btree_insert);
513
static void *btree_remove_level(struct btree_head *head, struct btree_geo *geo,
		unsigned long *key, int level);
/*
 * Merge @right into @left (neighbouring children of @parent, @left at
 * slot @lpos).  The combined pairs must fit in one node; @right is then
 * unlinked from the parent and freed.
 */
static void merge(struct btree_head *head, struct btree_geo *geo, int level,
		unsigned long *left, int lfill,
		unsigned long *right, int rfill,
		unsigned long *parent, int lpos)
{
	int i;

	for (i = 0; i < rfill; i++) {
		/* Move all keys to the left */
		setkey(geo, left, lfill + i, bkey(geo, right, i));
		setval(geo, left, lfill + i, bval(geo, right, i));
	}
	/* Exchange left and right child in parent */
	setval(geo, parent, lpos, right);
	setval(geo, parent, lpos + 1, left);
	/* Remove left (formerly right) child from parent */
	btree_remove_level(head, geo, bkey(geo, parent, lpos), level + 1);
	mempool_free(right, head->mempool);
}
535
/*
 * Called after a removal left @child (at @level, containing @fill used
 * pairs) less than half full.  Merges @child with a neighbour when the
 * combined fill fits into one node; otherwise the tree is left as-is
 * (see the invariant comment at the bottom).
 */
static void rebalance(struct btree_head *head, struct btree_geo *geo,
		unsigned long *key, int level, unsigned long *child, int fill)
{
	unsigned long *parent, *left = NULL, *right = NULL;
	int i, no_left, no_right;

	if (fill == 0) {
		/* Because we don't steal entries from a neighbour, this case
		 * can happen.  Parent node contains a single child, this
		 * node, so merging with a sibling never happens.
		 */
		btree_remove_level(head, geo, key, level + 1);
		mempool_free(child, head->mempool);
		return;
	}

	parent = find_level(head, geo, key, level + 1);
	i = getpos(geo, parent, key);
	BUG_ON(bval(geo, parent, i) != child);

	if (i > 0) {
		/* Try the left neighbour first. */
		left = bval(geo, parent, i - 1);
		no_left = getfill(geo, left, 0);
		if (fill + no_left <= geo->no_pairs) {
			merge(head, geo, level,
					left, no_left,
					child, fill,
					parent, i - 1);
			return;
		}
	}
	if (i + 1 < getfill(geo, parent, i)) {
		/* Otherwise try the right neighbour. */
		right = bval(geo, parent, i + 1);
		no_right = getfill(geo, right, 0);
		if (fill + no_right <= geo->no_pairs) {
			merge(head, geo, level,
					child, fill,
					right, no_right,
					parent, i);
			return;
		}
	}
	/*
	 * We could also try to steal one entry from the left or right
	 * neighbor.  By not doing so we changed the invariant from
	 * "all nodes are at least half full" to "no two neighboring
	 * nodes can be merged".  Which means that the average fill of
	 * all nodes is still half or better.
	 */
}
586
/*
 * Remove @key from the node at @level.  Returns the removed value, or
 * NULL when @key is absent (only checked at the leaf level; interior
 * levels are reached with keys known to exist).  Rebalances or shrinks
 * the tree when the node drops below half full.
 */
static void *btree_remove_level(struct btree_head *head, struct btree_geo *geo,
		unsigned long *key, int level)
{
	unsigned long *node;
	int i, pos, fill;
	void *ret;

	if (level > head->height) {
		/* we recursed all the way up */
		head->height = 0;
		head->node = NULL;
		return NULL;
	}

	node = find_level(head, geo, key, level);
	pos = getpos(geo, node, key);
	fill = getfill(geo, node, pos);
	if ((level == 1) && (keycmp(geo, node, pos, key) != 0))
		return NULL;
	ret = bval(geo, node, pos);

	/* remove and shift */
	for (i = pos; i < fill - 1; i++) {
		setkey(geo, node, i, bkey(geo, node, i + 1));
		setval(geo, node, i, bval(geo, node, i + 1));
	}
	clearpair(geo, node, fill - 1);

	if (fill - 1 < geo->no_pairs / 2) {
		if (level < head->height)
			rebalance(head, geo, key, level, node, fill - 1);
		else if (fill - 1 == 1)
			btree_shrink(head, geo);
	}

	return ret;
}
624
625void *btree_remove(struct btree_head *head, struct btree_geo *geo,
626 unsigned long *key)
627{
628 if (head->height == 0)
629 return NULL;
630
631 return btree_remove_level(head, geo, key, 1);
632}
633EXPORT_SYMBOL_GPL(btree_remove);
634
635int btree_merge(struct btree_head *target, struct btree_head *victim,
636 struct btree_geo *geo, gfp_t gfp)
637{
638 unsigned long key[geo->keylen];
639 unsigned long dup[geo->keylen];
640 void *val;
641 int err;
642
643 BUG_ON(target == victim);
644
645 if (!(target->node)) {
646 /* target is empty, just copy fields over */
647 target->node = victim->node;
648 target->height = victim->height;
649 __btree_init(victim);
650 return 0;
651 }
652
653 /* TODO: This needs some optimizations. Currently we do three tree
654 * walks to remove a single object from the victim.
655 */
656 for (;;) {
657 if (!btree_last(victim, geo, key))
658 break;
659 val = btree_lookup(victim, geo, key);
660 err = btree_insert(target, geo, key, val, gfp);
661 if (err)
662 return err;
663 /* We must make a copy of the key, as the original will get
664 * mangled inside btree_remove. */
665 longcpy(dup, key, geo->keylen);
666 btree_remove(victim, geo, dup);
667 }
668 return 0;
669}
670EXPORT_SYMBOL_GPL(btree_merge);
671
672static size_t __btree_for_each(struct btree_head *head, struct btree_geo *geo,
673 unsigned long *node, unsigned long opaque,
674 void (*func)(void *elem, unsigned long opaque,
675 unsigned long *key, size_t index,
676 void *func2),
677 void *func2, int reap, int height, size_t count)
678{
679 int i;
680 unsigned long *child;
681
682 for (i = 0; i < geo->no_pairs; i++) {
683 child = bval(geo, node, i);
684 if (!child)
685 break;
686 if (height > 1)
687 count = __btree_for_each(head, geo, child, opaque,
688 func, func2, reap, height - 1, count);
689 else
690 func(child, opaque, bkey(geo, node, i), count++,
691 func2);
692 }
693 if (reap)
694 mempool_free(node, head->mempool);
695 return count;
696}
697
/* Deliberately empty visitor, substituted when the caller passes no
 * callback cookie (func2 == NULL) so the tree walk still runs -- used
 * e.g. to count entries or to reap nodes without touching the values. */
static void empty(void *elem, unsigned long opaque, unsigned long *key,
		size_t index, void *func2)
{
}
702
703void visitorl(void *elem, unsigned long opaque, unsigned long *key,
704 size_t index, void *__func)
705{
706 visitorl_t func = __func;
707
708 func(elem, opaque, *key, index);
709}
710EXPORT_SYMBOL_GPL(visitorl);
711
712void visitor32(void *elem, unsigned long opaque, unsigned long *__key,
713 size_t index, void *__func)
714{
715 visitor32_t func = __func;
716 u32 *key = (void *)__key;
717
718 func(elem, opaque, *key, index);
719}
720EXPORT_SYMBOL_GPL(visitor32);
721
722void visitor64(void *elem, unsigned long opaque, unsigned long *__key,
723 size_t index, void *__func)
724{
725 visitor64_t func = __func;
726 u64 *key = (void *)__key;
727
728 func(elem, opaque, *key, index);
729}
730EXPORT_SYMBOL_GPL(visitor64);
731
732void visitor128(void *elem, unsigned long opaque, unsigned long *__key,
733 size_t index, void *__func)
734{
735 visitor128_t func = __func;
736 u64 *key = (void *)__key;
737
738 func(elem, opaque, key[0], key[1], index);
739}
740EXPORT_SYMBOL_GPL(visitor128);
741
742size_t btree_visitor(struct btree_head *head, struct btree_geo *geo,
743 unsigned long opaque,
744 void (*func)(void *elem, unsigned long opaque,
745 unsigned long *key,
746 size_t index, void *func2),
747 void *func2)
748{
749 size_t count = 0;
750
751 if (!func2)
752 func = empty;
753 if (head->node)
754 count = __btree_for_each(head, geo, head->node, opaque, func,
755 func2, 0, head->height, 0);
756 return count;
757}
758EXPORT_SYMBOL_GPL(btree_visitor);
759
760size_t btree_grim_visitor(struct btree_head *head, struct btree_geo *geo,
761 unsigned long opaque,
762 void (*func)(void *elem, unsigned long opaque,
763 unsigned long *key,
764 size_t index, void *func2),
765 void *func2)
766{
767 size_t count = 0;
768
769 if (!func2)
770 func = empty;
771 if (head->node)
772 count = __btree_for_each(head, geo, head->node, opaque, func,
773 func2, 1, head->height, 0);
774 __btree_init(head);
775 return count;
776}
777EXPORT_SYMBOL_GPL(btree_grim_visitor);
778
779static int __init btree_module_init(void)
780{
781 btree_cachep = kmem_cache_create("btree_node", NODESIZE, 0,
782 SLAB_HWCACHE_ALIGN, NULL);
783 return 0;
784}
785
/* Tear down the btree node cache; all trees must be destroyed by now. */
static void __exit btree_module_exit(void)
{
	kmem_cache_destroy(btree_cachep);
}
790
/* If core code starts using btree, initialization should happen even earlier */
module_init(btree_module_init);
module_exit(btree_module_exit);

/* Module metadata for the in-kernel btree library. */
MODULE_AUTHOR("Joern Engel <joern@logfs.org>");
MODULE_AUTHOR("Johannes Berg <johannes@sipsolutions.net>");
MODULE_LICENSE("GPL");