path: root/fs/logfs
author    Andrea Bastoni <bastoni@cs.unc.edu>  2010-05-30 19:16:45 -0400
committer Andrea Bastoni <bastoni@cs.unc.edu>  2010-05-30 19:16:45 -0400
commit    ada47b5fe13d89735805b566185f4885f5a3f750 (patch)
tree      644b88f8a71896307d71438e9b3af49126ffb22b /fs/logfs
parent    43e98717ad40a4ae64545b5ba047c7b86aa44f4f (diff)
parent    3280f21d43ee541f97f8cda5792150d2dbec20d5 (diff)
Merge branch 'wip-2.6.34' into old-private-master (archived-private-master)
Diffstat (limited to 'fs/logfs')
-rw-r--r--  fs/logfs/Kconfig        17
-rw-r--r--  fs/logfs/Makefile       13
-rw-r--r--  fs/logfs/compr.c        95
-rw-r--r--  fs/logfs/dev_bdev.c    333
-rw-r--r--  fs/logfs/dev_mtd.c     254
-rw-r--r--  fs/logfs/dir.c         827
-rw-r--r--  fs/logfs/file.c        263
-rw-r--r--  fs/logfs/gc.c          739
-rw-r--r--  fs/logfs/inode.c       418
-rw-r--r--  fs/logfs/journal.c     898
-rw-r--r--  fs/logfs/logfs.h       736
-rw-r--r--  fs/logfs/logfs_abi.h   629
-rw-r--r--  fs/logfs/readwrite.c  2267
-rw-r--r--  fs/logfs/segment.c     930
-rw-r--r--  fs/logfs/super.c       657
15 files changed, 9076 insertions, 0 deletions
diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig
new file mode 100644
index 000000000000..daf9a9b32dd3
--- /dev/null
+++ b/fs/logfs/Kconfig
@@ -0,0 +1,17 @@
1config LOGFS
2 tristate "LogFS file system (EXPERIMENTAL)"
3 depends on (MTD || BLOCK) && EXPERIMENTAL
4 select ZLIB_INFLATE
5 select ZLIB_DEFLATE
6 select CRC32
7 select BTREE
8 help
9	  Flash filesystem aimed at scaling efficiently to large devices.
10 In comparison to JFFS2 it offers significantly faster mount
11 times and potentially less RAM usage, although the latter has
12 not been measured yet.
13
14 In its current state it is still very experimental and should
15	  not be used for anything other than testing purposes.
16
17 If unsure, say N.
diff --git a/fs/logfs/Makefile b/fs/logfs/Makefile
new file mode 100644
index 000000000000..4820027787ee
--- /dev/null
+++ b/fs/logfs/Makefile
@@ -0,0 +1,13 @@
1obj-$(CONFIG_LOGFS) += logfs.o
2
3logfs-y += compr.o
4logfs-y += dir.o
5logfs-y += file.o
6logfs-y += gc.o
7logfs-y += inode.o
8logfs-y += journal.o
9logfs-y += readwrite.o
10logfs-y += segment.o
11logfs-y += super.o
12logfs-$(CONFIG_BLOCK) += dev_bdev.o
13logfs-$(CONFIG_MTD) += dev_mtd.o
diff --git a/fs/logfs/compr.c b/fs/logfs/compr.c
new file mode 100644
index 000000000000..44bbfd249abc
--- /dev/null
+++ b/fs/logfs/compr.c
@@ -0,0 +1,95 @@
1/*
2 * fs/logfs/compr.c - compression routines
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/vmalloc.h>
10#include <linux/zlib.h>
11
12#define COMPR_LEVEL 3
13
14static DEFINE_MUTEX(compr_mutex);
15static struct z_stream_s stream;
16
17int logfs_compress(void *in, void *out, size_t inlen, size_t outlen)
18{
19 int err, ret;
20
21 ret = -EIO;
22 mutex_lock(&compr_mutex);
23 err = zlib_deflateInit(&stream, COMPR_LEVEL);
24 if (err != Z_OK)
25 goto error;
26
27 stream.next_in = in;
28 stream.avail_in = inlen;
29 stream.total_in = 0;
30 stream.next_out = out;
31 stream.avail_out = outlen;
32 stream.total_out = 0;
33
34 err = zlib_deflate(&stream, Z_FINISH);
35 if (err != Z_STREAM_END)
36 goto error;
37
38 err = zlib_deflateEnd(&stream);
39 if (err != Z_OK)
40 goto error;
41
42 if (stream.total_out >= stream.total_in)
43 goto error;
44
45 ret = stream.total_out;
46error:
47 mutex_unlock(&compr_mutex);
48 return ret;
49}
50
51int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen)
52{
53 int err, ret;
54
55 ret = -EIO;
56 mutex_lock(&compr_mutex);
57 err = zlib_inflateInit(&stream);
58 if (err != Z_OK)
59 goto error;
60
61 stream.next_in = in;
62 stream.avail_in = inlen;
63 stream.total_in = 0;
64 stream.next_out = out;
65 stream.avail_out = outlen;
66 stream.total_out = 0;
67
68 err = zlib_inflate(&stream, Z_FINISH);
69 if (err != Z_STREAM_END)
70 goto error;
71
72 err = zlib_inflateEnd(&stream);
73 if (err != Z_OK)
74 goto error;
75
76 ret = 0;
77error:
78 mutex_unlock(&compr_mutex);
79 return ret;
80}
81
82int __init logfs_compr_init(void)
83{
84 size_t size = max(zlib_deflate_workspacesize(),
85 zlib_inflate_workspacesize());
86 stream.workspace = vmalloc(size);
87 if (!stream.workspace)
88 return -ENOMEM;
89 return 0;
90}
91
92void logfs_compr_exit(void)
93{
94 vfree(stream.workspace);
95}
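
Note the calling convention established above: logfs_compress() returns the
compressed length on success and -EIO when zlib fails or the data did not
shrink (total_out >= total_in). A minimal caller sketch under that
convention; write_compressed() and write_raw() are hypothetical helpers, not
part of this patch:

/* Sketch only: falls back to raw storage for incompressible data. */
static int store_block(void *buf, size_t len, void *scratch)
{
	int clen = logfs_compress(buf, scratch, len, len);

	if (clen >= 0)	/* data shrank: store compressed */
		return write_compressed(scratch, clen);	/* hypothetical */
	return write_raw(buf, len);			/* hypothetical */
}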
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
new file mode 100644
index 000000000000..243c00071f76
--- /dev/null
+++ b/fs/logfs/dev_bdev.c
@@ -0,0 +1,333 @@
1/*
2 * fs/logfs/dev_bdev.c - Device access methods for block devices
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/bio.h>
10#include <linux/blkdev.h>
11#include <linux/buffer_head.h>
12#include <linux/gfp.h>
13
14#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
15
16static void request_complete(struct bio *bio, int err)
17{
18 complete((struct completion *)bio->bi_private);
19}
20
21static int sync_request(struct page *page, struct block_device *bdev, int rw)
22{
23 struct bio bio;
24 struct bio_vec bio_vec;
25 struct completion complete;
26
27 bio_init(&bio);
28 bio.bi_io_vec = &bio_vec;
29 bio_vec.bv_page = page;
30 bio_vec.bv_len = PAGE_SIZE;
31 bio_vec.bv_offset = 0;
32 bio.bi_vcnt = 1;
33 bio.bi_idx = 0;
34 bio.bi_size = PAGE_SIZE;
35 bio.bi_bdev = bdev;
36 bio.bi_sector = page->index * (PAGE_SIZE >> 9);
37 init_completion(&complete);
38 bio.bi_private = &complete;
39 bio.bi_end_io = request_complete;
40
41 submit_bio(rw, &bio);
42 generic_unplug_device(bdev_get_queue(bdev));
43 wait_for_completion(&complete);
44 return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO;
45}
46
47static int bdev_readpage(void *_sb, struct page *page)
48{
49 struct super_block *sb = _sb;
50 struct block_device *bdev = logfs_super(sb)->s_bdev;
51 int err;
52
53 err = sync_request(page, bdev, READ);
54 if (err) {
55 ClearPageUptodate(page);
56 SetPageError(page);
57 } else {
58 SetPageUptodate(page);
59 ClearPageError(page);
60 }
61 unlock_page(page);
62 return err;
63}
64
65static DECLARE_WAIT_QUEUE_HEAD(wq);
66
67static void writeseg_end_io(struct bio *bio, int err)
68{
69 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
70 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
71 struct super_block *sb = bio->bi_private;
72 struct logfs_super *super = logfs_super(sb);
73 struct page *page;
74
75 BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */
76 BUG_ON(err);
77 BUG_ON(bio->bi_vcnt == 0);
78 do {
79 page = bvec->bv_page;
80 if (--bvec >= bio->bi_io_vec)
81 prefetchw(&bvec->bv_page->flags);
82
83 end_page_writeback(page);
84 page_cache_release(page);
85 } while (bvec >= bio->bi_io_vec);
86 bio_put(bio);
87 if (atomic_dec_and_test(&super->s_pending_writes))
88 wake_up(&wq);
89}
90
91static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
92 size_t nr_pages)
93{
94 struct logfs_super *super = logfs_super(sb);
95 struct address_space *mapping = super->s_mapping_inode->i_mapping;
96 struct bio *bio;
97 struct page *page;
98 struct request_queue *q = bdev_get_queue(sb->s_bdev);
99 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
100 int i;
101
102 if (max_pages > BIO_MAX_PAGES)
103 max_pages = BIO_MAX_PAGES;
104 bio = bio_alloc(GFP_NOFS, max_pages);
105 BUG_ON(!bio);
106
107 for (i = 0; i < nr_pages; i++) {
108 if (i >= max_pages) {
109 /* Block layer cannot split bios :( */
110 bio->bi_vcnt = i;
111 bio->bi_idx = 0;
112 bio->bi_size = i * PAGE_SIZE;
113 bio->bi_bdev = super->s_bdev;
114 bio->bi_sector = ofs >> 9;
115 bio->bi_private = sb;
116 bio->bi_end_io = writeseg_end_io;
117 atomic_inc(&super->s_pending_writes);
118 submit_bio(WRITE, bio);
119
120 ofs += i * PAGE_SIZE;
121 index += i;
122 nr_pages -= i;
123 i = 0;
124
125 bio = bio_alloc(GFP_NOFS, max_pages);
126 BUG_ON(!bio);
127 }
128 page = find_lock_page(mapping, index + i);
129 BUG_ON(!page);
130 bio->bi_io_vec[i].bv_page = page;
131 bio->bi_io_vec[i].bv_len = PAGE_SIZE;
132 bio->bi_io_vec[i].bv_offset = 0;
133
134 BUG_ON(PageWriteback(page));
135 set_page_writeback(page);
136 unlock_page(page);
137 }
138 bio->bi_vcnt = nr_pages;
139 bio->bi_idx = 0;
140 bio->bi_size = nr_pages * PAGE_SIZE;
141 bio->bi_bdev = super->s_bdev;
142 bio->bi_sector = ofs >> 9;
143 bio->bi_private = sb;
144 bio->bi_end_io = writeseg_end_io;
145 atomic_inc(&super->s_pending_writes);
146 submit_bio(WRITE, bio);
147 return 0;
148}
149
150static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len)
151{
152 struct logfs_super *super = logfs_super(sb);
153 int head;
154
155 BUG_ON(super->s_flags & LOGFS_SB_FLAG_RO);
156
157 if (len == 0) {
158 /* This can happen when an object fits perfectly into a
159 * segment; the segment then gets written out by a sync and
160 * is subsequently closed.
161 */
162 return;
163 }
164 head = ofs & (PAGE_SIZE - 1);
165 if (head) {
166 ofs -= head;
167 len += head;
168 }
169 len = PAGE_ALIGN(len);
170 __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
171 generic_unplug_device(bdev_get_queue(logfs_super(sb)->s_bdev));
172}
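
A quick worked example of the head/tail alignment above, assuming 4 KiB
pages and illustrative values:

/*
 * ofs  = 0x11234, len = 0x800
 * head = ofs & (PAGE_SIZE - 1) = 0x234
 * ofs -= head      ->  ofs = 0x11000  (page-aligned start)
 * len += head      ->  len = 0xa34
 * PAGE_ALIGN(len)  ->  len = 0x1000   (one full page)
 *
 * The partial range is thus widened to the whole page 0x11000..0x11fff
 * before __bdev_writeseg() is called.
 */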
173
174
175static void erase_end_io(struct bio *bio, int err)
176{
177 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
178 struct super_block *sb = bio->bi_private;
179 struct logfs_super *super = logfs_super(sb);
180
181 BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */
182 BUG_ON(err);
183 BUG_ON(bio->bi_vcnt == 0);
184 bio_put(bio);
185 if (atomic_dec_and_test(&super->s_pending_writes))
186 wake_up(&wq);
187}
188
189static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
190 size_t nr_pages)
191{
192 struct logfs_super *super = logfs_super(sb);
193 struct bio *bio;
194 struct request_queue *q = bdev_get_queue(sb->s_bdev);
195 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
196 int i;
197
198 if (max_pages > BIO_MAX_PAGES)
199 max_pages = BIO_MAX_PAGES;
200 bio = bio_alloc(GFP_NOFS, max_pages);
201 BUG_ON(!bio);
202
203 for (i = 0; i < nr_pages; i++) {
204 if (i >= max_pages) {
205 /* Block layer cannot split bios :( */
206 bio->bi_vcnt = i;
207 bio->bi_idx = 0;
208 bio->bi_size = i * PAGE_SIZE;
209 bio->bi_bdev = super->s_bdev;
210 bio->bi_sector = ofs >> 9;
211 bio->bi_private = sb;
212 bio->bi_end_io = erase_end_io;
213 atomic_inc(&super->s_pending_writes);
214 submit_bio(WRITE, bio);
215
216 ofs += i * PAGE_SIZE;
217 index += i;
218 nr_pages -= i;
219 i = 0;
220
221 bio = bio_alloc(GFP_NOFS, max_pages);
222 BUG_ON(!bio);
223 }
224 bio->bi_io_vec[i].bv_page = super->s_erase_page;
225 bio->bi_io_vec[i].bv_len = PAGE_SIZE;
226 bio->bi_io_vec[i].bv_offset = 0;
227 }
228 bio->bi_vcnt = nr_pages;
229 bio->bi_idx = 0;
230 bio->bi_size = nr_pages * PAGE_SIZE;
231 bio->bi_bdev = super->s_bdev;
232 bio->bi_sector = ofs >> 9;
233 bio->bi_private = sb;
234 bio->bi_end_io = erase_end_io;
235 atomic_inc(&super->s_pending_writes);
236 submit_bio(WRITE, bio);
237 return 0;
238}
239
240static int bdev_erase(struct super_block *sb, loff_t to, size_t len,
241 int ensure_write)
242{
243 struct logfs_super *super = logfs_super(sb);
244
245 BUG_ON(to & (PAGE_SIZE - 1));
246 BUG_ON(len & (PAGE_SIZE - 1));
247
248 if (super->s_flags & LOGFS_SB_FLAG_RO)
249 return -EROFS;
250
251 if (ensure_write) {
252 /*
253 * Object store doesn't care whether erases happen or not.
254 * But for the journal they are required. Otherwise a scan
255 * can find an old commit entry and assume it is the current
256 * one, travelling back in time.
257 */
258 do_erase(sb, to, to >> PAGE_SHIFT, len >> PAGE_SHIFT);
259 }
260
261 return 0;
262}
263
264static void bdev_sync(struct super_block *sb)
265{
266 struct logfs_super *super = logfs_super(sb);
267
268 wait_event(wq, atomic_read(&super->s_pending_writes) == 0);
269}
270
271static struct page *bdev_find_first_sb(struct super_block *sb, u64 *ofs)
272{
273 struct logfs_super *super = logfs_super(sb);
274 struct address_space *mapping = super->s_mapping_inode->i_mapping;
275 filler_t *filler = bdev_readpage;
276
277 *ofs = 0;
278 return read_cache_page(mapping, 0, filler, sb);
279}
280
281static struct page *bdev_find_last_sb(struct super_block *sb, u64 *ofs)
282{
283 struct logfs_super *super = logfs_super(sb);
284 struct address_space *mapping = super->s_mapping_inode->i_mapping;
285 filler_t *filler = bdev_readpage;
286 u64 pos = (super->s_bdev->bd_inode->i_size & ~0xfffULL) - 0x1000;
287 pgoff_t index = pos >> PAGE_SHIFT;
288
289 *ofs = pos;
290 return read_cache_page(mapping, index, filler, sb);
291}
292
293static int bdev_write_sb(struct super_block *sb, struct page *page)
294{
295 struct block_device *bdev = logfs_super(sb)->s_bdev;
296
297 /* Nothing special to do for block devices. */
298 return sync_request(page, bdev, WRITE);
299}
300
301static void bdev_put_device(struct super_block *sb)
302{
303 close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE);
304}
305
306static const struct logfs_device_ops bd_devops = {
307 .find_first_sb = bdev_find_first_sb,
308 .find_last_sb = bdev_find_last_sb,
309 .write_sb = bdev_write_sb,
310 .readpage = bdev_readpage,
311 .writeseg = bdev_writeseg,
312 .erase = bdev_erase,
313 .sync = bdev_sync,
314 .put_device = bdev_put_device,
315};
316
317int logfs_get_sb_bdev(struct file_system_type *type, int flags,
318 const char *devname, struct vfsmount *mnt)
319{
320 struct block_device *bdev;
321
322 bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type);
323 if (IS_ERR(bdev))
324 return PTR_ERR(bdev);
325
326 if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
327 int mtdnr = MINOR(bdev->bd_dev);
328 close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
329 return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
330 }
331
332 return logfs_get_sb_device(type, flags, NULL, bdev, &bd_devops, mnt);
333}
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
new file mode 100644
index 000000000000..cafb6ef2e05b
--- /dev/null
+++ b/fs/logfs/dev_mtd.c
@@ -0,0 +1,254 @@
1/*
2 * fs/logfs/dev_mtd.c - Device access methods for MTD
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/completion.h>
10#include <linux/mount.h>
11#include <linux/sched.h>
12
13#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
14
15static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf)
16{
17 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
18 size_t retlen;
19 int ret;
20
21 ret = mtd->read(mtd, ofs, len, &retlen, buf);
22 BUG_ON(ret == -EINVAL);
23 if (ret)
24 return ret;
25
26 /* Not sure if we should loop instead. */
27 if (retlen != len)
28 return -EIO;
29
30 return 0;
31}
32
33static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf)
34{
35 struct logfs_super *super = logfs_super(sb);
36 struct mtd_info *mtd = super->s_mtd;
37 size_t retlen;
38 loff_t page_start, page_end;
39 int ret;
40
41 if (super->s_flags & LOGFS_SB_FLAG_RO)
42 return -EROFS;
43
44 BUG_ON((ofs >= mtd->size) || (len > mtd->size - ofs));
45 BUG_ON(ofs != (ofs >> super->s_writeshift) << super->s_writeshift);
46 BUG_ON(len > PAGE_CACHE_SIZE);
47 page_start = ofs & PAGE_CACHE_MASK;
48 page_end = PAGE_CACHE_ALIGN(ofs + len) - 1;
49 ret = mtd->write(mtd, ofs, len, &retlen, buf);
50 if (ret || (retlen != len))
51 return -EIO;
52
53 return 0;
54}
55
56/*
57 * For as long as I can remember (since about 2001) mtd->erase has been an
58 * asynchronous interface, still lacking a first driver that actually uses
59 * its asynchronous properties. So just to prevent the first implementor of
60 * such a thing from breaking logfs in 2350, we do the usual pointless dance:
61 * declare a completion variable and wait for completion before returning
62 * from mtd_erase(). What an exercise in futility!
63 */
64static void logfs_erase_callback(struct erase_info *ei)
65{
66 complete((struct completion *)ei->priv);
67}
68
69static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len)
70{
71 struct logfs_super *super = logfs_super(sb);
72 struct address_space *mapping = super->s_mapping_inode->i_mapping;
73 struct page *page;
74 pgoff_t index = ofs >> PAGE_SHIFT;
75
76 for (index = ofs >> PAGE_SHIFT; index < (ofs + len) >> PAGE_SHIFT; index++) {
77 page = find_get_page(mapping, index);
78 if (!page)
79 continue;
80 memset(page_address(page), 0xFF, PAGE_SIZE);
81 page_cache_release(page);
82 }
83 return 0;
84}
85
86static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len,
87 int ensure_write)
88{
89 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
90 struct erase_info ei;
91 DECLARE_COMPLETION_ONSTACK(complete);
92 int ret;
93
94 BUG_ON(len % mtd->erasesize);
95 if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO)
96 return -EROFS;
97
98 memset(&ei, 0, sizeof(ei));
99 ei.mtd = mtd;
100 ei.addr = ofs;
101 ei.len = len;
102 ei.callback = logfs_erase_callback;
103 ei.priv = (long)&complete;
104 ret = mtd->erase(mtd, &ei);
105 if (ret)
106 return -EIO;
107
108 wait_for_completion(&complete);
109 if (ei.state != MTD_ERASE_DONE)
110 return -EIO;
111 return mtd_erase_mapping(sb, ofs, len);
112}
113
114static void mtd_sync(struct super_block *sb)
115{
116 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
117
118 if (mtd->sync)
119 mtd->sync(mtd);
120}
121
122static int mtd_readpage(void *_sb, struct page *page)
123{
124 struct super_block *sb = _sb;
125 int err;
126
127 err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
128 page_address(page));
129 if (err == -EUCLEAN) {
130 err = 0;
131 /* FIXME: force GC this segment */
132 }
133 if (err) {
134 ClearPageUptodate(page);
135 SetPageError(page);
136 } else {
137 SetPageUptodate(page);
138 ClearPageError(page);
139 }
140 unlock_page(page);
141 return err;
142}
143
144static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs)
145{
146 struct logfs_super *super = logfs_super(sb);
147 struct address_space *mapping = super->s_mapping_inode->i_mapping;
148 filler_t *filler = mtd_readpage;
149 struct mtd_info *mtd = super->s_mtd;
150
151 if (!mtd->block_isbad)
152 return NULL;
153
154 *ofs = 0;
155 while (mtd->block_isbad(mtd, *ofs)) {
156 *ofs += mtd->erasesize;
157 if (*ofs >= mtd->size)
158 return NULL;
159 }
160 BUG_ON(*ofs & ~PAGE_MASK);
161 return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
162}
163
164static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs)
165{
166 struct logfs_super *super = logfs_super(sb);
167 struct address_space *mapping = super->s_mapping_inode->i_mapping;
168 filler_t *filler = mtd_readpage;
169 struct mtd_info *mtd = super->s_mtd;
170
171 if (!mtd->block_isbad)
172 return NULL;
173
174 *ofs = mtd->size - mtd->erasesize;
175 while (mtd->block_isbad(mtd, *ofs)) {
176 *ofs -= mtd->erasesize;
177 if (*ofs <= 0)
178 return NULL;
179 }
180 *ofs = *ofs + mtd->erasesize - 0x1000;
181 BUG_ON(*ofs & ~PAGE_MASK);
182 return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
183}
184
185static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
186 size_t nr_pages)
187{
188 struct logfs_super *super = logfs_super(sb);
189 struct address_space *mapping = super->s_mapping_inode->i_mapping;
190 struct page *page;
191 int i, err;
192
193 for (i = 0; i < nr_pages; i++) {
194 page = find_lock_page(mapping, index + i);
195 BUG_ON(!page);
196
197 err = mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
198 page_address(page));
199 unlock_page(page);
200 page_cache_release(page);
201 if (err)
202 return err;
203 }
204 return 0;
205}
206
207static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len)
208{
209 struct logfs_super *super = logfs_super(sb);
210 int head;
211
212 if (super->s_flags & LOGFS_SB_FLAG_RO)
213 return;
214
215 if (len == 0) {
216 /* This can happen when an object fits perfectly into a
217 * segment; the segment then gets written out by a sync and
218 * is subsequently closed.
219 */
220 return;
221 }
222 head = ofs & (PAGE_SIZE - 1);
223 if (head) {
224 ofs -= head;
225 len += head;
226 }
227 len = PAGE_ALIGN(len);
228 __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
229}
230
231static void mtd_put_device(struct super_block *sb)
232{
233 put_mtd_device(logfs_super(sb)->s_mtd);
234}
235
236static const struct logfs_device_ops mtd_devops = {
237 .find_first_sb = mtd_find_first_sb,
238 .find_last_sb = mtd_find_last_sb,
239 .readpage = mtd_readpage,
240 .writeseg = mtd_writeseg,
241 .erase = mtd_erase,
242 .sync = mtd_sync,
243 .put_device = mtd_put_device,
244};
245
246int logfs_get_sb_mtd(struct file_system_type *type, int flags,
247 int mtdnr, struct vfsmount *mnt)
248{
249 struct mtd_info *mtd;
250 const struct logfs_device_ops *devops = &mtd_devops;
251
252 mtd = get_mtd_device(NULL, mtdnr);
253 return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt);
254}
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
new file mode 100644
index 000000000000..2396a85c0f55
--- /dev/null
+++ b/fs/logfs/dir.c
@@ -0,0 +1,827 @@
1/*
2 * fs/logfs/dir.c - directory-related code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/slab.h>
10
11/*
12 * Atomic dir operations
13 *
14 * Directory operations are by default not atomic. Dentries and Inodes are
15 * created/removed/altered in separate operations. Therefore we need to do
16 * a small amount of journaling.
17 *
18 * Create, link, mkdir, mknod and symlink all share the same function to do
19 * the work: __logfs_create. This function works in two atomic steps:
20 * 1. allocate inode (remember in journal)
21 * 2. allocate dentry (clear journal)
22 *
23 * As we can only get interrupted between the two steps, the inode we just
24 * created is simply stored in the anchor. On next mount, if we were
25 * interrupted, we delete the inode. From a user's point of view the
26 * operation never happened.
27 *
28 * Unlink and rmdir also share the same function: unlink. Again, this
29 * function works in two atomic steps
30 * 1. remove dentry (remember inode in journal)
31 * 2. unlink inode (clear journal)
32 *
33 * And again, on the next mount, if we were interrupted, we delete the inode.
34 * From a user's point of view the operation succeeded.
35 *
36 * Rename is the real pain to deal with, harder than all the other methods
37 * combined. Depending on the circumstances we can run into three cases.
38 * A "target rename" where the target dentry already existed, a "local
39 * rename" where both parent directories are identical or a "cross-directory
40 * rename" in the remaining case.
41 *
42 * Local rename is atomic, as the old dentry is simply rewritten with a new
43 * name.
44 *
45 * Cross-directory rename works in two steps, similar to __logfs_create and
46 * logfs_unlink:
47 * 1. Write new dentry (remember old dentry in journal)
48 * 2. Remove old dentry (clear journal)
49 *
50 * Here we remember a dentry instead of an inode. On next mount, if we were
51 * interrupted, we delete the dentry. From a user's point of view, the
52 * operation succeeded.
53 *
54 * Target rename works in three atomic steps:
55 * 1. Attach old inode to new dentry (remember old dentry and new inode)
56 * 2. Remove old dentry (still remember the new inode)
57 * 3. Remove victim inode
58 *
59 * Here we remember both an inode and a dentry. If we get interrupted
60 * between steps 1 and 2, we delete both the dentry and the inode. If
61 * we get interrupted between steps 2 and 3, we delete just the inode.
62 * In either case, the remaining objects are deleted on next mount. From
63 * a user's point of view, the operation succeeded.
64 */
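
The recovery pass implied by the comment above is implemented by
logfs_replay_journal() at the bottom of this file; condensed, with error
handling omitted, it amounts to:

/* Condensed from logfs_replay_journal() below (sketch only): */
if (super->s_victim_ino) {
	/* interrupted create/unlink/target rename: drop the orphaned inode */
	inode = logfs_iget(sb, super->s_victim_ino);
	super->s_victim_ino = 0;
	logfs_remove_inode(inode);
	iput(inode);
}
if (super->s_rename_dir) {
	/* interrupted rename: drop the stale dentry */
	inode = logfs_iget(sb, super->s_rename_dir);
	logfs_delete_dd(inode, super->s_rename_pos);
	iput(inode);
}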
65
66static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd,
67 loff_t pos)
68{
69 return logfs_inode_write(dir, dd, sizeof(*dd), pos, WF_LOCK, NULL);
70}
71
72static int write_inode(struct inode *inode)
73{
74 return __logfs_write_inode(inode, WF_LOCK);
75}
76
77static s64 dir_seek_data(struct inode *inode, s64 pos)
78{
79 s64 new_pos = logfs_seek_data(inode, pos);
80
81 return max(pos, new_pos - 1);
82}
83
84static int beyond_eof(struct inode *inode, loff_t bix)
85{
86 loff_t pos = bix << inode->i_sb->s_blocksize_bits;
87 return pos >= i_size_read(inode);
88}
89
90/*
91 * Prime value was chosen to be roughly 256 + 26. r5 hash uses 11,
92 * so short names (len <= 9) don't even occupy the complete 32bit name
93 * space. A prime >256 ensures short names quickly spread the 32bit
94 * name space. Add about 26 for the estimated amount of information
95 * of each character and pick a prime nearby, preferably a bit-sparse
96 * one.
97 */
98static u32 hash_32(const char *s, int len, u32 seed)
99{
100 u32 hash = seed;
101 int i;
102
103 for (i = 0; i < len; i++)
104 hash = hash * 293 + s[i];
105 return hash;
106}
107
108/*
109 * We have to satisfy several conflicting requirements here. Small
110 * directories should stay fairly compact and not require too many
111 * indirect blocks. The number of possible locations for a given hash
112 * should be small to make lookup() fast. And we should try hard not
113 * to overflow the 32bit name space or nfs and 32bit host systems will
114 * be unhappy.
115 *
116 * So we use the following scheme. First we reduce the hash to 0..15
117 * and try a direct block. If that is occupied we reduce the hash to
118 * 16..255 and try an indirect block. Same for 2x and 3x indirect
119 * blocks. Lastly we reduce the hash to 0x800_0000 .. 0xffff_ffff,
120 * but use buckets containing 16 entries instead of a single one.
121 *
122 * Using 16 entries should allow for a reasonable amount of hash
123 * collisions, so the 32bit name space can be packed fairly tight
124 * before overflowing. Oh, and currently we don't overflow but return
125 * an error.
126 *
127 * How likely are collisions? Doing the appropriate math is beyond me
128 * and the Bronstein textbook. But running a test program to brute
129 * force collisions for a couple of days showed that on average the
130 * first collision occurs after 598M entries, with 290M being the
131 * smallest result. Obviously 21 entries could already cause a
132 * collision if all entries are carefully chosen.
133 */
134static pgoff_t hash_index(u32 hash, int round)
135{
136 u32 i0_blocks = I0_BLOCKS;
137 u32 i1_blocks = I1_BLOCKS;
138 u32 i2_blocks = I2_BLOCKS;
139 u32 i3_blocks = I3_BLOCKS;
140
141 switch (round) {
142 case 0:
143 return hash % i0_blocks;
144 case 1:
145 return i0_blocks + hash % (i1_blocks - i0_blocks);
146 case 2:
147 return i1_blocks + hash % (i2_blocks - i1_blocks);
148 case 3:
149 return i2_blocks + hash % (i3_blocks - i2_blocks);
150 case 4 ... 19:
151 return i3_blocks + 16 * (hash % (((1<<31) - i3_blocks) / 16))
152 + round - 4;
153 }
154 BUG();
155}
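
Putting hash_32() and hash_index() together: the set of blocks that can
possibly hold a given name is fixed and small, which is what keeps lookup
fast. An illustrative enumeration of the probe sequence (not part of the
patch itself):

static void enumerate_candidates(const char *name, int len, pgoff_t cand[20])
{
	u32 hash = hash_32(name, len, 0);
	int round;

	for (round = 0; round < 20; round++)
		cand[round] = hash_index(hash, round);
	/*
	 * cand[0]     - one direct block
	 * cand[1..3]  - one block per indirection level
	 * cand[4..19] - 16 consecutive slots of one bucket in the
	 *               0x8000000..0xffffffff region
	 */
}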
156
157static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry)
158{
159 struct qstr *name = &dentry->d_name;
160 struct page *page;
161 struct logfs_disk_dentry *dd;
162 u32 hash = hash_32(name->name, name->len, 0);
163 pgoff_t index;
164 int round;
165
166 if (name->len > LOGFS_MAX_NAMELEN)
167 return ERR_PTR(-ENAMETOOLONG);
168
169 for (round = 0; round < 20; round++) {
170 index = hash_index(hash, round);
171
172 if (beyond_eof(dir, index))
173 return NULL;
174 if (!logfs_exist_block(dir, index))
175 continue;
176 page = read_cache_page(dir->i_mapping, index,
177 (filler_t *)logfs_readpage, NULL);
178 if (IS_ERR(page))
179 return page;
180 dd = kmap_atomic(page, KM_USER0);
181 BUG_ON(dd->namelen == 0);
182
183 if (name->len != be16_to_cpu(dd->namelen) ||
184 memcmp(name->name, dd->name, name->len)) {
185 kunmap_atomic(dd, KM_USER0);
186 page_cache_release(page);
187 continue;
188 }
189
190 kunmap_atomic(dd, KM_USER0);
191 return page;
192 }
193 return NULL;
194}
195
196static int logfs_remove_inode(struct inode *inode)
197{
198 int ret;
199
200 inode->i_nlink--;
201 ret = write_inode(inode);
202 LOGFS_BUG_ON(ret, inode->i_sb);
203 return ret;
204}
205
206static void abort_transaction(struct inode *inode, struct logfs_transaction *ta)
207{
208 if (logfs_inode(inode)->li_block)
209 logfs_inode(inode)->li_block->ta = NULL;
210 kfree(ta);
211}
212
213static int logfs_unlink(struct inode *dir, struct dentry *dentry)
214{
215 struct logfs_super *super = logfs_super(dir->i_sb);
216 struct inode *inode = dentry->d_inode;
217 struct logfs_transaction *ta;
218 struct page *page;
219 pgoff_t index;
220 int ret;
221
222 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
223 if (!ta)
224 return -ENOMEM;
225
226 ta->state = UNLINK_1;
227 ta->ino = inode->i_ino;
228
229 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
230
231 page = logfs_get_dd_page(dir, dentry);
232 if (!page) {
233 kfree(ta);
234 return -ENOENT;
235 }
236 if (IS_ERR(page)) {
237 kfree(ta);
238 return PTR_ERR(page);
239 }
240 index = page->index;
241 page_cache_release(page);
242
243 mutex_lock(&super->s_dirop_mutex);
244 logfs_add_transaction(dir, ta);
245
246 ret = logfs_delete(dir, index, NULL);
247 if (!ret)
248 ret = write_inode(dir);
249
250 if (ret) {
251 abort_transaction(dir, ta);
252 printk(KERN_ERR"LOGFS: unable to delete inode\n");
253 goto out;
254 }
255
256 ta->state = UNLINK_2;
257 logfs_add_transaction(inode, ta);
258 ret = logfs_remove_inode(inode);
259out:
260 mutex_unlock(&super->s_dirop_mutex);
261 return ret;
262}
263
264static inline int logfs_empty_dir(struct inode *dir)
265{
266 u64 data;
267
268 data = logfs_seek_data(dir, 0) << dir->i_sb->s_blocksize_bits;
269 return data >= i_size_read(dir);
270}
271
272static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
273{
274 struct inode *inode = dentry->d_inode;
275
276 if (!logfs_empty_dir(inode))
277 return -ENOTEMPTY;
278
279 return logfs_unlink(dir, dentry);
280}
281
282/* FIXME: readdir currently has its own dir_walk code. I don't see a good
283 * way to combine the two copies */
284#define IMPLICIT_NODES 2
285static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
286{
287 struct inode *dir = file->f_dentry->d_inode;
288 loff_t pos = file->f_pos - IMPLICIT_NODES;
289 struct page *page;
290 struct logfs_disk_dentry *dd;
291 int full;
292
293 BUG_ON(pos < 0);
294 for (;; pos++) {
295 if (beyond_eof(dir, pos))
296 break;
297 if (!logfs_exist_block(dir, pos)) {
298 /* deleted dentry */
299 pos = dir_seek_data(dir, pos);
300 continue;
301 }
302 page = read_cache_page(dir->i_mapping, pos,
303 (filler_t *)logfs_readpage, NULL);
304 if (IS_ERR(page))
305 return PTR_ERR(page);
306 dd = kmap(page);
307 BUG_ON(dd->namelen == 0);
308
309 full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen),
310 pos, be64_to_cpu(dd->ino), dd->type);
311 kunmap(page);
312 page_cache_release(page);
313 if (full)
314 break;
315 }
316
317 file->f_pos = pos + IMPLICIT_NODES;
318 return 0;
319}
320
321static int logfs_readdir(struct file *file, void *buf, filldir_t filldir)
322{
323 struct inode *inode = file->f_dentry->d_inode;
324 ino_t pino = parent_ino(file->f_dentry);
325 int err;
326
327 if (file->f_pos < 0)
328 return -EINVAL;
329
330 if (file->f_pos == 0) {
331 if (filldir(buf, ".", 1, 1, inode->i_ino, DT_DIR) < 0)
332 return 0;
333 file->f_pos++;
334 }
335 if (file->f_pos == 1) {
336 if (filldir(buf, "..", 2, 2, pino, DT_DIR) < 0)
337 return 0;
338 file->f_pos++;
339 }
340
341 err = __logfs_readdir(file, buf, filldir);
342 return err;
343}
344
345static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name)
346{
347 dd->namelen = cpu_to_be16(name->len);
348 memcpy(dd->name, name->name, name->len);
349}
350
351static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry,
352 struct nameidata *nd)
353{
354 struct page *page;
355 struct logfs_disk_dentry *dd;
356 pgoff_t index;
357 u64 ino = 0;
358 struct inode *inode;
359
360 page = logfs_get_dd_page(dir, dentry);
361 if (IS_ERR(page))
362 return ERR_CAST(page);
363 if (!page) {
364 d_add(dentry, NULL);
365 return NULL;
366 }
367 index = page->index;
368 dd = kmap_atomic(page, KM_USER0);
369 ino = be64_to_cpu(dd->ino);
370 kunmap_atomic(dd, KM_USER0);
371 page_cache_release(page);
372
373 inode = logfs_iget(dir->i_sb, ino);
374 if (IS_ERR(inode)) {
375 printk(KERN_ERR"LogFS: Cannot read inode #%llx for dentry (%lx, %lx)n",
376 ino, dir->i_ino, index);
377 return ERR_CAST(inode);
378 }
379 return d_splice_alias(inode, dentry);
380}
381
382static void grow_dir(struct inode *dir, loff_t index)
383{
384 index = (index + 1) << dir->i_sb->s_blocksize_bits;
385 if (i_size_read(dir) < index)
386 i_size_write(dir, index);
387}
388
389static int logfs_write_dir(struct inode *dir, struct dentry *dentry,
390 struct inode *inode)
391{
392 struct page *page;
393 struct logfs_disk_dentry *dd;
394 u32 hash = hash_32(dentry->d_name.name, dentry->d_name.len, 0);
395 pgoff_t index;
396 int round, err;
397
398 for (round = 0; round < 20; round++) {
399 index = hash_index(hash, round);
400
401 if (logfs_exist_block(dir, index))
402 continue;
403 page = find_or_create_page(dir->i_mapping, index, GFP_KERNEL);
404 if (!page)
405 return -ENOMEM;
406
407 dd = kmap_atomic(page, KM_USER0);
408 memset(dd, 0, sizeof(*dd));
409 dd->ino = cpu_to_be64(inode->i_ino);
410 dd->type = logfs_type(inode);
411 logfs_set_name(dd, &dentry->d_name);
412 kunmap_atomic(dd, KM_USER0);
413
414 err = logfs_write_buf(dir, page, WF_LOCK);
415 unlock_page(page);
416 page_cache_release(page);
417 if (!err)
418 grow_dir(dir, index);
419 return err;
420 }
421 /* FIXME: Is there a better return value? In most cases neither
422 * the filesystem nor the directory are full. But we have had
423 * too many collisions for this particular hash and no fallback.
424 */
425 return -ENOSPC;
426}
427
428static int __logfs_create(struct inode *dir, struct dentry *dentry,
429 struct inode *inode, const char *dest, long destlen)
430{
431 struct logfs_super *super = logfs_super(dir->i_sb);
432 struct logfs_inode *li = logfs_inode(inode);
433 struct logfs_transaction *ta;
434 int ret;
435
436 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
437 if (!ta)
438 return -ENOMEM;
439
440 ta->state = CREATE_1;
441 ta->ino = inode->i_ino;
442 mutex_lock(&super->s_dirop_mutex);
443 logfs_add_transaction(inode, ta);
444
445 if (dest) {
446 /* symlink */
447 ret = logfs_inode_write(inode, dest, destlen, 0, WF_LOCK, NULL);
448 if (!ret)
449 ret = write_inode(inode);
450 } else {
451 /* creat/mkdir/mknod */
452 ret = write_inode(inode);
453 }
454 if (ret) {
455 abort_transaction(inode, ta);
456 li->li_flags |= LOGFS_IF_STILLBORN;
457 /* FIXME: truncate symlink */
458 inode->i_nlink--;
459 iput(inode);
460 goto out;
461 }
462
463 ta->state = CREATE_2;
464 logfs_add_transaction(dir, ta);
465 ret = logfs_write_dir(dir, dentry, inode);
466 /* sync directory */
467 if (!ret)
468 ret = write_inode(dir);
469
470 if (ret) {
471 logfs_del_transaction(dir, ta);
472 ta->state = CREATE_2;
473 logfs_add_transaction(inode, ta);
474 logfs_remove_inode(inode);
475 iput(inode);
476 goto out;
477 }
478 d_instantiate(dentry, inode);
479out:
480 mutex_unlock(&super->s_dirop_mutex);
481 return ret;
482}
483
484static int logfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
485{
486 struct inode *inode;
487
488 /*
489 * FIXME: why do we have to fill in S_IFDIR, while the mode is
490 * correct for mknod, creat, etc.? Smells like the vfs *should*
491 * do it for us but for some reason fails to do so.
492 */
493 inode = logfs_new_inode(dir, S_IFDIR | mode);
494 if (IS_ERR(inode))
495 return PTR_ERR(inode);
496
497 inode->i_op = &logfs_dir_iops;
498 inode->i_fop = &logfs_dir_fops;
499
500 return __logfs_create(dir, dentry, inode, NULL, 0);
501}
502
503static int logfs_create(struct inode *dir, struct dentry *dentry, int mode,
504 struct nameidata *nd)
505{
506 struct inode *inode;
507
508 inode = logfs_new_inode(dir, mode);
509 if (IS_ERR(inode))
510 return PTR_ERR(inode);
511
512 inode->i_op = &logfs_reg_iops;
513 inode->i_fop = &logfs_reg_fops;
514 inode->i_mapping->a_ops = &logfs_reg_aops;
515
516 return __logfs_create(dir, dentry, inode, NULL, 0);
517}
518
519static int logfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
520 dev_t rdev)
521{
522 struct inode *inode;
523
524 if (dentry->d_name.len > LOGFS_MAX_NAMELEN)
525 return -ENAMETOOLONG;
526
527 inode = logfs_new_inode(dir, mode);
528 if (IS_ERR(inode))
529 return PTR_ERR(inode);
530
531 init_special_inode(inode, mode, rdev);
532
533 return __logfs_create(dir, dentry, inode, NULL, 0);
534}
535
536static int logfs_symlink(struct inode *dir, struct dentry *dentry,
537 const char *target)
538{
539 struct inode *inode;
540 size_t destlen = strlen(target) + 1;
541
542 if (destlen > dir->i_sb->s_blocksize)
543 return -ENAMETOOLONG;
544
545 inode = logfs_new_inode(dir, S_IFLNK | 0777);
546 if (IS_ERR(inode))
547 return PTR_ERR(inode);
548
549 inode->i_op = &logfs_symlink_iops;
550 inode->i_mapping->a_ops = &logfs_reg_aops;
551
552 return __logfs_create(dir, dentry, inode, target, destlen);
553}
554
555static int logfs_permission(struct inode *inode, int mask)
556{
557 return generic_permission(inode, mask, NULL);
558}
559
560static int logfs_link(struct dentry *old_dentry, struct inode *dir,
561 struct dentry *dentry)
562{
563 struct inode *inode = old_dentry->d_inode;
564
565 if (inode->i_nlink >= LOGFS_LINK_MAX)
566 return -EMLINK;
567
568 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
569 atomic_inc(&inode->i_count);
570 inode->i_nlink++;
571 mark_inode_dirty_sync(inode);
572
573 return __logfs_create(dir, dentry, inode, NULL, 0);
574}
575
576static int logfs_get_dd(struct inode *dir, struct dentry *dentry,
577 struct logfs_disk_dentry *dd, loff_t *pos)
578{
579 struct page *page;
580 void *map;
581
582 page = logfs_get_dd_page(dir, dentry);
583 if (IS_ERR(page))
584 return PTR_ERR(page);
585 *pos = page->index;
586 map = kmap_atomic(page, KM_USER0);
587 memcpy(dd, map, sizeof(*dd));
588 kunmap_atomic(map, KM_USER0);
589 page_cache_release(page);
590 return 0;
591}
592
593static int logfs_delete_dd(struct inode *dir, loff_t pos)
594{
595 /*
596 * Getting called with pos somewhere beyond eof is either a goofup
597 * within this file or means someone maliciously edited the
598 * (crc-protected) journal.
599 */
600 BUG_ON(beyond_eof(dir, pos));
601 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
602 log_dir(" Delete dentry (%lx, %llx)\n", dir->i_ino, pos);
603 return logfs_delete(dir, pos, NULL);
604}
605
606/*
607 * Cross-directory rename, target does not exist. Just a little nasty.
608 * Create a new dentry in the target dir, then remove the old dentry,
609 * all the while taking care to remember our operation in the journal.
610 */
611static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry,
612 struct inode *new_dir, struct dentry *new_dentry)
613{
614 struct logfs_super *super = logfs_super(old_dir->i_sb);
615 struct logfs_disk_dentry dd;
616 struct logfs_transaction *ta;
617 loff_t pos;
618 int err;
619
620 /* 1. locate source dd */
621 err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
622 if (err)
623 return err;
624
625 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
626 if (!ta)
627 return -ENOMEM;
628
629 ta->state = CROSS_RENAME_1;
630 ta->dir = old_dir->i_ino;
631 ta->pos = pos;
632
633 /* 2. write target dd */
634 mutex_lock(&super->s_dirop_mutex);
635 logfs_add_transaction(new_dir, ta);
636 err = logfs_write_dir(new_dir, new_dentry, old_dentry->d_inode);
637 if (!err)
638 err = write_inode(new_dir);
639
640 if (err) {
641 super->s_rename_dir = 0;
642 super->s_rename_pos = 0;
643 abort_transaction(new_dir, ta);
644 goto out;
645 }
646
647 /* 3. remove source dd */
648 ta->state = CROSS_RENAME_2;
649 logfs_add_transaction(old_dir, ta);
650 err = logfs_delete_dd(old_dir, pos);
651 if (!err)
652 err = write_inode(old_dir);
653 LOGFS_BUG_ON(err, old_dir->i_sb);
654out:
655 mutex_unlock(&super->s_dirop_mutex);
656 return err;
657}
658
659static int logfs_replace_inode(struct inode *dir, struct dentry *dentry,
660 struct logfs_disk_dentry *dd, struct inode *inode)
661{
662 loff_t pos;
663 int err;
664
665 err = logfs_get_dd(dir, dentry, dd, &pos);
666 if (err)
667 return err;
668 dd->ino = cpu_to_be64(inode->i_ino);
669 dd->type = logfs_type(inode);
670
671 err = write_dir(dir, dd, pos);
672 if (err)
673 return err;
674 log_dir("Replace dentry (%lx, %llx) %s -> %llx\n", dir->i_ino, pos,
675 dd->name, be64_to_cpu(dd->ino));
676 return write_inode(dir);
677}
678
679/* Target dentry exists - the worst case. We need to attach the source
680 * inode to the target dentry, then remove the orphaned target inode and
681 * source dentry.
682 */
683static int logfs_rename_target(struct inode *old_dir, struct dentry *old_dentry,
684 struct inode *new_dir, struct dentry *new_dentry)
685{
686 struct logfs_super *super = logfs_super(old_dir->i_sb);
687 struct inode *old_inode = old_dentry->d_inode;
688 struct inode *new_inode = new_dentry->d_inode;
689 int isdir = S_ISDIR(old_inode->i_mode);
690 struct logfs_disk_dentry dd;
691 struct logfs_transaction *ta;
692 loff_t pos;
693 int err;
694
695 BUG_ON(isdir != S_ISDIR(new_inode->i_mode));
696 if (isdir) {
697 if (!logfs_empty_dir(new_inode))
698 return -ENOTEMPTY;
699 }
700
701 /* 1. locate source dd */
702 err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
703 if (err)
704 return err;
705
706 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
707 if (!ta)
708 return -ENOMEM;
709
710 ta->state = TARGET_RENAME_1;
711 ta->dir = old_dir->i_ino;
712 ta->pos = pos;
713 ta->ino = new_inode->i_ino;
714
715 /* 2. attach source inode to target dd */
716 mutex_lock(&super->s_dirop_mutex);
717 logfs_add_transaction(new_dir, ta);
718 err = logfs_replace_inode(new_dir, new_dentry, &dd, old_inode);
719 if (err) {
720 super->s_rename_dir = 0;
721 super->s_rename_pos = 0;
722 super->s_victim_ino = 0;
723 abort_transaction(new_dir, ta);
724 goto out;
725 }
726
727 /* 3. remove source dd */
728 ta->state = TARGET_RENAME_2;
729 logfs_add_transaction(old_dir, ta);
730 err = logfs_delete_dd(old_dir, pos);
731 if (!err)
732 err = write_inode(old_dir);
733 LOGFS_BUG_ON(err, old_dir->i_sb);
734
735 /* 4. remove target inode */
736 ta->state = TARGET_RENAME_3;
737 logfs_add_transaction(new_inode, ta);
738 err = logfs_remove_inode(new_inode);
739
740out:
741 mutex_unlock(&super->s_dirop_mutex);
742 return err;
743}
744
745static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry,
746 struct inode *new_dir, struct dentry *new_dentry)
747{
748 if (new_dentry->d_inode)
749 return logfs_rename_target(old_dir, old_dentry,
750 new_dir, new_dentry);
751 return logfs_rename_cross(old_dir, old_dentry, new_dir, new_dentry);
752}
753
754/* No locking done here, as this is called before .get_sb() returns. */
755int logfs_replay_journal(struct super_block *sb)
756{
757 struct logfs_super *super = logfs_super(sb);
758 struct inode *inode;
759 u64 ino, pos;
760 int err;
761
762 if (super->s_victim_ino) {
763 /* delete victim inode */
764 ino = super->s_victim_ino;
765 printk(KERN_INFO"LogFS: delete unmapped inode #%llx\n", ino);
766 inode = logfs_iget(sb, ino);
767 if (IS_ERR(inode))
768 goto fail;
769
770 LOGFS_BUG_ON(i_size_read(inode) > 0, sb);
771 super->s_victim_ino = 0;
772 err = logfs_remove_inode(inode);
773 iput(inode);
774 if (err) {
775 super->s_victim_ino = ino;
776 goto fail;
777 }
778 }
779 if (super->s_rename_dir) {
780 /* delete old dd from rename */
781 ino = super->s_rename_dir;
782 pos = super->s_rename_pos;
783 printk(KERN_INFO"LogFS: delete unbacked dentry (%llx, %llx)\n",
784 ino, pos);
785 inode = logfs_iget(sb, ino);
786 if (IS_ERR(inode))
787 goto fail;
788
789 super->s_rename_dir = 0;
790 super->s_rename_pos = 0;
791 err = logfs_delete_dd(inode, pos);
792 iput(inode);
793 if (err) {
794 super->s_rename_dir = ino;
795 super->s_rename_pos = pos;
796 goto fail;
797 }
798 }
799 return 0;
800fail:
801 LOGFS_BUG(sb);
802 return -EIO;
803}
804
805const struct inode_operations logfs_symlink_iops = {
806 .readlink = generic_readlink,
807 .follow_link = page_follow_link_light,
808};
809
810const struct inode_operations logfs_dir_iops = {
811 .create = logfs_create,
812 .link = logfs_link,
813 .lookup = logfs_lookup,
814 .mkdir = logfs_mkdir,
815 .mknod = logfs_mknod,
816 .rename = logfs_rename,
817 .rmdir = logfs_rmdir,
818 .permission = logfs_permission,
819 .symlink = logfs_symlink,
820 .unlink = logfs_unlink,
821};
822const struct file_operations logfs_dir_fops = {
823 .fsync = logfs_fsync,
824 .ioctl = logfs_ioctl,
825 .readdir = logfs_readdir,
826 .read = generic_read_dir,
827};
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
new file mode 100644
index 000000000000..370f367a933e
--- /dev/null
+++ b/fs/logfs/file.c
@@ -0,0 +1,263 @@
1/*
2 * fs/logfs/file.c - prepare_write, commit_write and friends
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/sched.h>
10#include <linux/writeback.h>
11
12static int logfs_write_begin(struct file *file, struct address_space *mapping,
13 loff_t pos, unsigned len, unsigned flags,
14 struct page **pagep, void **fsdata)
15{
16 struct inode *inode = mapping->host;
17 struct page *page;
18 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
19
20 page = grab_cache_page_write_begin(mapping, index, flags);
21 if (!page)
22 return -ENOMEM;
23 *pagep = page;
24
25 if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
26 return 0;
27 if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
28 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
29 unsigned end = start + len;
30
31 /* Reading beyond i_size is simple: memset to zero */
32 zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
33 return 0;
34 }
35 return logfs_readpage_nolock(page);
36}
37
38static int logfs_write_end(struct file *file, struct address_space *mapping,
39 loff_t pos, unsigned len, unsigned copied, struct page *page,
40 void *fsdata)
41{
42 struct inode *inode = mapping->host;
43 pgoff_t index = page->index;
44 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
45 unsigned end = start + copied;
46 int ret = 0;
47
48 BUG_ON(PAGE_CACHE_SIZE != inode->i_sb->s_blocksize);
49 BUG_ON(page->index > I3_BLOCKS);
50
51 if (copied < len) {
52 /*
53 * Short write of a non-initialized page. Just tell userspace
54 * to retry the entire page.
55 */
56 if (!PageUptodate(page)) {
57 copied = 0;
58 goto out;
59 }
60 }
61 if (copied == 0)
62 goto out; /* FIXME: do we need to update inode? */
63
64 if (i_size_read(inode) < (index << PAGE_CACHE_SHIFT) + end) {
65 i_size_write(inode, (index << PAGE_CACHE_SHIFT) + end);
66 mark_inode_dirty_sync(inode);
67 }
68
69 SetPageUptodate(page);
70 if (!PageDirty(page)) {
71 if (!get_page_reserve(inode, page))
72 __set_page_dirty_nobuffers(page);
73 else
74 ret = logfs_write_buf(inode, page, WF_LOCK);
75 }
76out:
77 unlock_page(page);
78 page_cache_release(page);
79 return ret ? ret : copied;
80}
81
82int logfs_readpage(struct file *file, struct page *page)
83{
84 int ret;
85
86 ret = logfs_readpage_nolock(page);
87 unlock_page(page);
88 return ret;
89}
90
91/* Clear the page's dirty flag in the radix tree. */
92/* TODO: mucking with PageWriteback is silly. Add a generic function to clear
93 * the dirty bit from the radix tree for filesystems that don't have to wait
94 * for page writeback to finish (i.e. any compressing filesystem).
95 */
96static void clear_radix_tree_dirty(struct page *page)
97{
98 BUG_ON(PagePrivate(page) || page->private);
99 set_page_writeback(page);
100 end_page_writeback(page);
101}
102
103static int __logfs_writepage(struct page *page)
104{
105 struct inode *inode = page->mapping->host;
106 int err;
107
108 err = logfs_write_buf(inode, page, WF_LOCK);
109 if (err)
110 set_page_dirty(page);
111 else
112 clear_radix_tree_dirty(page);
113 unlock_page(page);
114 return err;
115}
116
117static int logfs_writepage(struct page *page, struct writeback_control *wbc)
118{
119 struct inode *inode = page->mapping->host;
120 loff_t i_size = i_size_read(inode);
121 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
122 unsigned offset;
123 u64 bix;
124 level_t level;
125
126 log_file("logfs_writepage(%lx, %lx, %p)\n", inode->i_ino, page->index,
127 page);
128
129 logfs_unpack_index(page->index, &bix, &level);
130
131 /* Indirect blocks are never truncated */
132 if (level != 0)
133 return __logfs_writepage(page);
134
135 /*
136 * TODO: everything below is a near-verbatim copy of nobh_writepage().
137 * The relevant bits should be factored out after logfs is merged.
138 */
139
140 /* Is the page fully inside i_size? */
141 if (bix < end_index)
142 return __logfs_writepage(page);
143
144 /* Is the page fully outside i_size? (truncate in progress) */
145 offset = i_size & (PAGE_CACHE_SIZE-1);
146 if (bix > end_index || offset == 0) {
147 unlock_page(page);
148 return 0; /* don't care */
149 }
150
151 /*
152 * The page straddles i_size. It must be zeroed out on each and every
153 * writepage invocation because it may be mmapped. "A file is mapped
154 * in multiples of the page size. For a file that is not a multiple of
155 * the page size, the remaining memory is zeroed when mapped, and
156 * writes to that region are not written out to the file."
157 */
158 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
159 return __logfs_writepage(page);
160}
161
162static void logfs_invalidatepage(struct page *page, unsigned long offset)
163{
164 move_page_to_btree(page);
165 BUG_ON(PagePrivate(page) || page->private);
166}
167
168static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this)
169{
170 return 0; /* None of these are easy to release */
171}
172
173
174int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
175 unsigned long arg)
176{
177 struct logfs_inode *li = logfs_inode(inode);
178 unsigned int oldflags, flags;
179 int err;
180
181 switch (cmd) {
182 case FS_IOC_GETFLAGS:
183 flags = li->li_flags & LOGFS_FL_USER_VISIBLE;
184 return put_user(flags, (int __user *)arg);
185 case FS_IOC_SETFLAGS:
186 if (IS_RDONLY(inode))
187 return -EROFS;
188
189 if (!is_owner_or_cap(inode))
190 return -EACCES;
191
192 err = get_user(flags, (int __user *)arg);
193 if (err)
194 return err;
195
196 mutex_lock(&inode->i_mutex);
197 oldflags = li->li_flags;
198 flags &= LOGFS_FL_USER_MODIFIABLE;
199 flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE;
200 li->li_flags = flags;
201 mutex_unlock(&inode->i_mutex);
202
203 inode->i_ctime = CURRENT_TIME;
204 mark_inode_dirty_sync(inode);
205 return 0;
206
207 default:
208 return -ENOTTY;
209 }
210}
211
212int logfs_fsync(struct file *file, struct dentry *dentry, int datasync)
213{
214 struct super_block *sb = dentry->d_inode->i_sb;
215 struct logfs_super *super = logfs_super(sb);
216
217 /* FIXME: write anchor */
218 super->s_devops->sync(sb);
219 return 0;
220}
221
222static int logfs_setattr(struct dentry *dentry, struct iattr *attr)
223{
224 struct inode *inode = dentry->d_inode;
225 int err = 0;
226
227 if (attr->ia_valid & ATTR_SIZE)
228 err = logfs_truncate(inode, attr->ia_size);
229 attr->ia_valid &= ~ATTR_SIZE;
230
231 if (!err)
232 err = inode_change_ok(inode, attr);
233 if (!err)
234 err = inode_setattr(inode, attr);
235 return err;
236}
237
238const struct inode_operations logfs_reg_iops = {
239 .setattr = logfs_setattr,
240};
241
242const struct file_operations logfs_reg_fops = {
243 .aio_read = generic_file_aio_read,
244 .aio_write = generic_file_aio_write,
245 .fsync = logfs_fsync,
246 .ioctl = logfs_ioctl,
247 .llseek = generic_file_llseek,
248 .mmap = generic_file_readonly_mmap,
249 .open = generic_file_open,
250 .read = do_sync_read,
251 .write = do_sync_write,
252};
253
254const struct address_space_operations logfs_reg_aops = {
255 .invalidatepage = logfs_invalidatepage,
256 .readpage = logfs_readpage,
257 .releasepage = logfs_releasepage,
258 .set_page_dirty = __set_page_dirty_nobuffers,
259 .writepage = logfs_writepage,
260 .writepages = generic_writepages,
261 .write_begin = logfs_write_begin,
262 .write_end = logfs_write_end,
263};
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
new file mode 100644
index 000000000000..76c242fbe1b0
--- /dev/null
+++ b/fs/logfs/gc.c
@@ -0,0 +1,739 @@
1/*
2 * fs/logfs/gc.c - garbage collection code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/sched.h>
10#include <linux/slab.h>
11
12/*
13 * Wear leveling needs to kick in when the difference between low erase
14 * counts and high erase counts gets too big. A good value for "too big"
15 * may be somewhat below 10% of maximum erase count for the device.
16 * Why not 397, to pick a nice round number with no specific meaning? :)
17 *
18 * WL_RATELIMIT is the minimum time between two wear level events. A huge
19 * number of segments may fulfil the requirements for wear leveling at the
20 * same time. If that happens we don't want to cause a latency from hell,
21 * but just gently pick one segment every so often and minimize overhead.
22 */
23#define WL_DELTA 397
24#define WL_RATELIMIT 100
25#define MAX_OBJ_ALIASES 2600
26#define SCAN_RATIO 512 /* number of scanned segments per gc'd segment */
27#define LIST_SIZE 64 /* base size of candidate lists */
28#define SCAN_ROUNDS 128 /* maximum number of complete medium scans */
29#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */
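
One possible reading of how the two wear-leveling knobs combine (a sketch
only; the counter fields here are assumptions, not part of this patch):
wear leveling triggers once the erase-count spread exceeds WL_DELTA, but at
most once per WL_RATELIMIT GC events.

static int wl_needed(struct logfs_super *super, u32 min_ec, u32 max_ec)
{
	/* s_gc_count and s_last_wl_event are hypothetical counters */
	return (max_ec - min_ec >= WL_DELTA) &&
		(super->s_gc_count - super->s_last_wl_event >= WL_RATELIMIT);
}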
30
31static int no_free_segments(struct super_block *sb)
32{
33 struct logfs_super *super = logfs_super(sb);
34
35 return super->s_free_list.count;
36}
37
38/* journal has distance -1, top-most ifile layer distance 0 */
39static u8 root_distance(struct super_block *sb, gc_level_t __gc_level)
40{
41 struct logfs_super *super = logfs_super(sb);
42 u8 gc_level = (__force u8)__gc_level;
43
44 switch (gc_level) {
45 case 0: /* fall through */
46 case 1: /* fall through */
47 case 2: /* fall through */
48 case 3:
49 /* file data or indirect blocks */
50 return super->s_ifile_levels + super->s_iblock_levels - gc_level;
51 case 6: /* fall through */
52 case 7: /* fall through */
53 case 8: /* fall through */
54 case 9:
55 /* inode file data or indirect blocks */
56 return super->s_ifile_levels - (gc_level - 6);
57 default:
58 printk(KERN_ERR"LOGFS: segment of unknown level %x found\n",
59 gc_level);
60 WARN_ON(1);
61 return super->s_ifile_levels + super->s_iblock_levels;
62 }
63}
64
65static int segment_is_reserved(struct super_block *sb, u32 segno)
66{
67 struct logfs_super *super = logfs_super(sb);
68 struct logfs_area *area;
69 void *reserved;
70 int i;
71
72 /* Some segments are reserved. Just pretend they were all valid */
73 reserved = btree_lookup32(&super->s_reserved_segments, segno);
74 if (reserved)
75 return 1;
76
77 /* Currently open segments */
78 for_each_area(i) {
79 area = super->s_area[i];
80 if (area->a_is_open && area->a_segno == segno)
81 return 1;
82 }
83
84 return 0;
85}
86
87static void logfs_mark_segment_bad(struct super_block *sb, u32 segno)
88{
89 BUG();
90}
91
92/*
93 * Returns the bytes consumed by valid objects in this segment. Object headers
94 * are counted, the segment header is not.
95 */
96static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec,
97 gc_level_t *gc_level)
98{
99 struct logfs_segment_entry se;
100 u32 ec_level;
101
102 logfs_get_segment_entry(sb, segno, &se);
103 if (se.ec_level == cpu_to_be32(BADSEG) ||
104 se.valid == cpu_to_be32(RESERVED))
105 return RESERVED;
106
107 ec_level = be32_to_cpu(se.ec_level);
108 *ec = ec_level >> 4;
109 *gc_level = GC_LEVEL(ec_level & 0xf);
110 return be32_to_cpu(se.valid);
111}
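
The se.ec_level field packs the erase count and the GC level into a single
32-bit value: the low four bits hold the level, the remaining bits the erase
count. Worked example with illustrative numbers:

/*
 * ec_level  = 0x1f3
 * *ec       = 0x1f3 >> 4  = 0x1f  (erase count 31)
 * *gc_level = 0x1f3 & 0xf = 3
 */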
112
113static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
114 u64 bix, gc_level_t gc_level)
115{
116 struct inode *inode;
117 int err, cookie;
118
119 inode = logfs_safe_iget(sb, ino, &cookie);
120 err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0);
121 BUG_ON(err);
122 logfs_safe_iput(inode, cookie);
123}
124
125static u32 logfs_gc_segment(struct super_block *sb, u32 segno, u8 dist)
126{
127 struct logfs_super *super = logfs_super(sb);
128 struct logfs_segment_header sh;
129 struct logfs_object_header oh;
130 u64 ofs, ino, bix;
131 u32 seg_ofs, logical_segno, cleaned = 0;
132 int err, len, valid;
133 gc_level_t gc_level;
134
135 LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb);
136
137 btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS);
138 err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
139 BUG_ON(err);
140 gc_level = GC_LEVEL(sh.level);
141 logical_segno = be32_to_cpu(sh.segno);
142 if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) {
143 logfs_mark_segment_bad(sb, segno);
144 cleaned = -1;
145 goto out;
146 }
147
148 for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE;
149 seg_ofs + sizeof(oh) < super->s_segsize; ) {
150 ofs = dev_ofs(sb, logical_segno, seg_ofs);
151 err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh),
152 &oh);
153 BUG_ON(err);
154
155 if (!memchr_inv(&oh, 0xff, sizeof(oh)))
156 break;
157
158 if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) {
159 logfs_mark_segment_bad(sb, segno);
160 cleaned = super->s_segsize - 1;
161 goto out;
162 }
163
164 ino = be64_to_cpu(oh.ino);
165 bix = be64_to_cpu(oh.bix);
166 len = sizeof(oh) + be16_to_cpu(oh.len);
167 valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level);
168 if (valid == 1) {
169 logfs_cleanse_block(sb, ofs, ino, bix, gc_level);
170 cleaned += len;
171 } else if (valid == 2) {
172 /* Will be invalid upon journal commit */
173 cleaned += len;
174 }
175 seg_ofs += len;
176 }
177out:
178 btree_remove32(&super->s_reserved_segments, segno);
179 return cleaned;
180}
181
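/*
 * Insert a candidate into an rb-tree list kept sorted by erase count
 * (wear-leveling lists) or by valid bytes (GC lists). Returns NULL if
 * the list had room; otherwise the worst (last) candidate is unlinked
 * and handed back so the caller can offer it to another list.
 */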
182static struct gc_candidate *add_list(struct gc_candidate *cand,
183 struct candidate_list *list)
184{
185 struct rb_node **p = &list->rb_tree.rb_node;
186 struct rb_node *parent = NULL;
187 struct gc_candidate *cur;
188 int comp;
189
190 cand->list = list;
191 while (*p) {
192 parent = *p;
193 cur = rb_entry(parent, struct gc_candidate, rb_node);
194
195 if (list->sort_by_ec)
196 comp = cand->erase_count < cur->erase_count;
197 else
198 comp = cand->valid < cur->valid;
199
200 if (comp)
201 p = &parent->rb_left;
202 else
203 p = &parent->rb_right;
204 }
205 rb_link_node(&cand->rb_node, parent, p);
206 rb_insert_color(&cand->rb_node, &list->rb_tree);
207
208 if (list->count <= list->maxcount) {
209 list->count++;
210 return NULL;
211 }
212 cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node);
213 rb_erase(&cand->rb_node, &list->rb_tree);
214 cand->list = NULL;
215 return cand;
216}
217
218static void remove_from_list(struct gc_candidate *cand)
219{
220 struct candidate_list *list = cand->list;
221
222 rb_erase(&cand->rb_node, &list->rb_tree);
223 list->count--;
224}
225
226static void free_candidate(struct super_block *sb, struct gc_candidate *cand)
227{
228 struct logfs_super *super = logfs_super(sb);
229
230 btree_remove32(&super->s_cand_tree, cand->segno);
231 kfree(cand);
232}
233
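/*
 * Pop the best (first) candidate from the list and return its segment
 * number, optionally reporting its erase count through @ec. The
 * candidate structure itself is freed.
 */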
234u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec)
235{
236 struct gc_candidate *cand;
237 u32 segno;
238
239 BUG_ON(list->count == 0);
240
241 cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
242 remove_from_list(cand);
243 segno = cand->segno;
244 if (ec)
245 *ec = cand->erase_count;
246 free_candidate(sb, cand);
247 return segno;
248}
249
250/*
251 * We have several lists to manage segments with. The reserve_list is used to
252 * deal with bad blocks. We try to keep the best (lowest ec) segments on this
253 * list.
254 * The free_list contains free segments for normal usage. It usually gets the
255 * second pick after the reserve_list. But when the free_list is running short
256 * it is more important to keep the free_list full than to keep a reserve.
257 *
258 * Segments that are not free are put onto a per-level low_list. If we have
259 * to run garbage collection, we pick a candidate from there. All segments on
260 * those lists should have at least some free space so GC will make progress.
261 *
262 * And last we have the ec_list, which is used to pick segments for wear
263 * leveling.
264 *
265 * If all appropriate lists are full, we simply free the candidate and forget
266 * about that segment for a while. We have better candidates for each purpose.
267 */
268static void __add_candidate(struct super_block *sb, struct gc_candidate *cand)
269{
270 struct logfs_super *super = logfs_super(sb);
271 u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE;
272
273 if (cand->valid == 0) {
274 /* 100% free segments */
275 log_gc_noisy("add reserve segment %x (ec %x) at %llx\n",
276 cand->segno, cand->erase_count,
277 dev_ofs(sb, cand->segno, 0));
278 cand = add_list(cand, &super->s_reserve_list);
279 if (cand) {
280 log_gc_noisy("add free segment %x (ec %x) at %llx\n",
281 cand->segno, cand->erase_count,
282 dev_ofs(sb, cand->segno, 0));
283 cand = add_list(cand, &super->s_free_list);
284 }
285 } else {
286 /* good candidates for Garbage Collection */
287 if (cand->valid < full)
288 cand = add_list(cand, &super->s_low_list[cand->dist]);
 289 /* good candidates for wear leveling;
 290 * segments that were recently written get ignored */
291 if (cand)
292 cand = add_list(cand, &super->s_ec_list);
293 }
294 if (cand)
295 free_candidate(sb, cand);
296}
297
298static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec,
299 u8 dist)
300{
301 struct logfs_super *super = logfs_super(sb);
302 struct gc_candidate *cand;
303
304 cand = kmalloc(sizeof(*cand), GFP_NOFS);
305 if (!cand)
306 return -ENOMEM;
307
308 cand->segno = segno;
309 cand->valid = valid;
310 cand->erase_count = ec;
311 cand->dist = dist;
312
313 btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS);
314 __add_candidate(sb, cand);
315 return 0;
316}
317
318static void remove_segment_from_lists(struct super_block *sb, u32 segno)
319{
320 struct logfs_super *super = logfs_super(sb);
321 struct gc_candidate *cand;
322
323 cand = btree_lookup32(&super->s_cand_tree, segno);
324 if (cand) {
325 remove_from_list(cand);
326 free_candidate(sb, cand);
327 }
328}
329
330static void scan_segment(struct super_block *sb, u32 segno)
331{
332 u32 valid, ec = 0;
333 gc_level_t gc_level = 0;
334 u8 dist;
335
336 if (segment_is_reserved(sb, segno))
337 return;
338
339 remove_segment_from_lists(sb, segno);
340 valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
341 if (valid == RESERVED)
342 return;
343
344 dist = root_distance(sb, gc_level);
345 add_candidate(sb, segno, valid, ec, dist);
346}
347
348static struct gc_candidate *first_in_list(struct candidate_list *list)
349{
350 if (list->count == 0)
351 return NULL;
352 return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
353}
354
355/*
356 * Find the best segment for garbage collection. Main criterion is
357 * the segment requiring the least effort to clean. Secondary
358 * criterion is to GC on the lowest level available.
359 *
360 * So we search the least effort segment on the lowest level first,
 361 * then move up and pick another segment only if it requires significantly
362 * less effort. Hence the LOGFS_MAX_OBJECTSIZE in the comparison.
363 */
364static struct gc_candidate *get_candidate(struct super_block *sb)
365{
366 struct logfs_super *super = logfs_super(sb);
367 int i, max_dist;
368 struct gc_candidate *cand = NULL, *this;
369
370 max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS);
371
372 for (i = max_dist; i >= 0; i--) {
373 this = first_in_list(&super->s_low_list[i]);
374 if (!this)
375 continue;
376 if (!cand)
377 cand = this;
378 if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid)
379 cand = this;
380 }
381 return cand;
382}
383
384static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
385{
386 struct logfs_super *super = logfs_super(sb);
387 gc_level_t gc_level;
388 u32 cleaned, valid, segno, ec;
389 u8 dist;
390
391 if (!cand) {
392 log_gc("GC attempted, but no candidate found\n");
393 return 0;
394 }
395
396 segno = cand->segno;
397 dist = cand->dist;
398 valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
399 free_candidate(sb, cand);
400 log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n",
401 segno, (u64)segno << super->s_segshift,
402 dist, no_free_segments(sb), valid,
403 super->s_free_bytes);
404 cleaned = logfs_gc_segment(sb, segno, dist);
405 log_gc("GC segment #%02x complete - now %x valid\n", segno,
406 valid - cleaned);
407 BUG_ON(cleaned != valid);
408 return 1;
409}
410
411static int logfs_gc_once(struct super_block *sb)
412{
413 struct gc_candidate *cand;
414
415 cand = get_candidate(sb);
416 if (cand)
417 remove_from_list(cand);
418 return __logfs_gc_once(sb, cand);
419}
420
421/* returns 1 if a wrap occurs, 0 otherwise */
422static int logfs_scan_some(struct super_block *sb)
423{
424 struct logfs_super *super = logfs_super(sb);
425 u32 segno;
426 int i, ret = 0;
427
428 segno = super->s_sweeper;
429 for (i = SCAN_RATIO; i > 0; i--) {
430 segno++;
431 if (segno >= super->s_no_segs) {
432 segno = 0;
433 ret = 1;
434 /* Break out of the loop. We want to read a single
 435 * block from the segment file on the next invocation
 436 * if SCAN_RATIO is set to match the block size.
437 */
438 break;
439 }
440
441 scan_segment(sb, segno);
442 }
443 super->s_sweeper = segno;
444 return ret;
445}
446
447/*
448 * In principle, this function should loop forever, looking for GC candidates
449 * and moving data. LogFS is designed in such a way that this loop is
450 * guaranteed to terminate.
451 *
452 * Limiting the loop to some iterations serves purely to catch cases when
453 * these guarantees have failed. An actual endless loop is an obvious bug
454 * and should be reported as such.
455 */
456static void __logfs_gc_pass(struct super_block *sb, int target)
457{
458 struct logfs_super *super = logfs_super(sb);
459 struct logfs_block *block;
460 int round, progress, last_progress = 0;
461
462 /*
463 * Doing too many changes to the segfile at once would result
464 * in a large number of aliases. Write the journal before
465 * things get out of hand.
466 */
467 if (super->s_shadow_tree.no_shadowed_segments >= MAX_OBJ_ALIASES)
468 logfs_write_anchor(sb);
469
470 if (no_free_segments(sb) >= target &&
471 super->s_no_object_aliases < MAX_OBJ_ALIASES)
472 return;
473
474 log_gc("__logfs_gc_pass(%x)\n", target);
475 for (round = 0; round < SCAN_ROUNDS; ) {
476 if (no_free_segments(sb) >= target)
477 goto write_alias;
478
479 /* Sync in-memory state with on-medium state in case they
480 * diverged */
481 logfs_write_anchor(sb);
482 round += logfs_scan_some(sb);
483 if (no_free_segments(sb) >= target)
484 goto write_alias;
485 progress = logfs_gc_once(sb);
486 if (progress)
487 last_progress = round;
488 else if (round - last_progress > 2)
489 break;
490 continue;
491
492 /*
 493 * The goto logic is nasty; I just don't know a better way to
494 * code it. GC is supposed to ensure two things:
495 * 1. Enough free segments are available.
496 * 2. The number of aliases is bounded.
497 * When 1. is achieved, we take a look at 2. and write back
498 * some alias-containing blocks, if necessary. However, after
499 * each such write we need to go back to 1., as writes can
500 * consume free segments.
501 */
502write_alias:
503 if (super->s_no_object_aliases < MAX_OBJ_ALIASES)
504 return;
505 if (list_empty(&super->s_object_alias)) {
506 /* All aliases are still in btree */
507 return;
508 }
509 log_gc("Write back one alias\n");
510 block = list_entry(super->s_object_alias.next,
511 struct logfs_block, alias_list);
512 block->ops->write_block(block);
513 /*
514 * To round off the nasty goto logic, we reset round here. It
 515 * is a safety-net against GC not making any progress, limited
 516 * to something reasonably small. If we incremented it for every
517 * single alias, the loop could terminate rather quickly.
518 */
519 round = 0;
520 }
521 LOGFS_BUG(sb);
522}
523
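/*
 * Allow one wear-leveling pass per WL_RATELIMIT global erases: returns
 * 0 and schedules the next event once the global erase count has
 * passed *next_event, 1 if the caller should skip this pass.
 */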
524static int wl_ratelimit(struct super_block *sb, u64 *next_event)
525{
526 struct logfs_super *super = logfs_super(sb);
527
528 if (*next_event < super->s_gec) {
529 *next_event = super->s_gec + WL_RATELIMIT;
530 return 0;
531 }
532 return 1;
533}
534
535static void logfs_wl_pass(struct super_block *sb)
536{
537 struct logfs_super *super = logfs_super(sb);
538 struct gc_candidate *wl_cand, *free_cand;
539
540 if (wl_ratelimit(sb, &super->s_wl_gec_ostore))
541 return;
542
543 wl_cand = first_in_list(&super->s_ec_list);
544 if (!wl_cand)
545 return;
546 free_cand = first_in_list(&super->s_free_list);
547 if (!free_cand)
548 return;
549
550 if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) {
551 remove_from_list(wl_cand);
552 __logfs_gc_once(sb, wl_cand);
553 }
554}
555
556/*
557 * The journal needs wear leveling as well. But moving the journal is an
558 * expensive operation so we try to avoid it as much as possible. And if we
559 * have to do it, we move the whole journal, not individual segments.
560 *
 561 * Ratelimiting is not strictly necessary here; it mainly serves to avoid the
562 * calculations. First we check whether moving the journal would be a
563 * significant improvement. That means that a) the current journal segments
564 * have more wear than the future journal segments and b) the current journal
565 * segments have more wear than normal ostore segments.
566 * Rationale for b) is that we don't have to move the journal if it is aging
567 * less than the ostore, even if the reserve segments age even less (they are
568 * excluded from wear leveling, after all).
569 * Next we check that the superblocks have less wear than the journal. Since
570 * moving the journal requires writing the superblocks, we have to protect the
571 * superblocks even more than the journal.
572 *
573 * Also we double the acceptable wear difference, compared to ostore wear
574 * leveling. Journal data is read and rewritten rapidly, comparatively. So
575 * soft errors have much less time to accumulate and we allow the journal to
576 * be a bit worse than the ostore.
577 */
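/*
 * With hypothetical numbers: assuming WL_DELTA were 1024, a journal
 * whose least-worn segment has erase count 5000 would only be moved
 * while the most worn of the best free segment and the two superblock
 * segments stayed below 5000 - 2 * 1024 == 2952.
 */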
578static void logfs_journal_wl_pass(struct super_block *sb)
579{
580 struct logfs_super *super = logfs_super(sb);
581 struct gc_candidate *cand;
582 u32 min_journal_ec = -1, max_reserve_ec = 0;
583 int i;
584
585 if (wl_ratelimit(sb, &super->s_wl_gec_journal))
586 return;
587
588 if (super->s_reserve_list.count < super->s_no_journal_segs) {
589 /* Reserve is not full enough to move complete journal */
590 return;
591 }
592
593 journal_for_each(i)
594 if (super->s_journal_seg[i])
595 min_journal_ec = min(min_journal_ec,
596 super->s_journal_ec[i]);
597 cand = rb_entry(rb_first(&super->s_free_list.rb_tree),
598 struct gc_candidate, rb_node);
599 max_reserve_ec = cand->erase_count;
600 for (i = 0; i < 2; i++) {
601 struct logfs_segment_entry se;
602 u32 segno = seg_no(sb, super->s_sb_ofs[i]);
603 u32 ec;
604
605 logfs_get_segment_entry(sb, segno, &se);
606 ec = be32_to_cpu(se.ec_level) >> 4;
607 max_reserve_ec = max(max_reserve_ec, ec);
608 }
609
610 if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) {
611 do_logfs_journal_wl_pass(sb);
612 }
613}
614
615void logfs_gc_pass(struct super_block *sb)
616{
617 struct logfs_super *super = logfs_super(sb);
618
619 //BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex));
 620 /* Write journal before free space gets saturated with dirty
621 * objects.
622 */
623 if (super->s_dirty_used_bytes + super->s_dirty_free_bytes
624 + LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes)
625 logfs_write_anchor(sb);
626 __logfs_gc_pass(sb, super->s_total_levels);
627 logfs_wl_pass(sb);
628 logfs_journal_wl_pass(sb);
629}
630
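/*
 * Replay an open area after an unclean shutdown: walk object headers
 * from the accounted write position to the first erased (all-0xff)
 * header and extend a_used_bytes over objects that reached the medium.
 * A header with a bad CRC indicates an interrupted write and stops the
 * replay.
 */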
631static int check_area(struct super_block *sb, int i)
632{
633 struct logfs_super *super = logfs_super(sb);
634 struct logfs_area *area = super->s_area[i];
635 struct logfs_object_header oh;
636 u32 segno = area->a_segno;
637 u32 ofs = area->a_used_bytes;
638 __be32 crc;
639 int err;
640
641 if (!area->a_is_open)
642 return 0;
643
644 for (ofs = area->a_used_bytes;
645 ofs <= super->s_segsize - sizeof(oh);
646 ofs += (u32)be16_to_cpu(oh.len) + sizeof(oh)) {
647 err = wbuf_read(sb, dev_ofs(sb, segno, ofs), sizeof(oh), &oh);
648 if (err)
649 return err;
650
651 if (!memchr_inv(&oh, 0xff, sizeof(oh)))
652 break;
653
654 crc = logfs_crc32(&oh, sizeof(oh) - 4, 4);
655 if (crc != oh.crc) {
656 printk(KERN_INFO "interrupted header at %llx\n",
657 dev_ofs(sb, segno, ofs));
658 return 0;
659 }
660 }
661 if (ofs != area->a_used_bytes) {
662 printk(KERN_INFO "%x bytes unaccounted data found at %llx\n",
663 ofs - area->a_used_bytes,
664 dev_ofs(sb, segno, area->a_used_bytes));
665 area->a_used_bytes = ofs;
666 }
667 return 0;
668}
669
670int logfs_check_areas(struct super_block *sb)
671{
672 int i, err;
673
674 for_each_area(i) {
675 err = check_area(sb, i);
676 if (err)
677 return err;
678 }
679 return 0;
680}
681
682static void logfs_init_candlist(struct candidate_list *list, int maxcount,
683 int sort_by_ec)
684{
685 list->count = 0;
686 list->maxcount = maxcount;
687 list->sort_by_ec = sort_by_ec;
688 list->rb_tree = RB_ROOT;
689}
690
691int logfs_init_gc(struct super_block *sb)
692{
693 struct logfs_super *super = logfs_super(sb);
694 int i;
695
696 btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool);
697 logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1);
698 logfs_init_candlist(&super->s_reserve_list,
699 super->s_bad_seg_reserve, 1);
700 for_each_area(i)
701 logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0);
702 logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1);
703 return 0;
704}
705
706static void logfs_cleanup_list(struct super_block *sb,
707 struct candidate_list *list)
708{
709 struct gc_candidate *cand;
710
711 while (list->count) {
712 cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate,
713 rb_node);
714 remove_from_list(cand);
715 free_candidate(sb, cand);
716 }
717 BUG_ON(list->rb_tree.rb_node);
718}
719
720void logfs_cleanup_gc(struct super_block *sb)
721{
722 struct logfs_super *super = logfs_super(sb);
723 int i;
724
725 if (!super->s_free_list.count)
726 return;
727
728 /*
729 * FIXME: The btree may still contain a single empty node. So we
730 * call the grim visitor to clean up that mess. Btree code should
731 * do it for us, really.
732 */
733 btree_grim_visitor32(&super->s_cand_tree, 0, NULL);
734 logfs_cleanup_list(sb, &super->s_free_list);
735 logfs_cleanup_list(sb, &super->s_reserve_list);
736 for_each_area(i)
737 logfs_cleanup_list(sb, &super->s_low_list[i]);
738 logfs_cleanup_list(sb, &super->s_ec_list);
739}
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
new file mode 100644
index 000000000000..14ed27274da2
--- /dev/null
+++ b/fs/logfs/inode.c
@@ -0,0 +1,418 @@
1/*
2 * fs/logfs/inode.c - inode handling code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/slab.h>
10#include <linux/writeback.h>
11#include <linux/backing-dev.h>
12
13/*
14 * How soon to reuse old inode numbers? LogFS doesn't store deleted inodes
15 * on the medium. It therefore also lacks a method to store the previous
16 * generation number for deleted inodes. Instead a single generation number
17 * is stored which will be used for new inodes. Being just a 32bit counter,
18 * this can obvious wrap relatively quickly. So we only reuse inodes if we
19 * know that a fair number of inodes can be created before we have to increment
20 * the generation again - effectively adding some bits to the counter.
21 * But being too aggressive here means we keep a very large and very sparse
22 * inode file, wasting space on indirect blocks.
23 * So what is a good value? Beats me. 64k seems moderately bad on both
24 * fronts, so let's use that for now...
25 *
26 * NFS sucks, as everyone already knows.
27 */
28#define INOS_PER_WRAP (0x10000)
29
30/*
31 * Logfs' requirement to read inodes for garbage collection makes life a bit
32 * harder. GC may have to read inodes that are in I_FREEING state, when they
33 * are being written out - and waiting for GC to make progress, naturally.
34 *
35 * So we cannot just call iget() or some variant of it, but first have to check
 36 * whether the inode in question might be in I_FREEING state. Therefore we
37 * maintain our own per-sb list of "almost deleted" inodes and check against
38 * that list first. Normally this should be at most 1-2 entries long.
39 *
40 * Also, inodes have logfs-specific reference counting on top of what the vfs
41 * does. When .destroy_inode is called, normally the reference count will drop
42 * to zero and the inode gets deleted. But if GC accessed the inode, its
43 * refcount will remain nonzero and final deletion will have to wait.
44 *
45 * As a result we have two sets of functions to get/put inodes:
46 * logfs_safe_iget/logfs_safe_iput - safe to call from GC context
47 * logfs_iget/iput - normal version
48 */
49static struct kmem_cache *logfs_inode_cache;
50
51static DEFINE_SPINLOCK(logfs_inode_lock);
52
53static void logfs_inode_setops(struct inode *inode)
54{
55 switch (inode->i_mode & S_IFMT) {
56 case S_IFDIR:
57 inode->i_op = &logfs_dir_iops;
58 inode->i_fop = &logfs_dir_fops;
59 inode->i_mapping->a_ops = &logfs_reg_aops;
60 break;
61 case S_IFREG:
62 inode->i_op = &logfs_reg_iops;
63 inode->i_fop = &logfs_reg_fops;
64 inode->i_mapping->a_ops = &logfs_reg_aops;
65 break;
66 case S_IFLNK:
67 inode->i_op = &logfs_symlink_iops;
68 inode->i_mapping->a_ops = &logfs_reg_aops;
69 break;
70 case S_IFSOCK: /* fall through */
71 case S_IFBLK: /* fall through */
72 case S_IFCHR: /* fall through */
73 case S_IFIFO:
74 init_special_inode(inode, inode->i_mode, inode->i_rdev);
75 break;
76 default:
77 BUG();
78 }
79}
80
81static struct inode *__logfs_iget(struct super_block *sb, ino_t ino)
82{
83 struct inode *inode = iget_locked(sb, ino);
84 int err;
85
86 if (!inode)
87 return ERR_PTR(-ENOMEM);
88 if (!(inode->i_state & I_NEW))
89 return inode;
90
91 err = logfs_read_inode(inode);
92 if (err || inode->i_nlink == 0) {
93 /* inode->i_nlink == 0 can be true when called from
94 * block validator */
95 /* set i_nlink to 0 to prevent caching */
96 inode->i_nlink = 0;
97 logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE;
98 iget_failed(inode);
99 if (!err)
100 err = -ENOENT;
101 return ERR_PTR(err);
102 }
103
104 logfs_inode_setops(inode);
105 unlock_new_inode(inode);
106 return inode;
107}
108
109struct inode *logfs_iget(struct super_block *sb, ino_t ino)
110{
111 BUG_ON(ino == LOGFS_INO_MASTER);
112 BUG_ON(ino == LOGFS_INO_SEGFILE);
113 return __logfs_iget(sb, ino);
114}
115
116/*
117 * is_cached is set to 1 if we hand out a cached inode, 0 otherwise.
118 * this allows logfs_iput to do the right thing later
119 */
120struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached)
121{
122 struct logfs_super *super = logfs_super(sb);
123 struct logfs_inode *li;
124
125 if (ino == LOGFS_INO_MASTER)
126 return super->s_master_inode;
127 if (ino == LOGFS_INO_SEGFILE)
128 return super->s_segfile_inode;
129
130 spin_lock(&logfs_inode_lock);
131 list_for_each_entry(li, &super->s_freeing_list, li_freeing_list)
132 if (li->vfs_inode.i_ino == ino) {
133 li->li_refcount++;
134 spin_unlock(&logfs_inode_lock);
135 *is_cached = 1;
136 return &li->vfs_inode;
137 }
138 spin_unlock(&logfs_inode_lock);
139
140 *is_cached = 0;
141 return __logfs_iget(sb, ino);
142}
143
144static void __logfs_destroy_inode(struct inode *inode)
145{
146 struct logfs_inode *li = logfs_inode(inode);
147
148 BUG_ON(li->li_block);
149 list_del(&li->li_freeing_list);
150 kmem_cache_free(logfs_inode_cache, li);
151}
152
153static void logfs_destroy_inode(struct inode *inode)
154{
155 struct logfs_inode *li = logfs_inode(inode);
156
157 BUG_ON(list_empty(&li->li_freeing_list));
158 spin_lock(&logfs_inode_lock);
159 li->li_refcount--;
160 if (li->li_refcount == 0)
161 __logfs_destroy_inode(inode);
162 spin_unlock(&logfs_inode_lock);
163}
164
165void logfs_safe_iput(struct inode *inode, int is_cached)
166{
167 if (inode->i_ino == LOGFS_INO_MASTER)
168 return;
169 if (inode->i_ino == LOGFS_INO_SEGFILE)
170 return;
171
172 if (is_cached) {
173 logfs_destroy_inode(inode);
174 return;
175 }
176
177 iput(inode);
178}
179
180static void logfs_init_inode(struct super_block *sb, struct inode *inode)
181{
182 struct logfs_inode *li = logfs_inode(inode);
183 int i;
184
185 li->li_flags = 0;
186 li->li_height = 0;
187 li->li_used_bytes = 0;
188 li->li_block = NULL;
189 inode->i_uid = 0;
190 inode->i_gid = 0;
191 inode->i_size = 0;
192 inode->i_blocks = 0;
193 inode->i_ctime = CURRENT_TIME;
194 inode->i_mtime = CURRENT_TIME;
195 inode->i_nlink = 1;
196 INIT_LIST_HEAD(&li->li_freeing_list);
197
198 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
199 li->li_data[i] = 0;
200
201 return;
202}
203
204static struct inode *logfs_alloc_inode(struct super_block *sb)
205{
206 struct logfs_inode *li;
207
208 li = kmem_cache_alloc(logfs_inode_cache, GFP_NOFS);
209 if (!li)
210 return NULL;
211 logfs_init_inode(sb, &li->vfs_inode);
212 return &li->vfs_inode;
213}
214
215/*
216 * In logfs inodes are written to an inode file. The inode file, like any
 217 * other file, is managed with an inode. The inode file's inode, aka master
218 * inode, requires special handling in several respects. First, it cannot be
219 * written to the inode file, so it is stored in the journal instead.
220 *
221 * Secondly, this inode cannot be written back and destroyed before all other
222 * inodes have been written. The ordering is important. Linux' VFS is happily
223 * unaware of the ordering constraint and would ordinarily destroy the master
224 * inode at umount time while other inodes are still in use and dirty. Not
225 * good.
226 *
227 * So logfs makes sure the master inode is not written until all other inodes
228 * have been destroyed. Sadly, this method has another side-effect. The VFS
229 * will notice one remaining inode and print a frightening warning message.
230 * Worse, it is impossible to judge whether such a warning was caused by the
 231 * master inode or whether other inodes have leaked as well.
232 *
 233 * Our attempt at solving this is with logfs_new_meta_inode() below. Its
 234 * purpose is to create a new inode that will not trigger the warning if such
 235 * an inode is still in use. An ugly hack, no doubt. Suggestions for
236 * improvement are welcome.
237 */
238struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino)
239{
240 struct inode *inode;
241
242 inode = logfs_alloc_inode(sb);
243 if (!inode)
244 return ERR_PTR(-ENOMEM);
245
246 inode->i_mode = S_IFREG;
247 inode->i_ino = ino;
248 inode->i_sb = sb;
249
250 /* This is a blatant copy of alloc_inode code. We'd need alloc_inode
251 * to be nonstatic, alas. */
252 {
253 struct address_space * const mapping = &inode->i_data;
254
255 mapping->a_ops = &logfs_reg_aops;
256 mapping->host = inode;
257 mapping->flags = 0;
258 mapping_set_gfp_mask(mapping, GFP_NOFS);
259 mapping->assoc_mapping = NULL;
260 mapping->backing_dev_info = &default_backing_dev_info;
261 inode->i_mapping = mapping;
262 inode->i_nlink = 1;
263 }
264
265 return inode;
266}
267
268struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino)
269{
270 struct inode *inode;
271 int err;
272
273 inode = logfs_new_meta_inode(sb, ino);
274 if (IS_ERR(inode))
275 return inode;
276
277 err = logfs_read_inode(inode);
278 if (err) {
279 destroy_meta_inode(inode);
280 return ERR_PTR(err);
281 }
282 logfs_inode_setops(inode);
283 return inode;
284}
285
286static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
287{
288 int ret;
289 long flags = WF_LOCK;
290
291 /* Can only happen if creat() failed. Safe to skip. */
292 if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN)
293 return 0;
294
295 ret = __logfs_write_inode(inode, flags);
296 LOGFS_BUG_ON(ret, inode->i_sb);
297 return ret;
298}
299
300void destroy_meta_inode(struct inode *inode)
301{
302 if (inode) {
303 if (inode->i_data.nrpages)
304 truncate_inode_pages(&inode->i_data, 0);
305 logfs_clear_inode(inode);
306 kmem_cache_free(logfs_inode_cache, logfs_inode(inode));
307 }
308}
309
310/* called with inode_lock held */
311static void logfs_drop_inode(struct inode *inode)
312{
313 struct logfs_super *super = logfs_super(inode->i_sb);
314 struct logfs_inode *li = logfs_inode(inode);
315
316 spin_lock(&logfs_inode_lock);
317 list_move(&li->li_freeing_list, &super->s_freeing_list);
318 spin_unlock(&logfs_inode_lock);
319 generic_drop_inode(inode);
320}
321
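/*
 * Allocate the next inode number by seeking the first hole in the
 * inode file past s_last_ino. Every INOS_PER_WRAP allocations the
 * search restarts at LOGFS_RESERVED_INOS and the generation counter
 * is bumped, so reused inode numbers hand out a fresh generation.
 */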
322static void logfs_set_ino_generation(struct super_block *sb,
323 struct inode *inode)
324{
325 struct logfs_super *super = logfs_super(sb);
326 u64 ino;
327
328 mutex_lock(&super->s_journal_mutex);
329 ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino);
330 super->s_last_ino = ino;
331 super->s_inos_till_wrap--;
332 if (super->s_inos_till_wrap < 0) {
333 super->s_last_ino = LOGFS_RESERVED_INOS;
334 super->s_generation++;
335 super->s_inos_till_wrap = INOS_PER_WRAP;
336 }
337 inode->i_ino = ino;
338 inode->i_generation = super->s_generation;
339 mutex_unlock(&super->s_journal_mutex);
340}
341
342struct inode *logfs_new_inode(struct inode *dir, int mode)
343{
344 struct super_block *sb = dir->i_sb;
345 struct inode *inode;
346
347 inode = new_inode(sb);
348 if (!inode)
349 return ERR_PTR(-ENOMEM);
350
351 logfs_init_inode(sb, inode);
352
353 /* inherit parent flags */
354 logfs_inode(inode)->li_flags |=
355 logfs_inode(dir)->li_flags & LOGFS_FL_INHERITED;
356
357 inode->i_mode = mode;
358 logfs_set_ino_generation(sb, inode);
359
360 inode->i_uid = current_fsuid();
361 inode->i_gid = current_fsgid();
362 if (dir->i_mode & S_ISGID) {
363 inode->i_gid = dir->i_gid;
364 if (S_ISDIR(mode))
365 inode->i_mode |= S_ISGID;
366 }
367
368 logfs_inode_setops(inode);
369 insert_inode_hash(inode);
370
371 return inode;
372}
373
374static void logfs_init_once(void *_li)
375{
376 struct logfs_inode *li = _li;
377 int i;
378
379 li->li_flags = 0;
380 li->li_used_bytes = 0;
381 li->li_refcount = 1;
382 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
383 li->li_data[i] = 0;
384 inode_init_once(&li->vfs_inode);
385}
386
387static int logfs_sync_fs(struct super_block *sb, int wait)
388{
389 /* FIXME: write anchor */
390 logfs_super(sb)->s_devops->sync(sb);
391 return 0;
392}
393
394const struct super_operations logfs_super_operations = {
395 .alloc_inode = logfs_alloc_inode,
396 .clear_inode = logfs_clear_inode,
397 .delete_inode = logfs_delete_inode,
398 .destroy_inode = logfs_destroy_inode,
399 .drop_inode = logfs_drop_inode,
400 .write_inode = logfs_write_inode,
401 .statfs = logfs_statfs,
402 .sync_fs = logfs_sync_fs,
403};
404
405int logfs_init_inode_cache(void)
406{
407 logfs_inode_cache = kmem_cache_create("logfs_inode_cache",
408 sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT,
409 logfs_init_once);
410 if (!logfs_inode_cache)
411 return -ENOMEM;
412 return 0;
413}
414
415void logfs_destroy_inode_cache(void)
416{
417 kmem_cache_destroy(logfs_inode_cache);
418}
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
new file mode 100644
index 000000000000..fb0a613f885b
--- /dev/null
+++ b/fs/logfs/journal.c
@@ -0,0 +1,898 @@
1/*
2 * fs/logfs/journal.c - journal handling code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/slab.h>
10
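/*
 * Recompute s_free_bytes: start with all segments, subtract the two
 * superblock segments, the journal segments and two open segments per
 * level, then subtract used bytes, a little slack per level and the
 * larger of the bad-segment and speed reserves. Clamped at zero.
 */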
11static void logfs_calc_free(struct super_block *sb)
12{
13 struct logfs_super *super = logfs_super(sb);
14 u64 reserve, no_segs = super->s_no_segs;
15 s64 free;
16 int i;
17
18 /* superblock segments */
19 no_segs -= 2;
20 super->s_no_journal_segs = 0;
21 /* journal */
22 journal_for_each(i)
23 if (super->s_journal_seg[i]) {
24 no_segs--;
25 super->s_no_journal_segs++;
26 }
27
28 /* open segments plus one extra per level for GC */
29 no_segs -= 2 * super->s_total_levels;
30
31 free = no_segs * (super->s_segsize - LOGFS_SEGMENT_RESERVE);
32 free -= super->s_used_bytes;
33 /* just a bit extra */
34 free -= super->s_total_levels * 4096;
35
36 /* Bad blocks are 'paid' for with speed reserve - the filesystem
 37 * simply gets slower as bad blocks accumulate - until the bad blocks
 38 * exceed the speed reserve, at which point the filesystem gets smaller.
39 */
40 reserve = super->s_bad_segments + super->s_bad_seg_reserve;
41 reserve *= super->s_segsize - LOGFS_SEGMENT_RESERVE;
42 reserve = max(reserve, super->s_speed_reserve);
43 free -= reserve;
44 if (free < 0)
45 free = 0;
46
47 super->s_free_bytes = free;
48}
49
50static void reserve_sb_and_journal(struct super_block *sb)
51{
52 struct logfs_super *super = logfs_super(sb);
53 struct btree_head32 *head = &super->s_reserved_segments;
54 int i, err;
55
56 err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[0]), (void *)1,
57 GFP_KERNEL);
58 BUG_ON(err);
59
60 err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[1]), (void *)1,
61 GFP_KERNEL);
62 BUG_ON(err);
63
64 journal_for_each(i) {
65 if (!super->s_journal_seg[i])
66 continue;
67 err = btree_insert32(head, super->s_journal_seg[i], (void *)1,
68 GFP_KERNEL);
69 BUG_ON(err);
70 }
71}
72
73static void read_dynsb(struct super_block *sb,
74 struct logfs_je_dynsb *dynsb)
75{
76 struct logfs_super *super = logfs_super(sb);
77
78 super->s_gec = be64_to_cpu(dynsb->ds_gec);
79 super->s_sweeper = be64_to_cpu(dynsb->ds_sweeper);
80 super->s_victim_ino = be64_to_cpu(dynsb->ds_victim_ino);
81 super->s_rename_dir = be64_to_cpu(dynsb->ds_rename_dir);
82 super->s_rename_pos = be64_to_cpu(dynsb->ds_rename_pos);
83 super->s_used_bytes = be64_to_cpu(dynsb->ds_used_bytes);
84 super->s_generation = be32_to_cpu(dynsb->ds_generation);
85}
86
87static void read_anchor(struct super_block *sb,
88 struct logfs_je_anchor *da)
89{
90 struct logfs_super *super = logfs_super(sb);
91 struct inode *inode = super->s_master_inode;
92 struct logfs_inode *li = logfs_inode(inode);
93 int i;
94
95 super->s_last_ino = be64_to_cpu(da->da_last_ino);
96 li->li_flags = 0;
97 li->li_height = da->da_height;
98 i_size_write(inode, be64_to_cpu(da->da_size));
99 li->li_used_bytes = be64_to_cpu(da->da_used_bytes);
100
101 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
102 li->li_data[i] = be64_to_cpu(da->da_data[i]);
103}
104
105static void read_erasecount(struct super_block *sb,
106 struct logfs_je_journal_ec *ec)
107{
108 struct logfs_super *super = logfs_super(sb);
109 int i;
110
111 journal_for_each(i)
112 super->s_journal_ec[i] = be32_to_cpu(ec->ec[i]);
113}
114
115static int read_area(struct super_block *sb, struct logfs_je_area *a)
116{
117 struct logfs_super *super = logfs_super(sb);
118 struct logfs_area *area = super->s_area[a->gc_level];
119 u64 ofs;
120 u32 writemask = ~(super->s_writesize - 1);
121
122 if (a->gc_level >= LOGFS_NO_AREAS)
123 return -EIO;
124 if (a->vim != VIM_DEFAULT)
125 return -EIO; /* TODO: close area and continue */
126
127 area->a_used_bytes = be32_to_cpu(a->used_bytes);
128 area->a_written_bytes = area->a_used_bytes & writemask;
129 area->a_segno = be32_to_cpu(a->segno);
130 if (area->a_segno)
131 area->a_is_open = 1;
132
133 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
134 if (super->s_writesize > 1)
135 logfs_buf_recover(area, ofs, a + 1, super->s_writesize);
136 else
137 logfs_buf_recover(area, ofs, NULL, 0);
138 return 0;
139}
140
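/*
 * Copy or decompress a journal entry payload into @to, depending on
 * the header's h_compr field. h_len is the on-medium length, h_datalen
 * the uncompressed length.
 */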
141static void *unpack(void *from, void *to)
142{
143 struct logfs_journal_header *jh = from;
144 void *data = from + sizeof(struct logfs_journal_header);
145 int err;
146 size_t inlen, outlen;
147
148 inlen = be16_to_cpu(jh->h_len);
149 outlen = be16_to_cpu(jh->h_datalen);
150
151 if (jh->h_compr == COMPR_NONE)
152 memcpy(to, data, inlen);
153 else {
154 err = logfs_uncompress(data, to, inlen, outlen);
155 BUG_ON(err);
156 }
157 return to;
158}
159
160static int __read_je_header(struct super_block *sb, u64 ofs,
161 struct logfs_journal_header *jh)
162{
163 struct logfs_super *super = logfs_super(sb);
164 size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize)
165 + MAX_JOURNAL_HEADER;
166 u16 type, len, datalen;
167 int err;
168
169 /* read header only */
170 err = wbuf_read(sb, ofs, sizeof(*jh), jh);
171 if (err)
172 return err;
173 type = be16_to_cpu(jh->h_type);
174 len = be16_to_cpu(jh->h_len);
175 datalen = be16_to_cpu(jh->h_datalen);
176 if (len > sb->s_blocksize)
177 return -EIO;
178 if ((type < JE_FIRST) || (type > JE_LAST))
179 return -EIO;
180 if (datalen > bufsize)
181 return -EIO;
182 return 0;
183}
184
185static int __read_je_payload(struct super_block *sb, u64 ofs,
186 struct logfs_journal_header *jh)
187{
188 u16 len;
189 int err;
190
191 len = be16_to_cpu(jh->h_len);
192 err = wbuf_read(sb, ofs + sizeof(*jh), len, jh + 1);
193 if (err)
194 return err;
195 if (jh->h_crc != logfs_crc32(jh, len + sizeof(*jh), 4)) {
196 /* Old code was confused. It forgot about the header length
197 * and stopped calculating the crc 16 bytes before the end
198 * of data - ick!
199 * FIXME: Remove this hack once the old code is fixed.
200 */
201 if (jh->h_crc == logfs_crc32(jh, len, 4))
202 WARN_ON_ONCE(1);
203 else
204 return -EIO;
205 }
206 return 0;
207}
208
209/*
210 * jh needs to be large enough to hold the complete entry, not just the header
211 */
212static int __read_je(struct super_block *sb, u64 ofs,
213 struct logfs_journal_header *jh)
214{
215 int err;
216
217 err = __read_je_header(sb, ofs, jh);
218 if (err)
219 return err;
220 return __read_je_payload(sb, ofs, jh);
221}
222
223static int read_je(struct super_block *sb, u64 ofs)
224{
225 struct logfs_super *super = logfs_super(sb);
226 struct logfs_journal_header *jh = super->s_compressed_je;
227 void *scratch = super->s_je;
228 u16 type, datalen;
229 int err;
230
231 err = __read_je(sb, ofs, jh);
232 if (err)
233 return err;
234 type = be16_to_cpu(jh->h_type);
235 datalen = be16_to_cpu(jh->h_datalen);
236
237 switch (type) {
238 case JE_DYNSB:
239 read_dynsb(sb, unpack(jh, scratch));
240 break;
241 case JE_ANCHOR:
242 read_anchor(sb, unpack(jh, scratch));
243 break;
244 case JE_ERASECOUNT:
245 read_erasecount(sb, unpack(jh, scratch));
246 break;
247 case JE_AREA:
248 read_area(sb, unpack(jh, scratch));
249 break;
250 case JE_OBJ_ALIAS:
251 err = logfs_load_object_aliases(sb, unpack(jh, scratch),
252 datalen);
253 break;
254 default:
255 WARN_ON_ONCE(1);
256 return -EIO;
257 }
258 return err;
259}
260
261static int logfs_read_segment(struct super_block *sb, u32 segno)
262{
263 struct logfs_super *super = logfs_super(sb);
264 struct logfs_journal_header *jh = super->s_compressed_je;
265 u64 ofs, seg_ofs = dev_ofs(sb, segno, 0);
266 u32 h_ofs, last_ofs = 0;
267 u16 len, datalen, last_len = 0;
268 int i, err;
269
270 /* search for most recent commit */
271 for (h_ofs = 0; h_ofs < super->s_segsize; h_ofs += sizeof(*jh)) {
272 ofs = seg_ofs + h_ofs;
273 err = __read_je_header(sb, ofs, jh);
274 if (err)
275 continue;
276 if (jh->h_type != cpu_to_be16(JE_COMMIT))
277 continue;
278 err = __read_je_payload(sb, ofs, jh);
279 if (err)
280 continue;
281 len = be16_to_cpu(jh->h_len);
282 datalen = be16_to_cpu(jh->h_datalen);
283 if ((datalen > sizeof(super->s_je_array)) ||
284 (datalen % sizeof(__be64)))
285 continue;
286 last_ofs = h_ofs;
287 last_len = datalen;
288 h_ofs += ALIGN(len, sizeof(*jh)) - sizeof(*jh);
289 }
290 /* read commit */
291 if (last_ofs == 0)
292 return -ENOENT;
293 ofs = seg_ofs + last_ofs;
294 log_journal("Read commit from %llx\n", ofs);
295 err = __read_je(sb, ofs, jh);
296 BUG_ON(err); /* We should have caught it in the scan loop already */
297 if (err)
298 return err;
299 /* uncompress */
300 unpack(jh, super->s_je_array);
301 super->s_no_je = last_len / sizeof(__be64);
302 /* iterate over array */
303 for (i = 0; i < super->s_no_je; i++) {
304 err = read_je(sb, be64_to_cpu(super->s_je_array[i]));
305 if (err)
306 return err;
307 }
308 super->s_journal_area->a_segno = segno;
309 return 0;
310}
311
312static u64 read_gec(struct super_block *sb, u32 segno)
313{
314 struct logfs_segment_header sh;
315 __be32 crc;
316 int err;
317
318 if (!segno)
319 return 0;
320 err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
321 if (err)
322 return 0;
323 crc = logfs_crc32(&sh, sizeof(sh), 4);
324 if (crc != sh.crc) {
325 WARN_ON(sh.gec != cpu_to_be64(0xffffffffffffffffull));
326 /* Most likely it was just erased */
327 return 0;
328 }
329 return be64_to_cpu(sh.gec);
330}
331
332static int logfs_read_journal(struct super_block *sb)
333{
334 struct logfs_super *super = logfs_super(sb);
335 u64 gec[LOGFS_JOURNAL_SEGS], max;
336 u32 segno;
337 int i, max_i;
338
339 max = 0;
340 max_i = -1;
341 journal_for_each(i) {
342 segno = super->s_journal_seg[i];
343 gec[i] = read_gec(sb, super->s_journal_seg[i]);
344 if (gec[i] > max) {
345 max = gec[i];
346 max_i = i;
347 }
348 }
349 if (max_i == -1)
350 return -EIO;
351 /* FIXME: Try older segments in case of error */
352 return logfs_read_segment(sb, super->s_journal_seg[max_i]);
353}
354
355/*
356 * First search the current segment (outer loop), then pick the next segment
357 * in the array, skipping any zero entries (inner loop).
358 */
359static void journal_get_free_segment(struct logfs_area *area)
360{
361 struct logfs_super *super = logfs_super(area->a_sb);
362 int i;
363
364 journal_for_each(i) {
365 if (area->a_segno != super->s_journal_seg[i])
366 continue;
367
368 do {
369 i++;
370 if (i == LOGFS_JOURNAL_SEGS)
371 i = 0;
372 } while (!super->s_journal_seg[i]);
373
374 area->a_segno = super->s_journal_seg[i];
375 area->a_erase_count = ++(super->s_journal_ec[i]);
376 log_journal("Journal now at %x (ec %x)\n", area->a_segno,
377 area->a_erase_count);
378 return;
379 }
380 BUG();
381}
382
383static void journal_get_erase_count(struct logfs_area *area)
384{
385 /* erase count is stored globally and incremented in
386 * journal_get_free_segment() - nothing to do here */
387}
388
389static int journal_erase_segment(struct logfs_area *area)
390{
391 struct super_block *sb = area->a_sb;
392 union {
393 struct logfs_segment_header sh;
394 unsigned char c[ALIGN(sizeof(struct logfs_segment_header), 16)];
395 } u;
396 u64 ofs;
397 int err;
398
399 err = logfs_erase_segment(sb, area->a_segno, 1);
400 if (err)
401 return err;
402
403 memset(&u, 0, sizeof(u));
404 u.sh.pad = 0;
405 u.sh.type = SEG_JOURNAL;
406 u.sh.level = 0;
407 u.sh.segno = cpu_to_be32(area->a_segno);
408 u.sh.ec = cpu_to_be32(area->a_erase_count);
409 u.sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
410 u.sh.crc = logfs_crc32(&u.sh, sizeof(u.sh), 4);
411
412 /* This causes a bug in segment.c. Not yet. */
413 //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0);
414
415 ofs = dev_ofs(sb, area->a_segno, 0);
416 area->a_used_bytes = sizeof(u);
417 logfs_buf_write(area, ofs, &u, sizeof(u));
418 return 0;
419}
420
421static size_t __logfs_write_header(struct logfs_super *super,
422 struct logfs_journal_header *jh, size_t len, size_t datalen,
423 u16 type, u8 compr)
424{
425 jh->h_len = cpu_to_be16(len);
426 jh->h_type = cpu_to_be16(type);
427 jh->h_datalen = cpu_to_be16(datalen);
428 jh->h_compr = compr;
429 jh->h_pad[0] = 'H';
430 jh->h_pad[1] = 'E';
431 jh->h_pad[2] = 'A';
432 jh->h_pad[3] = 'D';
433 jh->h_pad[4] = 'R';
434 jh->h_crc = logfs_crc32(jh, len + sizeof(*jh), 4);
435 return ALIGN(len, 16) + sizeof(*jh);
436}
437
438static size_t logfs_write_header(struct logfs_super *super,
439 struct logfs_journal_header *jh, size_t datalen, u16 type)
440{
441 size_t len = datalen;
442
443 return __logfs_write_header(super, jh, len, datalen, type, COMPR_NONE);
444}
445
446static inline size_t logfs_journal_erasecount_size(struct logfs_super *super)
447{
448 return LOGFS_JOURNAL_SEGS * sizeof(__be32);
449}
450
451static void *logfs_write_erasecount(struct super_block *sb, void *_ec,
452 u16 *type, size_t *len)
453{
454 struct logfs_super *super = logfs_super(sb);
455 struct logfs_je_journal_ec *ec = _ec;
456 int i;
457
458 journal_for_each(i)
459 ec->ec[i] = cpu_to_be32(super->s_journal_ec[i]);
460 *type = JE_ERASECOUNT;
461 *len = logfs_journal_erasecount_size(super);
462 return ec;
463}
464
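/*
 * btree_grim_visitor64 callback: fold one shadow entry into the
 * free/used byte counters and the per-segment usage, then return it
 * to the shadow mempool.
 */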
465static void account_shadow(void *_shadow, unsigned long _sb, u64 ignore,
466 size_t ignore2)
467{
468 struct logfs_shadow *shadow = _shadow;
469 struct super_block *sb = (void *)_sb;
470 struct logfs_super *super = logfs_super(sb);
471
472 /* consume new space */
473 super->s_free_bytes -= shadow->new_len;
474 super->s_used_bytes += shadow->new_len;
475 super->s_dirty_used_bytes -= shadow->new_len;
476
477 /* free up old space */
478 super->s_free_bytes += shadow->old_len;
479 super->s_used_bytes -= shadow->old_len;
480 super->s_dirty_free_bytes -= shadow->old_len;
481
482 logfs_set_segment_used(sb, shadow->old_ofs, -shadow->old_len);
483 logfs_set_segment_used(sb, shadow->new_ofs, shadow->new_len);
484
485 log_journal("account_shadow(%llx, %llx, %x) %llx->%llx %x->%x\n",
486 shadow->ino, shadow->bix, shadow->gc_level,
487 shadow->old_ofs, shadow->new_ofs,
488 shadow->old_len, shadow->new_len);
489 mempool_free(shadow, super->s_shadow_pool);
490}
491
492static void account_shadows(struct super_block *sb)
493{
494 struct logfs_super *super = logfs_super(sb);
495 struct inode *inode = super->s_master_inode;
496 struct logfs_inode *li = logfs_inode(inode);
497 struct shadow_tree *tree = &super->s_shadow_tree;
498
499 btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow);
500 btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow);
501 btree_grim_visitor32(&tree->segment_map, 0, NULL);
502 tree->no_shadowed_segments = 0;
503
504 if (li->li_block) {
505 /*
506 * We never actually use the structure, when attached to the
507 * master inode. But it is easier to always free it here than
508 * to have checks in several places elsewhere when allocating
509 * it.
510 */
511 li->li_block->ops->free_block(sb, li->li_block);
512 }
513 BUG_ON((s64)li->li_used_bytes < 0);
514}
515
516static void *__logfs_write_anchor(struct super_block *sb, void *_da,
517 u16 *type, size_t *len)
518{
519 struct logfs_super *super = logfs_super(sb);
520 struct logfs_je_anchor *da = _da;
521 struct inode *inode = super->s_master_inode;
522 struct logfs_inode *li = logfs_inode(inode);
523 int i;
524
525 da->da_height = li->li_height;
526 da->da_last_ino = cpu_to_be64(super->s_last_ino);
527 da->da_size = cpu_to_be64(i_size_read(inode));
528 da->da_used_bytes = cpu_to_be64(li->li_used_bytes);
529 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
530 da->da_data[i] = cpu_to_be64(li->li_data[i]);
531 *type = JE_ANCHOR;
532 *len = sizeof(*da);
533 return da;
534}
535
536static void *logfs_write_dynsb(struct super_block *sb, void *_dynsb,
537 u16 *type, size_t *len)
538{
539 struct logfs_super *super = logfs_super(sb);
540 struct logfs_je_dynsb *dynsb = _dynsb;
541
542 dynsb->ds_gec = cpu_to_be64(super->s_gec);
543 dynsb->ds_sweeper = cpu_to_be64(super->s_sweeper);
544 dynsb->ds_victim_ino = cpu_to_be64(super->s_victim_ino);
545 dynsb->ds_rename_dir = cpu_to_be64(super->s_rename_dir);
546 dynsb->ds_rename_pos = cpu_to_be64(super->s_rename_pos);
547 dynsb->ds_used_bytes = cpu_to_be64(super->s_used_bytes);
548 dynsb->ds_generation = cpu_to_be32(super->s_generation);
549 *type = JE_DYNSB;
550 *len = sizeof(*dynsb);
551 return dynsb;
552}
553
554static void write_wbuf(struct super_block *sb, struct logfs_area *area,
555 void *wbuf)
556{
557 struct logfs_super *super = logfs_super(sb);
558 struct address_space *mapping = super->s_mapping_inode->i_mapping;
559 u64 ofs;
560 pgoff_t index;
561 int page_ofs;
562 struct page *page;
563
564 ofs = dev_ofs(sb, area->a_segno,
565 area->a_used_bytes & ~(super->s_writesize - 1));
566 index = ofs >> PAGE_SHIFT;
567 page_ofs = ofs & (PAGE_SIZE - 1);
568
569 page = find_lock_page(mapping, index);
570 BUG_ON(!page);
571 memcpy(wbuf, page_address(page) + page_ofs, super->s_writesize);
572 unlock_page(page);
573}
574
575static void *logfs_write_area(struct super_block *sb, void *_a,
576 u16 *type, size_t *len)
577{
578 struct logfs_super *super = logfs_super(sb);
579 struct logfs_area *area = super->s_area[super->s_sum_index];
580 struct logfs_je_area *a = _a;
581
582 a->vim = VIM_DEFAULT;
583 a->gc_level = super->s_sum_index;
584 a->used_bytes = cpu_to_be32(area->a_used_bytes);
585 a->segno = cpu_to_be32(area->a_segno);
586 if (super->s_writesize > 1)
587 write_wbuf(sb, area, a + 1);
588
589 *type = JE_AREA;
590 *len = sizeof(*a) + super->s_writesize;
591 return a;
592}
593
594static void *logfs_write_commit(struct super_block *sb, void *h,
595 u16 *type, size_t *len)
596{
597 struct logfs_super *super = logfs_super(sb);
598
599 *type = JE_COMMIT;
600 *len = super->s_no_je * sizeof(__be64);
601 return super->s_je_array;
602}
603
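/*
 * Build one journal entry in the scratch buffer: try zlib compression
 * and fall back to a plain copy when compression does not shrink the
 * payload or for anchor entries. The payload is padded to a 16-byte
 * boundary and the header crc covers header and payload.
 */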
604static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type,
605 size_t len)
606{
607 struct logfs_super *super = logfs_super(sb);
608 void *header = super->s_compressed_je;
609 void *data = header + sizeof(struct logfs_journal_header);
610 ssize_t compr_len, pad_len;
611 u8 compr = COMPR_ZLIB;
612
613 if (len == 0)
614 return logfs_write_header(super, header, 0, type);
615
616 BUG_ON(len > sb->s_blocksize);
617 compr_len = logfs_compress(buf, data, len, sb->s_blocksize);
618 if (compr_len < 0 || type == JE_ANCHOR) {
619 memcpy(data, buf, len);
620 compr_len = len;
621 compr = COMPR_NONE;
622 }
623
624 pad_len = ALIGN(compr_len, 16);
625 memset(data + compr_len, 0, pad_len - compr_len);
626
627 return __logfs_write_header(super, header, compr_len, len, type, compr);
628}
629
630static s64 logfs_get_free_bytes(struct logfs_area *area, size_t *bytes,
631 int must_pad)
632{
633 u32 writesize = logfs_super(area->a_sb)->s_writesize;
634 s32 ofs;
635 int ret;
636
637 ret = logfs_open_area(area, *bytes);
638 if (ret)
639 return -EAGAIN;
640
641 ofs = area->a_used_bytes;
642 area->a_used_bytes += *bytes;
643
644 if (must_pad) {
645 area->a_used_bytes = ALIGN(area->a_used_bytes, writesize);
646 *bytes = area->a_used_bytes - ofs;
647 }
648
649 return dev_ofs(area->a_sb, area->a_segno, ofs);
650}
651
652static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type,
653 size_t buf_len)
654{
655 struct logfs_super *super = logfs_super(sb);
656 struct logfs_area *area = super->s_journal_area;
657 struct logfs_journal_header *jh = super->s_compressed_je;
658 size_t len;
659 int must_pad = 0;
660 s64 ofs;
661
662 len = __logfs_write_je(sb, buf, type, buf_len);
663 if (jh->h_type == cpu_to_be16(JE_COMMIT))
664 must_pad = 1;
665
666 ofs = logfs_get_free_bytes(area, &len, must_pad);
667 if (ofs < 0)
668 return ofs;
669 logfs_buf_write(area, ofs, super->s_compressed_je, len);
670 BUG_ON(super->s_no_je >= MAX_JOURNAL_ENTRIES);
671 super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs);
672 return 0;
673}
674
675static int logfs_write_je(struct super_block *sb,
676 void* (*write)(struct super_block *sb, void *scratch,
677 u16 *type, size_t *len))
678{
679 void *buf;
680 size_t len;
681 u16 type;
682
683 buf = write(sb, logfs_super(sb)->s_je, &type, &len);
684 return logfs_write_je_buf(sb, buf, type, len);
685}
686
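/*
 * Buffer object aliases in s_je and flush a full block's worth as a
 * JE_OBJ_ALIAS entry; logfs_write_obj_aliases() below writes out the
 * final partial batch.
 */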
687int write_alias_journal(struct super_block *sb, u64 ino, u64 bix,
688 level_t level, int child_no, __be64 val)
689{
690 struct logfs_super *super = logfs_super(sb);
691 struct logfs_obj_alias *oa = super->s_je;
692 int err = 0, fill = super->s_je_fill;
693
694 log_aliases("logfs_write_obj_aliases #%x(%llx, %llx, %x, %x) %llx\n",
695 fill, ino, bix, level, child_no, be64_to_cpu(val));
696 oa[fill].ino = cpu_to_be64(ino);
697 oa[fill].bix = cpu_to_be64(bix);
698 oa[fill].val = val;
699 oa[fill].level = (__force u8)level;
700 oa[fill].child_no = cpu_to_be16(child_no);
701 fill++;
702 if (fill >= sb->s_blocksize / sizeof(*oa)) {
703 err = logfs_write_je_buf(sb, oa, JE_OBJ_ALIAS, sb->s_blocksize);
704 fill = 0;
705 }
706
707 super->s_je_fill = fill;
708 return err;
709}
710
711static int logfs_write_obj_aliases(struct super_block *sb)
712{
713 struct logfs_super *super = logfs_super(sb);
714 int err;
715
716 log_journal("logfs_write_obj_aliases: %d aliases to write\n",
717 super->s_no_object_aliases);
718 super->s_je_fill = 0;
719 err = logfs_write_obj_aliases_pagecache(sb);
720 if (err)
721 return err;
722
723 if (super->s_je_fill)
724 err = logfs_write_je_buf(sb, super->s_je, JE_OBJ_ALIAS,
725 super->s_je_fill
726 * sizeof(struct logfs_obj_alias));
727 return err;
728}
729
730/*
731 * Write all journal entries. The goto logic ensures that all journal entries
732 * are written whenever a new segment is used. It is ugly and potentially a
733 * bit wasteful, but robustness is more important. With this we can *always*
734 * erase all journal segments except the one containing the most recent commit.
735 */
736void logfs_write_anchor(struct super_block *sb)
737{
738 struct logfs_super *super = logfs_super(sb);
739 struct logfs_area *area = super->s_journal_area;
740 int i, err;
741
742 if (!(super->s_flags & LOGFS_SB_FLAG_DIRTY))
743 return;
744 super->s_flags &= ~LOGFS_SB_FLAG_DIRTY;
745
746 BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
747 mutex_lock(&super->s_journal_mutex);
748
749 /* Do this first or suffer corruption */
750 logfs_sync_segments(sb);
751 account_shadows(sb);
752
753again:
754 super->s_no_je = 0;
755 for_each_area(i) {
756 if (!super->s_area[i]->a_is_open)
757 continue;
758 super->s_sum_index = i;
759 err = logfs_write_je(sb, logfs_write_area);
760 if (err)
761 goto again;
762 }
763 err = logfs_write_obj_aliases(sb);
764 if (err)
765 goto again;
766 err = logfs_write_je(sb, logfs_write_erasecount);
767 if (err)
768 goto again;
769 err = logfs_write_je(sb, __logfs_write_anchor);
770 if (err)
771 goto again;
772 err = logfs_write_je(sb, logfs_write_dynsb);
773 if (err)
774 goto again;
775 /*
776 * Order is imperative. First we sync all writes, including the
777 * non-committed journal writes. Then we write the final commit and
778 * sync the current journal segment.
779 * There is a theoretical bug here. Syncing the journal segment will
780 * write a number of journal entries and the final commit. All these
781 * are written in a single operation. If the device layer writes the
782 * data back-to-front, the commit will precede the other journal
783 * entries, leaving a race window.
784 * Two fixes are possible. Preferred is to fix the device layer to
785 * ensure writes happen front-to-back. Alternatively we can insert
786 * another logfs_sync_area() super->s_devops->sync() combo before
787 * writing the commit.
788 */
789 /*
790 * On another subject, super->s_devops->sync is usually not necessary.
791 * Unless called from sys_sync or friends, a barrier would suffice.
792 */
793 super->s_devops->sync(sb);
794 err = logfs_write_je(sb, logfs_write_commit);
795 if (err)
796 goto again;
797 log_journal("Write commit to %llx\n",
798 be64_to_cpu(super->s_je_array[super->s_no_je - 1]));
799 logfs_sync_area(area);
800 BUG_ON(area->a_used_bytes != area->a_written_bytes);
801 super->s_devops->sync(sb);
802
803 mutex_unlock(&super->s_journal_mutex);
804 return;
805}
806
807void do_logfs_journal_wl_pass(struct super_block *sb)
808{
809 struct logfs_super *super = logfs_super(sb);
810 struct logfs_area *area = super->s_journal_area;
811 struct btree_head32 *head = &super->s_reserved_segments;
812 u32 segno, ec;
813 int i, err;
814
815 log_journal("Journal requires wear-leveling.\n");
816 /* Drop old segments */
817 journal_for_each(i)
818 if (super->s_journal_seg[i]) {
819 btree_remove32(head, super->s_journal_seg[i]);
820 logfs_set_segment_unreserved(sb,
821 super->s_journal_seg[i],
822 super->s_journal_ec[i]);
823 super->s_journal_seg[i] = 0;
824 super->s_journal_ec[i] = 0;
825 }
826 /* Get new segments */
827 for (i = 0; i < super->s_no_journal_segs; i++) {
828 segno = get_best_cand(sb, &super->s_reserve_list, &ec);
829 super->s_journal_seg[i] = segno;
830 super->s_journal_ec[i] = ec;
831 logfs_set_segment_reserved(sb, segno);
832 err = btree_insert32(head, segno, (void *)1, GFP_KERNEL);
833 BUG_ON(err); /* mempool should prevent this */
834 err = logfs_erase_segment(sb, segno, 1);
835 BUG_ON(err); /* FIXME: remount-ro would be nicer */
836 }
837 /* Manually move journal_area */
838 freeseg(sb, area->a_segno);
839 area->a_segno = super->s_journal_seg[0];
840 area->a_is_open = 0;
841 area->a_used_bytes = 0;
842 /* Write journal */
843 logfs_write_anchor(sb);
844 /* Write superblocks */
845 err = logfs_write_sb(sb);
846 BUG_ON(err);
847}
848
849static const struct logfs_area_ops journal_area_ops = {
850 .get_free_segment = journal_get_free_segment,
851 .get_erase_count = journal_get_erase_count,
852 .erase_segment = journal_erase_segment,
853};
854
855int logfs_init_journal(struct super_block *sb)
856{
857 struct logfs_super *super = logfs_super(sb);
858 size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize)
859 + MAX_JOURNAL_HEADER;
860 int ret = -ENOMEM;
861
862 mutex_init(&super->s_journal_mutex);
863 btree_init_mempool32(&super->s_reserved_segments, super->s_btree_pool);
864
865 super->s_je = kzalloc(bufsize, GFP_KERNEL);
866 if (!super->s_je)
867 return ret;
868
869 super->s_compressed_je = kzalloc(bufsize, GFP_KERNEL);
870 if (!super->s_compressed_je)
871 return ret;
872
873 super->s_master_inode = logfs_new_meta_inode(sb, LOGFS_INO_MASTER);
874 if (IS_ERR(super->s_master_inode))
875 return PTR_ERR(super->s_master_inode);
876
877 ret = logfs_read_journal(sb);
878 if (ret)
879 return -EIO;
880
881 reserve_sb_and_journal(sb);
882 logfs_calc_free(sb);
883
884 super->s_journal_area->a_ops = &journal_area_ops;
885 return 0;
886}
887
888void logfs_cleanup_journal(struct super_block *sb)
889{
890 struct logfs_super *super = logfs_super(sb);
891
892 btree_grim_visitor32(&super->s_reserved_segments, 0, NULL);
893 destroy_meta_inode(super->s_master_inode);
894 super->s_master_inode = NULL;
895
896 kfree(super->s_compressed_je);
897 kfree(super->s_je);
898}
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
new file mode 100644
index 000000000000..0a3df1a0c936
--- /dev/null
+++ b/fs/logfs/logfs.h
@@ -0,0 +1,736 @@
1/*
2 * fs/logfs/logfs.h
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Private header for logfs.
9 */
10#ifndef FS_LOGFS_LOGFS_H
11#define FS_LOGFS_LOGFS_H
12
13#undef __CHECK_ENDIAN__
14#define __CHECK_ENDIAN__
15
16#include <linux/btree.h>
17#include <linux/crc32.h>
18#include <linux/fs.h>
19#include <linux/kernel.h>
20#include <linux/mempool.h>
21#include <linux/pagemap.h>
22#include <linux/mtd/mtd.h>
23#include "logfs_abi.h"
24
25#define LOGFS_DEBUG_SUPER (0x0001)
26#define LOGFS_DEBUG_SEGMENT (0x0002)
27#define LOGFS_DEBUG_JOURNAL (0x0004)
28#define LOGFS_DEBUG_DIR (0x0008)
29#define LOGFS_DEBUG_FILE (0x0010)
30#define LOGFS_DEBUG_INODE (0x0020)
31#define LOGFS_DEBUG_READWRITE (0x0040)
32#define LOGFS_DEBUG_GC (0x0080)
33#define LOGFS_DEBUG_GC_NOISY (0x0100)
34#define LOGFS_DEBUG_ALIASES (0x0200)
35#define LOGFS_DEBUG_BLOCKMOVE (0x0400)
36#define LOGFS_DEBUG_ALL (0xffffffff)
37
38#define LOGFS_DEBUG (0x01)
39/*
40 * To enable specific log messages, simply define LOGFS_DEBUG to match any
41 * or all of the above.
42 */
43#ifndef LOGFS_DEBUG
44#define LOGFS_DEBUG (0)
45#endif
46
47#define log_cond(cond, fmt, arg...) do { \
48 if (cond) \
49 printk(KERN_DEBUG fmt, ##arg); \
50} while (0)
51
52#define log_super(fmt, arg...) \
53 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SUPER, fmt, ##arg)
54#define log_segment(fmt, arg...) \
55 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SEGMENT, fmt, ##arg)
56#define log_journal(fmt, arg...) \
57 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_JOURNAL, fmt, ##arg)
58#define log_dir(fmt, arg...) \
59 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_DIR, fmt, ##arg)
60#define log_file(fmt, arg...) \
61 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_FILE, fmt, ##arg)
62#define log_inode(fmt, arg...) \
63 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_INODE, fmt, ##arg)
64#define log_readwrite(fmt, arg...) \
65 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_READWRITE, fmt, ##arg)
66#define log_gc(fmt, arg...) \
67 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC, fmt, ##arg)
68#define log_gc_noisy(fmt, arg...) \
69 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC_NOISY, fmt, ##arg)
70#define log_aliases(fmt, arg...) \
71 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_ALIASES, fmt, ##arg)
72#define log_blockmove(fmt, arg...) \
73 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_BLOCKMOVE, fmt, ##arg)
74
75#define PG_pre_locked PG_owner_priv_1
76#define PagePreLocked(page) test_bit(PG_pre_locked, &(page)->flags)
77#define SetPagePreLocked(page) set_bit(PG_pre_locked, &(page)->flags)
78#define ClearPagePreLocked(page) clear_bit(PG_pre_locked, &(page)->flags)
79
80/* FIXME: This should really be somewhere in the 64bit area. */
81#define LOGFS_LINK_MAX (1<<30)
82
83/* Read-only filesystem */
84#define LOGFS_SB_FLAG_RO 0x0001
85#define LOGFS_SB_FLAG_DIRTY 0x0002
86#define LOGFS_SB_FLAG_OBJ_ALIAS 0x0004
87#define LOGFS_SB_FLAG_SHUTDOWN 0x0008
88
89/* Write Control Flags */
90#define WF_LOCK 0x01 /* take write lock */
91#define WF_WRITE 0x02 /* write block */
92#define WF_DELETE 0x04 /* delete old block */
93
94typedef u8 __bitwise level_t;
95typedef u8 __bitwise gc_level_t;
96
97#define LEVEL(level) ((__force level_t)(level))
98#define GC_LEVEL(gc_level) ((__force gc_level_t)(gc_level))
99
100#define SUBLEVEL(level) ( (void)((level) == LEVEL(1)), \
101 (__force level_t)((__force u8)(level) - 1) )
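/*
 * Editor's note: level_t and gc_level_t are __bitwise types, so sparse
 * flags plain arithmetic on them; SUBLEVEL() is the blessed way to step
 * from an indirect block down to the blocks it points to.  A minimal
 * sketch, illustrative only:
 */
static inline level_t logfs_sublevel_example(void)
{
	level_t level = LEVEL(2);	/* an i2 indirect block */

	return SUBLEVEL(level);		/* LEVEL(1): its children */
}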
102
103/**
104 * struct logfs_area - area management information
105 *
106 * @a_sb: the superblock this area belongs to
107 * @a_is_open: 1 if the area is currently open, else 0
108 * @a_segno: segment number of area
109 * @a_written_bytes: number of bytes already written back
110 * @a_used_bytes: number of used bytes
111 * @a_ops: area operations (either journal or ostore)
112 * @a_erase_count: erase count
113 * @a_level: GC level
114 */
115struct logfs_area { /* a segment open for writing */
116 struct super_block *a_sb;
117 int a_is_open;
118 u32 a_segno;
119 u32 a_written_bytes;
120 u32 a_used_bytes;
121 const struct logfs_area_ops *a_ops;
122 u32 a_erase_count;
123 gc_level_t a_level;
124};
125
126/**
127 * struct logfs_area_ops - area operations
128 *
129 * @get_free_segment: fill area->a_segno with a free segment's number
130 * @get_erase_count: fill area->a_erase_count (needs area->a_segno)
131 * @erase_segment: erase and setup segment
132 */
133struct logfs_area_ops {
134 void (*get_free_segment)(struct logfs_area *area);
135 void (*get_erase_count)(struct logfs_area *area);
136 int (*erase_segment)(struct logfs_area *area);
137};
138
139/**
140 * struct logfs_device_ops - device access operations
141 *
142 * @readpage: read one page (mm page)
143 * @writeseg: write one segment. May be a partial segment
144 * @erase: erase one segment
145 * @sync: block until all pending writes have reached the device
146 * @put_device: release the underlying device
147 */
148struct logfs_device_ops {
149 struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs);
150 struct page *(*find_last_sb)(struct super_block *sb, u64 *ofs);
151 int (*write_sb)(struct super_block *sb, struct page *page);
152 int (*readpage)(void *_sb, struct page *page);
153 void (*writeseg)(struct super_block *sb, u64 ofs, size_t len);
154 int (*erase)(struct super_block *sb, loff_t ofs, size_t len,
155 int ensure_write);
156 void (*sync)(struct super_block *sb);
157 void (*put_device)(struct super_block *sb);
158};
159
160/**
161 * struct candidate_list - list of similar candidates
162 */
163struct candidate_list {
164 struct rb_root rb_tree;
165 int count;
166 int maxcount;
167 int sort_by_ec;
168};
169
170/**
171 * struct gc_candidate - "candidate" segment to be garbage collected next
172 *
173 * @list: list (either free or low)
174 * @segno: segment number
175 * @valid: number of valid bytes
176 * @erase_count: erase count of segment
177 * @dist: distance from tree root
178 *
179 * Candidates can be on two lists. The free list contains electees rather
180 * than candidates - segments that no longer contain any valid data. The
181 * low list contains candidates to be picked for GC. It should be kept
182 * short. It is not required to always pick a perfect candidate. In the
183 * worst case GC will have to move more data than absolutely necessary.
184 */
185struct gc_candidate {
186 struct rb_node rb_node;
187 struct candidate_list *list;
188 u32 segno;
189 u32 valid;
190 u32 erase_count;
191 u8 dist;
192};
193
194/**
195 * struct logfs_journal_entry - temporary structure used during journal scan
196 *
197 * @used: non-zero if this entry slot is in use
198 * @version: normalized version
199 * @len: compressed length of the journal entry
200 * @offset: offset of the entry on the medium
201 */
202struct logfs_journal_entry {
203 int used;
204 s16 version;
205 u16 len;
206 u16 datalen;
207 u64 offset;
208};
209
210enum transaction_state {
211 CREATE_1 = 1,
212 CREATE_2,
213 UNLINK_1,
214 UNLINK_2,
215 CROSS_RENAME_1,
216 CROSS_RENAME_2,
217 TARGET_RENAME_1,
218 TARGET_RENAME_2,
219 TARGET_RENAME_3
220};
221
222/**
223 * struct logfs_transaction - essential fields to support atomic dirops
224 *
225 * @ino: target inode
226 * @dir: inode of directory containing dentry
227 * @pos: pos of dentry in directory
228 */
229struct logfs_transaction {
230 enum transaction_state state;
231 u64 ino;
232 u64 dir;
233 u64 pos;
234};
235
236/**
237 * struct logfs_shadow - old block in the shadow of a not-yet-committed new one
238 * @old_ofs: offset of old block on medium
239 * @new_ofs: offset of new block on medium
240 * @ino: inode number
241 * @bix: block index
242 * @old_len: size of old block, including header
243 * @new_len: size of new block, including header
244 * @level: block level
245 */
246struct logfs_shadow {
247 u64 old_ofs;
248 u64 new_ofs;
249 u64 ino;
250 u64 bix;
251 int old_len;
252 int new_len;
253 gc_level_t gc_level;
254};
255
256/**
257 * struct shadow_tree
258 * @new: shadows where old_ofs==0, indexed by new_ofs
259 * @old: shadows where old_ofs!=0, indexed by old_ofs
260 * @segment_map: bitfield of segments containing shadows
261 * @no_shadowed_segments: number of segments containing shadows
262 */
263struct shadow_tree {
264 struct btree_head64 new;
265 struct btree_head64 old;
266 struct btree_head32 segment_map;
267 int no_shadowed_segments;
268};
269
270struct object_alias_item {
271 struct list_head list;
272 __be64 val;
273 int child_no;
274};
275
276/**
277 * struct logfs_block - contains any block state
278 * @type: indirect block or inode
279 * @full: number of fully populated children
280 * @partial: number of partially populated children
281 *
282 * Most blocks are directly represented by page cache pages. But when a block
283 * becomes dirty, is part of a transaction, contains aliases or is otherwise
284 * special, a struct logfs_block is allocated to track the additional state.
285 * Inodes are very similar to indirect blocks, so they can also get one of
286 * these structures added when appropriate.
287 */
288#define BLOCK_INDIRECT 1 /* Indirect block */
289#define BLOCK_INODE 2 /* Inode */
290struct logfs_block_ops;
291struct logfs_block {
292 struct list_head alias_list;
293 struct list_head item_list;
294 struct super_block *sb;
295 u64 ino;
296 u64 bix;
297 level_t level;
298 struct page *page;
299 struct inode *inode;
300 struct logfs_transaction *ta;
301 unsigned long alias_map[LOGFS_BLOCK_FACTOR / BITS_PER_LONG];
302 struct logfs_block_ops *ops;
303 int full;
304 int partial;
305 int reserved_bytes;
306};
307
308typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix,
309 level_t level, int child_no, __be64 val);
310struct logfs_block_ops {
311 void (*write_block)(struct logfs_block *block);
312 void (*free_block)(struct super_block *sb, struct logfs_block *block);
313 int (*write_alias)(struct super_block *sb,
314 struct logfs_block *block,
315 write_alias_t *write_one_alias);
316};
317
318#define MAX_JOURNAL_ENTRIES 256
319
320struct logfs_super {
321 struct mtd_info *s_mtd; /* underlying device */
322 struct block_device *s_bdev; /* underlying device */
323 const struct logfs_device_ops *s_devops;/* device access */
324 struct inode *s_master_inode; /* inode file */
325 struct inode *s_segfile_inode; /* segment file */
326 struct inode *s_mapping_inode; /* device mapping */
327 atomic_t s_pending_writes; /* outstanding bios */
328 long s_flags;
329 mempool_t *s_btree_pool; /* for btree nodes */
330 mempool_t *s_alias_pool; /* aliases in segment.c */
331 u64 s_feature_incompat;
332 u64 s_feature_ro_compat;
333 u64 s_feature_compat;
334 u64 s_feature_flags;
335 u64 s_sb_ofs[2];
336 struct page *s_erase_page; /* for dev_bdev.c */
337 /* alias.c fields */
338 struct btree_head32 s_segment_alias; /* remapped segments */
339 int s_no_object_aliases;
340 struct list_head s_object_alias; /* remapped objects */
341 struct btree_head128 s_object_alias_tree; /* remapped objects */
342 struct mutex s_object_alias_mutex;
343 /* dir.c fields */
344 struct mutex s_dirop_mutex; /* for creat/unlink/rename */
345 u64 s_victim_ino; /* used for atomic dir-ops */
346 u64 s_rename_dir; /* source directory ino */
347 u64 s_rename_pos; /* position of source dd */
348 /* gc.c fields */
349 long s_segsize; /* size of a segment */
350 int s_segshift; /* log2 of segment size */
351 long s_segmask; /* (1 << s_segshift) - 1 */
352 long s_no_segs; /* segments on device */
353 long s_no_journal_segs; /* segments used for journal */
354 long s_no_blocks; /* blocks per segment */
355 long s_writesize; /* minimum write size */
356 int s_writeshift; /* log2 of write size */
357 u64 s_size; /* filesystem size */
358 struct logfs_area *s_area[LOGFS_NO_AREAS]; /* open segment array */
359 u64 s_gec; /* global erase count */
360 u64 s_wl_gec_ostore; /* time of last wl event */
361 u64 s_wl_gec_journal; /* time of last wl event */
362 u64 s_sweeper; /* current sweeper pos */
363 u8 s_ifile_levels; /* max level of ifile */
364 u8 s_iblock_levels; /* max level of regular files */
365 u8 s_data_levels; /* # of segments to leaf block*/
366 u8 s_total_levels; /* sum of above three */
367 struct btree_head32 s_cand_tree; /* all candidates */
368 struct candidate_list s_free_list; /* 100% free segments */
369 struct candidate_list s_reserve_list; /* Bad segment reserve */
370 struct candidate_list s_low_list[LOGFS_NO_AREAS];/* good candidates */
371 struct candidate_list s_ec_list; /* wear level candidates */
372 struct btree_head32 s_reserved_segments;/* sb, journal, bad, etc. */
373 /* inode.c fields */
374 u64 s_last_ino; /* highest ino used */
375 long s_inos_till_wrap;
376 u32 s_generation; /* i_generation for new files */
377 struct list_head s_freeing_list; /* inodes being freed */
378 /* journal.c fields */
379 struct mutex s_journal_mutex;
380 void *s_je; /* journal entry to compress */
381 void *s_compressed_je; /* block to write to journal */
382 u32 s_journal_seg[LOGFS_JOURNAL_SEGS]; /* journal segments */
383 u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */
384 u64 s_last_version;
385 struct logfs_area *s_journal_area; /* open journal segment */
386 __be64 s_je_array[MAX_JOURNAL_ENTRIES];
387 int s_no_je;
388
389 int s_sum_index; /* for the 12 summaries */
390 struct shadow_tree s_shadow_tree;
391 int s_je_fill; /* index of current je */
392 /* readwrite.c fields */
393 struct mutex s_write_mutex;
394 int s_lock_count;
395 mempool_t *s_block_pool; /* struct logfs_block pool */
396 mempool_t *s_shadow_pool; /* struct logfs_shadow pool */
397 /*
398 * Space accounting:
399 * - s_used_bytes specifies space used to store valid data objects.
400 * - s_dirty_used_bytes is space used to store non-committed data
401 * objects. Those objects have already been written themselves,
402 * but they don't become valid until all indirect blocks up to the
403 * journal have been written as well.
404 * - s_dirty_free_bytes is space used to store the old copy of a
405 * replaced object, as long as the replacement is non-committed.
406 * In other words, it is the amount of space freed when all dirty
407 * blocks are written back.
408 * - s_free_bytes is the amount of free space available for any
409 * purpose.
410 * - s_root_reserve is the amount of free space available only to
411 * the root user. Non-privileged users can no longer write once
412 * this watermark has been reached.
413 * - s_speed_reserve is space which remains unused to speed up
414 * garbage collection performance.
415 * - s_dirty_pages is the space reserved for currently dirty pages.
416 * It is a pessimistic estimate, so some/most will get freed on
417 * page writeback.
418 *
419 * s_used_bytes + s_free_bytes + s_speed_reserve = total usable size
420 */
421 u64 s_free_bytes;
422 u64 s_used_bytes;
423 u64 s_dirty_free_bytes;
424 u64 s_dirty_used_bytes;
425 u64 s_root_reserve;
426 u64 s_speed_reserve;
427 u64 s_dirty_pages;
428 /* Bad block handling:
429 * - s_bad_seg_reserve is a number of segments usually kept
430 * free. When encountering bad blocks, the affected segment's data
431 * is _temporarily_ moved to a reserved segment.
432 * - s_bad_segments is the number of known bad segments.
433 */
434 u32 s_bad_seg_reserve;
435 u32 s_bad_segments;
436};
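/*
 * Editor's sketch of the space accounting invariant documented above.
 * Such a debug check is not part of the patch and the helper name is
 * made up.
 */
static inline void logfs_check_space_invariant(struct super_block *sb,
		u64 total_usable)
{
	struct logfs_super *super = sb->s_fs_info;

	/* s_used_bytes + s_free_bytes + s_speed_reserve == total usable size */
	WARN_ON(super->s_used_bytes + super->s_free_bytes +
			super->s_speed_reserve != total_usable);
}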
437
438/**
439 * struct logfs_inode - in-memory inode
440 *
441 * @vfs_inode: struct inode
442 * @li_data: data pointers
443 * @li_used_bytes: number of used bytes
444 * @li_freeing_list: used to track inodes currently being freed
445 * @li_flags: inode flags
446 * @li_refcount: number of internal (GC-induced) references
447 */
448struct logfs_inode {
449 struct inode vfs_inode;
450 u64 li_data[LOGFS_EMBEDDED_FIELDS];
451 u64 li_used_bytes;
452 struct list_head li_freeing_list;
453 struct logfs_block *li_block;
454 u32 li_flags;
455 u8 li_height;
456 int li_refcount;
457};
458
459#define journal_for_each(__i) for (__i = 0; __i < LOGFS_JOURNAL_SEGS; __i++)
460#define for_each_area(__i) for (__i = 0; __i < LOGFS_NO_AREAS; __i++)
461#define for_each_area_down(__i) for (__i = LOGFS_NO_AREAS - 1; __i >= 0; __i--)
462
463/* compr.c */
464int logfs_compress(void *in, void *out, size_t inlen, size_t outlen);
465int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen);
466int __init logfs_compr_init(void);
467void logfs_compr_exit(void);
468
469/* dev_bdev.c */
470#ifdef CONFIG_BLOCK
471int logfs_get_sb_bdev(struct file_system_type *type, int flags,
472 const char *devname, struct vfsmount *mnt);
473#else
474static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags,
475 const char *devname, struct vfsmount *mnt)
476{
477 return -ENODEV;
478}
479#endif
480
481/* dev_mtd.c */
482#ifdef CONFIG_MTD
483int logfs_get_sb_mtd(struct file_system_type *type, int flags,
484 int mtdnr, struct vfsmount *mnt);
485#else
486static inline int logfs_get_sb_mtd(struct file_system_type *type, int flags,
487 int mtdnr, struct vfsmount *mnt)
488{
489 return -ENODEV;
490}
491#endif
492
493/* dir.c */
494extern const struct inode_operations logfs_symlink_iops;
495extern const struct inode_operations logfs_dir_iops;
496extern const struct file_operations logfs_dir_fops;
497int logfs_replay_journal(struct super_block *sb);
498
499/* file.c */
500extern const struct inode_operations logfs_reg_iops;
501extern const struct file_operations logfs_reg_fops;
502extern const struct address_space_operations logfs_reg_aops;
503int logfs_readpage(struct file *file, struct page *page);
504int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
505 unsigned long arg);
506int logfs_fsync(struct file *file, struct dentry *dentry, int datasync);
507
508/* gc.c */
509u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec);
510void logfs_gc_pass(struct super_block *sb);
511int logfs_check_areas(struct super_block *sb);
512int logfs_init_gc(struct super_block *sb);
513void logfs_cleanup_gc(struct super_block *sb);
514
515/* inode.c */
516extern const struct super_operations logfs_super_operations;
517struct inode *logfs_iget(struct super_block *sb, ino_t ino);
518struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *cookie);
519void logfs_safe_iput(struct inode *inode, int cookie);
520struct inode *logfs_new_inode(struct inode *dir, int mode);
521struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino);
522struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino);
523int logfs_init_inode_cache(void);
524void logfs_destroy_inode_cache(void);
525void destroy_meta_inode(struct inode *inode);
526void logfs_set_blocks(struct inode *inode, u64 no);
527/* these logically belong into inode.c but actually reside in readwrite.c */
528int logfs_read_inode(struct inode *inode);
529int __logfs_write_inode(struct inode *inode, long flags);
530void logfs_delete_inode(struct inode *inode);
531void logfs_clear_inode(struct inode *inode);
532
533/* journal.c */
534void logfs_write_anchor(struct super_block *sb);
535int logfs_init_journal(struct super_block *sb);
536void logfs_cleanup_journal(struct super_block *sb);
537int write_alias_journal(struct super_block *sb, u64 ino, u64 bix,
538 level_t level, int child_no, __be64 val);
539void do_logfs_journal_wl_pass(struct super_block *sb);
540
541/* readwrite.c */
542pgoff_t logfs_pack_index(u64 bix, level_t level);
543void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level);
544int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
545 loff_t bix, long flags, struct shadow_tree *shadow_tree);
546int logfs_readpage_nolock(struct page *page);
547int logfs_write_buf(struct inode *inode, struct page *page, long flags);
548int logfs_delete(struct inode *inode, pgoff_t index,
549 struct shadow_tree *shadow_tree);
550int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
551 gc_level_t gc_level, long flags);
552int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix,
553 gc_level_t gc_level);
554int logfs_truncate(struct inode *inode, u64 size);
555u64 logfs_seek_hole(struct inode *inode, u64 bix);
556u64 logfs_seek_data(struct inode *inode, u64 bix);
557int logfs_open_segfile(struct super_block *sb);
558int logfs_init_rw(struct super_block *sb);
559void logfs_cleanup_rw(struct super_block *sb);
560void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta);
561void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta);
562void logfs_write_block(struct logfs_block *block, long flags);
563int logfs_write_obj_aliases_pagecache(struct super_block *sb);
564void logfs_get_segment_entry(struct super_block *sb, u32 segno,
565 struct logfs_segment_entry *se);
566void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment);
567void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec,
568 gc_level_t gc_level);
569void logfs_set_segment_reserved(struct super_block *sb, u32 segno);
570void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec);
571struct logfs_block *__alloc_block(struct super_block *sb,
572 u64 ino, u64 bix, level_t level);
573void __free_block(struct super_block *sb, struct logfs_block *block);
574void btree_write_block(struct logfs_block *block);
575void initialize_block_counters(struct page *page, struct logfs_block *block,
576 __be64 *array, int page_is_empty);
577int logfs_exist_block(struct inode *inode, u64 bix);
578int get_page_reserve(struct inode *inode, struct page *page);
579extern struct logfs_block_ops indirect_block_ops;
580
581/* segment.c */
582int logfs_erase_segment(struct super_block *sb, u32 ofs, int ensure_erase);
583int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf);
584int logfs_segment_read(struct inode *inode, struct page *page, u64 ofs, u64 bix,
585 level_t level);
586int logfs_segment_write(struct inode *inode, struct page *page,
587 struct logfs_shadow *shadow);
588int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow);
589int logfs_load_object_aliases(struct super_block *sb,
590 struct logfs_obj_alias *oa, int count);
591void move_page_to_btree(struct page *page);
592int logfs_init_mapping(struct super_block *sb);
593void logfs_sync_area(struct logfs_area *area);
594void logfs_sync_segments(struct super_block *sb);
595void freeseg(struct super_block *sb, u32 segno);
596
597/* area handling */
598int logfs_init_areas(struct super_block *sb);
599void logfs_cleanup_areas(struct super_block *sb);
600int logfs_open_area(struct logfs_area *area, size_t bytes);
601void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
602 int use_filler);
603
604static inline void logfs_buf_write(struct logfs_area *area, u64 ofs,
605 void *buf, size_t len)
606{
607 __logfs_buf_write(area, ofs, buf, len, 0);
608}
609
610static inline void logfs_buf_recover(struct logfs_area *area, u64 ofs,
611 void *buf, size_t len)
612{
613 __logfs_buf_write(area, ofs, buf, len, 1);
614}
615
616/* super.c */
617struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index);
618void emergency_read_end(struct page *page);
619void logfs_crash_dump(struct super_block *sb);
620void *memchr_inv(const void *s, int c, size_t n);
621int logfs_statfs(struct dentry *dentry, struct kstatfs *stats);
622int logfs_get_sb_device(struct file_system_type *type, int flags,
623 struct mtd_info *mtd, struct block_device *bdev,
624 const struct logfs_device_ops *devops, struct vfsmount *mnt);
625int logfs_check_ds(struct logfs_disk_super *ds);
626int logfs_write_sb(struct super_block *sb);
627
628static inline struct logfs_super *logfs_super(struct super_block *sb)
629{
630 return sb->s_fs_info;
631}
632
633static inline struct logfs_inode *logfs_inode(struct inode *inode)
634{
635 return container_of(inode, struct logfs_inode, vfs_inode);
636}
637
638static inline void logfs_set_ro(struct super_block *sb)
639{
640 logfs_super(sb)->s_flags |= LOGFS_SB_FLAG_RO;
641}
642
643#define LOGFS_BUG(sb) do { \
644 struct super_block *__sb = sb; \
645 logfs_crash_dump(__sb); \
646 logfs_super(__sb)->s_flags |= LOGFS_SB_FLAG_RO; \
647 BUG(); \
648} while (0)
649
650#define LOGFS_BUG_ON(condition, sb) \
651 do { if (unlikely(condition)) LOGFS_BUG((sb)); } while (0)
652
653static inline __be32 logfs_crc32(void *data, size_t len, size_t skip)
654{
655 return cpu_to_be32(crc32(~0, data+skip, len-skip));
656}
657
658static inline u8 logfs_type(struct inode *inode)
659{
660 return (inode->i_mode >> 12) & 15;
661}
662
663static inline pgoff_t logfs_index(struct super_block *sb, u64 pos)
664{
665 return pos >> sb->s_blocksize_bits;
666}
667
668static inline u64 dev_ofs(struct super_block *sb, u32 segno, u32 ofs)
669{
670 return ((u64)segno << logfs_super(sb)->s_segshift) + ofs;
671}
672
673static inline u32 seg_no(struct super_block *sb, u64 ofs)
674{
675 return ofs >> logfs_super(sb)->s_segshift;
676}
677
678static inline u32 seg_ofs(struct super_block *sb, u64 ofs)
679{
680 return ofs & logfs_super(sb)->s_segmask;
681}
682
683static inline u64 seg_align(struct super_block *sb, u64 ofs)
684{
685 return ofs & ~logfs_super(sb)->s_segmask;
686}
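/*
 * Editor's sketch: dev_ofs() and the three helpers above are mutual
 * inverses.  An illustrative self-check, not part of the patch:
 */
static inline void logfs_ofs_helpers_example(struct super_block *sb, u32 segno)
{
	u64 ofs = dev_ofs(sb, segno, 100);	/* 100 bytes into the segment */

	BUG_ON(seg_no(sb, ofs) != segno);
	BUG_ON(seg_ofs(sb, ofs) != 100);
	BUG_ON(seg_align(sb, ofs) != dev_ofs(sb, segno, 0));
}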
687
688static inline struct logfs_block *logfs_block(struct page *page)
689{
690 return (void *)page->private;
691}
692
693static inline level_t shrink_level(gc_level_t __level)
694{
695 u8 level = (__force u8)__level;
696
697 if (level >= LOGFS_MAX_LEVELS)
698 level -= LOGFS_MAX_LEVELS;
699 return (__force level_t)level;
700}
701
702static inline gc_level_t expand_level(u64 ino, level_t __level)
703{
704 u8 level = (__force u8)__level;
705
706 if (ino == LOGFS_INO_MASTER) {
707 /* ifile has separate areas */
708 level += LOGFS_MAX_LEVELS;
709 }
710 return (__force gc_level_t)level;
711}
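/*
 * Editor's sketch of the level mapping: regular files use areas 0-5, the
 * ifile uses areas 6-11 (see logfs_abi.h).  Illustrative only, the helper
 * name is made up:
 */
static inline void logfs_level_mapping_example(void)
{
	gc_level_t gc = expand_level(LOGFS_INO_MASTER, LEVEL(2));

	BUG_ON((__force u8)gc != 2 + LOGFS_MAX_LEVELS);	/* ifile i2: area 8 */
	BUG_ON(shrink_level(gc) != LEVEL(2));	/* still a level-2 block */
}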
712
713static inline int logfs_block_shift(struct super_block *sb, level_t level)
714{
715 level = shrink_level((__force gc_level_t)level);
716 return (__force int)level * (sb->s_blocksize_bits - 3);
717}
718
719static inline u64 logfs_block_mask(struct super_block *sb, level_t level)
720{
721 return ~0ull << logfs_block_shift(sb, level);
722}
723
724static inline struct logfs_area *get_area(struct super_block *sb,
725 gc_level_t gc_level)
726{
727 return logfs_super(sb)->s_area[(__force u8)gc_level];
728}
729
730static inline void logfs_mempool_destroy(mempool_t *pool)
731{
732 if (pool)
733 mempool_destroy(pool);
734}
735
736#endif
diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h
new file mode 100644
index 000000000000..f674725663fe
--- /dev/null
+++ b/fs/logfs/logfs_abi.h
@@ -0,0 +1,629 @@
1/*
2 * fs/logfs/logfs_abi.h
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Public header for logfs.
9 */
10#ifndef FS_LOGFS_LOGFS_ABI_H
11#define FS_LOGFS_LOGFS_ABI_H
12
13/* For out-of-kernel compiles */
14#ifndef BUILD_BUG_ON
15#define BUILD_BUG_ON(condition) /**/
16#endif
17
18#define SIZE_CHECK(type, size) \
19static inline void check_##type(void) \
20{ \
21 BUILD_BUG_ON(sizeof(struct type) != (size)); \
22}
23
24/*
25 * Throughout the logfs code, we're constantly dealing with blocks at
26 * various positions or offsets. To remove confusion, we strictly
27 * distinguish between a "position" - the logical position within a
28 * file and an "offset" - the physical location within the device.
29 *
30 * Any usage of the term offset for a logical location or position for
31 * a physical one is a bug and should get fixed.
32 */
33
34/*
35 * Blocks are allocated in one of several segments depending on their
36 * level. The following levels are used:
37 * 0 - regular data block
38 * 1 - i1 indirect blocks
39 * 2 - i2 indirect blocks
40 * 3 - i3 indirect blocks
41 * 4 - i4 indirect blocks
42 * 5 - i5 indirect blocks
43 * 6 - ifile data blocks
44 * 7 - ifile i1 indirect blocks
45 * 8 - ifile i2 indirect blocks
46 * 9 - ifile i3 indirect blocks
47 * 10 - ifile i4 indirect blocks
48 * 11 - ifile i5 indirect blocks
49 * Potential levels to be used in the future:
50 * 12 - gc recycled blocks, long-lived data
51 * 13 - replacement blocks, short-lived data
52 *
53 * Levels 1-11 are necessary for robust gc operations and help separate
54 * short-lived metadata from longer-lived file data. In the future,
55 * file data should get separated into several segments based on simple
56 * heuristics. Old data recycled during gc operation is expected to be
57 * long-lived. New data is of uncertain life expectancy. New data
58 * used to replace older blocks in existing files is expected to be
59 * short-lived.
60 */
61
62
63/* Magic numbers. 64bit for superblock, 32bit for statfs f_type */
64#define LOGFS_MAGIC 0x7a3a8e5cb9d5bf67ull
65#define LOGFS_MAGIC_U32 0xc97e8168u
66
67/*
68 * Various blocksize related macros. Blocksize is currently fixed at 4KiB.
69 * Sooner or later that should become configurable and the macros replaced
70 * by something superblock-dependent. Pointers in indirect blocks are and
71 * will remain 64bit.
72 *
73 * LOGFS_BLOCKSIZE - self-explaining
74 * LOGFS_BLOCK_FACTOR - number of pointers per indirect block
75 * LOGFS_BLOCK_BITS - log2 of LOGFS_BLOCK_FACTOR, used for shifts
76 */
77#define LOGFS_BLOCKSIZE (4096ull)
78#define LOGFS_BLOCK_FACTOR (LOGFS_BLOCKSIZE / sizeof(u64))
79#define LOGFS_BLOCK_BITS (9)
80
81/*
82 * Number of blocks at various levels of indirection. There are 16 direct
83 * block pointers plus a single indirect pointer.
84 */
85#define I0_BLOCKS (16)
86#define I1_BLOCKS LOGFS_BLOCK_FACTOR
87#define I2_BLOCKS (LOGFS_BLOCK_FACTOR * I1_BLOCKS)
88#define I3_BLOCKS (LOGFS_BLOCK_FACTOR * I2_BLOCKS)
89#define I4_BLOCKS (LOGFS_BLOCK_FACTOR * I3_BLOCKS)
90#define I5_BLOCKS (LOGFS_BLOCK_FACTOR * I4_BLOCKS)
91
92#define INDIRECT_INDEX I0_BLOCKS
93#define LOGFS_EMBEDDED_FIELDS (I0_BLOCKS + 1)
94
95/*
96 * Sizes at which files require another level of indirection. Files smaller
97 * than LOGFS_EMBEDDED_SIZE can be completely stored in the inode itself,
98 * similar to ext2 fast symlinks.
99 *
100 * Data at a position smaller than LOGFS_I0_SIZE is accessed through the
101 * direct pointers, else through the 1x indirect pointer and so forth.
102 */
103#define LOGFS_EMBEDDED_SIZE (LOGFS_EMBEDDED_FIELDS * sizeof(u64))
104#define LOGFS_I0_SIZE (I0_BLOCKS * LOGFS_BLOCKSIZE)
105#define LOGFS_I1_SIZE (I1_BLOCKS * LOGFS_BLOCKSIZE)
106#define LOGFS_I2_SIZE (I2_BLOCKS * LOGFS_BLOCKSIZE)
107#define LOGFS_I3_SIZE (I3_BLOCKS * LOGFS_BLOCKSIZE)
108#define LOGFS_I4_SIZE (I4_BLOCKS * LOGFS_BLOCKSIZE)
109#define LOGFS_I5_SIZE (I5_BLOCKS * LOGFS_BLOCKSIZE)
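/*
 * Editor's note, worked numbers for the fixed 4KiB blocksize (512 pointers
 * per indirect block): LOGFS_EMBEDDED_SIZE = 17 * 8 = 136 bytes,
 * LOGFS_I0_SIZE = 16 * 4KiB = 64KiB, LOGFS_I1_SIZE = 512 * 4KiB = 2MiB,
 * LOGFS_I2_SIZE = 1GiB, LOGFS_I3_SIZE = 512GiB; each further level
 * multiplies the limit by LOGFS_BLOCK_FACTOR.
 */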
110
111/*
112 * Each indirect block pointer must have this flag set, if all block pointers
113 * behind it are set, i.e. there is no hole hidden in the shadow of this
114 * indirect block pointer.
115 */
116#define LOGFS_FULLY_POPULATED (1ULL << 63)
117#define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED)
118
119/*
120 * LogFS needs to separate data into levels. Each level is defined as the
121 * maximal possible distance from the master inode (inode of the inode file).
122 * Data blocks reside on level 0, 1x indirect block on level 1, etc.
123 * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11.
124 * This effort is necessary to guarantee garbage collection to always make
125 * progress.
126 *
127 * LOGFS_MAX_INDIRECT is the maximal indirection through indirect blocks,
128 * LOGFS_MAX_LEVELS is one more for the actual data level of a file. It is
129 * the maximal number of levels for one file.
130 * LOGFS_NO_AREAS is twice that, as the inode file and regular files are
131 * effectively stacked on top of each other.
132 */
133#define LOGFS_MAX_INDIRECT (5)
134#define LOGFS_MAX_LEVELS (LOGFS_MAX_INDIRECT + 1)
135#define LOGFS_NO_AREAS (2 * LOGFS_MAX_LEVELS)
136
137/* Maximum size of filenames */
138#define LOGFS_MAX_NAMELEN (255)
139
140/* Number of segments in the primary journal. */
141#define LOGFS_JOURNAL_SEGS (16)
142
143/* Maximum number of free/erased/etc. segments in journal entries */
144#define MAX_CACHED_SEGS (64)
145
146
147/*
148 * LOGFS_OBJECT_HEADERSIZE is the size of a single header in the object store,
149 * LOGFS_MAX_OBJECTSIZE the size of the largest possible object, including
150 * its header,
151 * LOGFS_SEGMENT_RESERVE is the amount of space reserved for each segment for
152 * its segment header and the padded space at the end when no further objects
153 * fit.
154 */
155#define LOGFS_OBJECT_HEADERSIZE (0x1c)
156#define LOGFS_SEGMENT_HEADERSIZE (0x18)
157#define LOGFS_MAX_OBJECTSIZE (LOGFS_OBJECT_HEADERSIZE + LOGFS_BLOCKSIZE)
158#define LOGFS_SEGMENT_RESERVE \
159 (LOGFS_SEGMENT_HEADERSIZE + LOGFS_MAX_OBJECTSIZE - 1)
160
161/*
162 * Segment types:
163 * SEG_SUPER - segment holding a superblock
164 * SEG_JOURNAL - journal segment
165 * SEG_OSTORE - object store segment
166 */
167enum {
168 SEG_SUPER = 0x01,
169 SEG_JOURNAL = 0x02,
170 SEG_OSTORE = 0x03,
171};
172
173/**
174 * struct logfs_segment_header - per-segment header in the ostore
175 *
176 * @crc: crc32 of header (there is no data)
177 * @pad: unused, must be 0
178 * @type: segment type, see above
179 * @level: GC level for all objects in this segment
180 * @segno: segment number
181 * @ec: erase count for this segment
182 * @gec: global erase count at time of writing
183 */
184struct logfs_segment_header {
185 __be32 crc;
186 __be16 pad;
187 __u8 type;
188 __u8 level;
189 __be32 segno;
190 __be32 ec;
191 __be64 gec;
192};
193
194SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE);
195
196#define LOGFS_FEATURES_INCOMPAT (0ull)
197#define LOGFS_FEATURES_RO_COMPAT (0ull)
198#define LOGFS_FEATURES_COMPAT (0ull)
199
200/**
201 * struct logfs_disk_super - on-medium superblock
202 *
203 * @ds_magic: magic number, must equal LOGFS_MAGIC
204 * @ds_crc: crc32 of structure starting with the next field
205 * @ds_ifile_levels: maximum number of levels for ifile
206 * @ds_iblock_levels: maximum number of levels for regular files
207 * @ds_data_levels: number of separate levels for data
208 * @pad0: reserved, must be 0
209 * @ds_feature_incompat: incompatible filesystem features
210 * @ds_feature_ro_compat: read-only compatible filesystem features
211 * @ds_feature_compat: compatible filesystem features
212 * @ds_feature_flags: feature flags
213 * @ds_segment_shift: log2 of segment size
214 * @ds_block_shift: log2 of block size
215 * @ds_write_shift: log2 of write size
216 * @pad1: reserved, must be 0
217 * @ds_journal_seg: segments used by primary journal
218 * @ds_root_reserve: bytes reserved for the superuser
219 * @ds_speed_reserve: bytes reserved to speed up GC
220 * @ds_bad_seg_reserve: number of segments reserved to handle bad blocks
221 * @pad2: reserved, must be 0
222 * @pad3: reserved, must be 0
223 *
224 * Contains only read-only fields. Read-write fields, like the amount of
225 * used space, are tracked in the dynamic superblock stored in the journal.
226 */
227struct logfs_disk_super {
228 struct logfs_segment_header ds_sh;
229 __be64 ds_magic;
230
231 __be32 ds_crc;
232 __u8 ds_ifile_levels;
233 __u8 ds_iblock_levels;
234 __u8 ds_data_levels;
235 __u8 ds_segment_shift;
236 __u8 ds_block_shift;
237 __u8 ds_write_shift;
238 __u8 pad0[6];
239
240 __be64 ds_filesystem_size;
241 __be32 ds_segment_size;
242 __be32 ds_bad_seg_reserve;
243
244 __be64 ds_feature_incompat;
245 __be64 ds_feature_ro_compat;
246
247 __be64 ds_feature_compat;
248 __be64 ds_feature_flags;
249
250 __be64 ds_root_reserve;
251 __be64 ds_speed_reserve;
252
253 __be32 ds_journal_seg[LOGFS_JOURNAL_SEGS];
254
255 __be64 ds_super_ofs[2];
256 __be64 pad3[8];
257};
258
259SIZE_CHECK(logfs_disk_super, 256);
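/*
 * Editor's note: "starting with the next field" means the crc covers
 * bytes [36, 256) of the structure: 24 bytes of segment header, 8 bytes
 * of magic and the 4-byte crc itself are skipped.  With logfs_crc32()
 * from logfs.h this would look roughly like (a sketch; the actual call
 * site lives in super.c and is not part of this hunk):
 *
 *	ds->ds_crc = logfs_crc32(ds, sizeof(*ds),
 *			LOGFS_SEGMENT_HEADERSIZE + 12);
 */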
260
261/*
262 * Object types:
263 * OBJ_BLOCK - Data or indirect block
264 * OBJ_INODE - Inode
265 * OBJ_DENTRY - Dentry
266 */
267enum {
268 OBJ_BLOCK = 0x04,
269 OBJ_INODE = 0x05,
270 OBJ_DENTRY = 0x06,
271};
272
273/**
274 * struct logfs_object_header - per-object header in the ostore
275 *
276 * @crc: crc32 of header, excluding data_crc
277 * @len: length of data
278 * @type: object type, see above
279 * @compr: compression type
280 * @ino: inode number
281 * @bix: block index
282 * @data_crc: crc32 of payload
283 */
284struct logfs_object_header {
285 __be32 crc;
286 __be16 len;
287 __u8 type;
288 __u8 compr;
289 __be64 ino;
290 __be64 bix;
291 __be32 data_crc;
292} __attribute__((packed));
293
294SIZE_CHECK(logfs_object_header, LOGFS_OBJECT_HEADERSIZE);
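/*
 * Editor's note: @crc covers the header fields between crc and data_crc,
 * i.e. bytes [4, 24) of the 28-byte header.  A sketch using logfs_crc32()
 * from logfs.h (the real call sites are in segment.c, not in this hunk):
 *
 *	oh->crc = logfs_crc32(oh, LOGFS_OBJECT_HEADERSIZE - 4, 4);
 *	oh->data_crc = logfs_crc32(data, len, 0);
 */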
295
296/*
297 * Reserved inode numbers:
298 * LOGFS_INO_MASTER - master inode (for inode file)
299 * LOGFS_INO_ROOT - root directory
300 * LOGFS_INO_SEGFILE - per-segment used bytes and erase count
301 */
302enum {
303 LOGFS_INO_MAPPING = 0x00,
304 LOGFS_INO_MASTER = 0x01,
305 LOGFS_INO_ROOT = 0x02,
306 LOGFS_INO_SEGFILE = 0x03,
307 LOGFS_RESERVED_INOS = 0x10,
308};
309
310/*
311 * Inode flags. High bits should never be written to the medium. They are
312 * reserved for in-memory usage.
313 * Low bits should either remain in sync with the corresponding FS_*_FL or
314 * reuse slots that obviously don't make sense for logfs.
315 *
316 * LOGFS_IF_DIRTY Inode must be written back
317 * LOGFS_IF_ZOMBIE Inode has been deleted
318 * LOGFS_IF_STILLBORN -ENOSPC happened when creating inode
319 */
320#define LOGFS_IF_COMPRESSED 0x00000004 /* == FS_COMPR_FL */
321#define LOGFS_IF_DIRTY 0x20000000
322#define LOGFS_IF_ZOMBIE 0x40000000
323#define LOGFS_IF_STILLBORN 0x80000000
324
325/* Flags available to chattr */
326#define LOGFS_FL_USER_VISIBLE (LOGFS_IF_COMPRESSED)
327#define LOGFS_FL_USER_MODIFIABLE (LOGFS_IF_COMPRESSED)
328/* Flags inherited from parent directory on file/directory creation */
329#define LOGFS_FL_INHERITED (LOGFS_IF_COMPRESSED)
330
331/**
332 * struct logfs_disk_inode - on-medium inode
333 *
334 * @di_mode: file mode
335 * @di_pad: reserved, must be 0
336 * @di_flags: inode flags, see above
337 * @di_uid: user id
338 * @di_gid: group id
339 * @di_ctime: change time
340 * @di_mtime: modify time
341 * @di_refcount: reference count (aka nlink or link count)
342 * @di_generation: inode generation, for nfs
343 * @di_used_bytes: number of bytes used
344 * @di_size: file size
345 * @di_data: data pointers
346 */
347struct logfs_disk_inode {
348 __be16 di_mode;
349 __u8 di_height;
350 __u8 di_pad;
351 __be32 di_flags;
352 __be32 di_uid;
353 __be32 di_gid;
354
355 __be64 di_ctime;
356 __be64 di_mtime;
357
358 __be64 di_atime;
359 __be32 di_refcount;
360 __be32 di_generation;
361
362 __be64 di_used_bytes;
363 __be64 di_size;
364
365 __be64 di_data[LOGFS_EMBEDDED_FIELDS];
366};
367
368SIZE_CHECK(logfs_disk_inode, 200);
369
370#define INODE_POINTER_OFS \
371 (offsetof(struct logfs_disk_inode, di_data) / sizeof(__be64))
372#define INODE_USED_OFS \
373 (offsetof(struct logfs_disk_inode, di_used_bytes) / sizeof(__be64))
374#define INODE_SIZE_OFS \
375 (offsetof(struct logfs_disk_inode, di_size) / sizeof(__be64))
376#define INODE_HEIGHT_OFS (0)
377
378/**
379 * struct logfs_disk_dentry - on-medium dentry structure
380 *
381 * @ino: inode number
382 * @namelen: length of file name
383 * @type: file type, identical to bits 12..15 of mode
384 * @name: file name
385 */
386/* FIXME: add 6 bytes of padding to remove the __packed */
387struct logfs_disk_dentry {
388 __be64 ino;
389 __be16 namelen;
390 __u8 type;
391 __u8 name[LOGFS_MAX_NAMELEN];
392} __attribute__((packed));
393
394SIZE_CHECK(logfs_disk_dentry, 266);
395
396#define RESERVED 0xffffffff
397#define BADSEG 0xffffffff
398/**
399 * struct logfs_segment_entry - segment file entry
400 *
401 * @ec_level: erase count and level
402 * @valid: number of valid bytes
403 *
404 * Segment file contains one entry for every segment. ec_level contains the
405 * erasecount in the upper 28 bits and the level in the lower 4 bits. An
406 * ec_level of BADSEG (-1) identifies bad segments. valid contains the number
407 * of valid bytes or RESERVED (-1 again) if the segment is used for either the
408 * superblock or the journal, or when the segment is bad.
409 */
410struct logfs_segment_entry {
411 __be32 ec_level;
412 __be32 valid;
413};
414
415SIZE_CHECK(logfs_segment_entry, 8);
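/*
 * Editor's sketch of the ec_level packing described above: erase count in
 * the upper 28 bits, level in the lower 4.  The helper names are made up;
 * both take the host-order value, i.e. after be32_to_cpu().
 */
static inline u32 logfs_ec_of(u32 ec_level)
{
	return ec_level >> 4;		/* erase count */
}

static inline u8 logfs_level_of(u32 ec_level)
{
	return ec_level & 0xf;		/* gc level */
}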
416
417/**
418 * struct logfs_journal_header - header for journal entries (JEs)
419 *
420 * @h_crc: crc32 of journal entry
421 * @h_len: length of compressed journal entry,
422 * not including header
423 * @h_datalen: length of uncompressed data
424 * @h_type: JE type
425 * @h_compr: compression type
426 * @h_pad: reserved
427 */
428struct logfs_journal_header {
429 __be32 h_crc;
430 __be16 h_len;
431 __be16 h_datalen;
432 __be16 h_type;
433 __u8 h_compr;
434 __u8 h_pad[5];
435};
436
437SIZE_CHECK(logfs_journal_header, 16);
438
439/*
440 * Life expectancy of data.
441 * VIM_DEFAULT - default vim
442 * VIM_SEGFILE - for segment file only - very short-living
443 * VIM_GC - GC'd data - likely long-living (reserved, not yet used)
444 */
445enum logfs_vim {
446 VIM_DEFAULT = 0,
447 VIM_SEGFILE = 1,
448};
449
450/**
451 * struct logfs_je_area - wbuf header
452 *
453 * @segno: segment number of area
454 * @used_bytes: number of bytes already used
455 * @gc_level: GC level
456 * @vim: life expectancy of data
457 *
458 * "Areas" are segments currently being used for writing. There is at least
459 * one area per GC level. Several may be used to separate long-living from
460 * short-living data. If an area with unknown vim is encountered, it can
461 * simply be closed.
462 * The write buffer immediately follows this header.
463 */
464struct logfs_je_area {
465 __be32 segno;
466 __be32 used_bytes;
467 __u8 gc_level;
468 __u8 vim;
469} __attribute__((packed));
470
471SIZE_CHECK(logfs_je_area, 10);
472
473#define MAX_JOURNAL_HEADER \
474 (sizeof(struct logfs_journal_header) + sizeof(struct logfs_je_area))
475
476/**
477 * struct logfs_je_dynsb - dynamic superblock
478 *
479 * @ds_gec: global erase count
480 * @ds_sweeper: current position of GC "sweeper"
481 * @ds_rename_dir: source directory ino (see dir.c documentation)
482 * @ds_rename_pos: position of source dd (see dir.c documentation)
483 * @ds_victim_ino: victims of incomplete dir operation (see dir.c)
484 * @ds_victim_parent: parent inode of victim (see dir.c)
485 * @ds_used_bytes: number of used bytes
486 */
487struct logfs_je_dynsb {
488 __be64 ds_gec;
489 __be64 ds_sweeper;
490
491 __be64 ds_rename_dir;
492 __be64 ds_rename_pos;
493
494 __be64 ds_victim_ino;
495 __be64 ds_victim_parent; /* XXX */
496
497 __be64 ds_used_bytes;
498 __be32 ds_generation;
499 __be32 pad;
500};
501
502SIZE_CHECK(logfs_je_dynsb, 64);
503
504/**
505 * struct logfs_je_anchor - anchor of filesystem tree, aka master inode
506 *
507 * @da_size: size of inode file
508 * @da_last_ino: last created inode
509 * @da_used_bytes: number of bytes used
510 * @da_data: data pointers
511 */
512struct logfs_je_anchor {
513 __be64 da_size;
514 __be64 da_last_ino;
515
516 __be64 da_used_bytes;
517 u8 da_height;
518 u8 pad[7];
519
520 __be64 da_data[LOGFS_EMBEDDED_FIELDS];
521};
522
523SIZE_CHECK(logfs_je_anchor, 168);
524
525/**
526 * struct logfs_je_spillout - spillout entry (from 1st to 2nd journal)
527 *
528 * @so_segment: segments used for 2nd journal
529 *
530 * Length of the array is given by h_len field in the header.
531 */
532struct logfs_je_spillout {
533 __be64 so_segment[0];
534};
535
536SIZE_CHECK(logfs_je_spillout, 0);
537
538/**
539 * struct logfs_je_journal_ec - erase counts for all journal segments
540 *
541 * @ec: erase count
542 *
543 * Length of the array is given by h_len field in the header.
544 */
545struct logfs_je_journal_ec {
546 __be32 ec[0];
547};
548
549SIZE_CHECK(logfs_je_journal_ec, 0);
550
551/**
552 * struct logfs_je_free_segments - list of free segments with erase count
553 */
554struct logfs_je_free_segments {
555 __be32 segno;
556 __be32 ec;
557};
558
559SIZE_CHECK(logfs_je_free_segments, 8);
560
561/**
562 * struct logfs_seg_alias - list of segment aliases
563 */
564struct logfs_seg_alias {
565 __be32 old_segno;
566 __be32 new_segno;
567};
568
569SIZE_CHECK(logfs_seg_alias, 8);
570
571/**
572 * struct logfs_obj_alias - list of object aliases
573 */
574struct logfs_obj_alias {
575 __be64 ino;
576 __be64 bix;
577 __be64 val;
578 u8 level;
579 u8 pad[5];
580 __be16 child_no;
581};
582
583SIZE_CHECK(logfs_obj_alias, 32);
584
585/**
586 * Compression types.
587 *
588 * COMPR_NONE - uncompressed
589 * COMPR_ZLIB - compressed with zlib
590 */
591enum {
592 COMPR_NONE = 0,
593 COMPR_ZLIB = 1,
594};
595
596/*
597 * Journal entries come in groups of 16. First group contains unique
598 * entries, next groups contain one entry per level
599 *
600 * JE_FIRST - smallest possible journal entry number
601 *
602 * JEG_BASE - base group, containing unique entries
603 * JE_COMMIT - commit entry, validates all previous entries
604 * JE_DYNSB - dynamic superblock, anything that ought to be in the
605 * superblock but cannot because it is read-write data
606 * JE_ANCHOR - anchor aka master inode aka inode file's inode
607 * JE_ERASECOUNT - erase counts for all journal segments
608 * JE_SPILLOUT - unused
609 * JE_OBJ_ALIAS - object aliases
610 * JE_AREA - area description
611 *
612 * JE_LAST - largest possible journal entry number
613 */
614enum {
615 JE_FIRST = 0x01,
616
617 JEG_BASE = 0x00,
618 JE_COMMIT = 0x02,
619 JE_DYNSB = 0x03,
620 JE_ANCHOR = 0x04,
621 JE_ERASECOUNT = 0x05,
622 JE_SPILLOUT = 0x06,
623 JE_OBJ_ALIAS = 0x0d,
624 JE_AREA = 0x0e,
625
626 JE_LAST = 0x0e,
627};
628
629#endif
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
new file mode 100644
index 000000000000..3159db6958e5
--- /dev/null
+++ b/fs/logfs/readwrite.c
@@ -0,0 +1,2267 @@
1/*
2 * fs/logfs/readwrite.c
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 *
9 * Actually contains eight sets of very similar functions:
10 * read read blocks from a file
11 * seek_hole find next hole
12 * seek_data find next data block
13 * valid check whether a block still belongs to a file
14 * write write blocks to a file
15 * delete delete a block (for directories and ifile)
16 * rewrite move existing blocks of a file to a new location (gc helper)
17 * truncate truncate a file
18 */
19#include "logfs.h"
20#include <linux/sched.h>
21#include <linux/slab.h>
22
23static u64 adjust_bix(u64 bix, level_t level)
24{
25 switch (level) {
26 case 0:
27 return bix;
28 case LEVEL(1):
29 return max_t(u64, bix, I0_BLOCKS);
30 case LEVEL(2):
31 return max_t(u64, bix, I1_BLOCKS);
32 case LEVEL(3):
33 return max_t(u64, bix, I2_BLOCKS);
34 case LEVEL(4):
35 return max_t(u64, bix, I3_BLOCKS);
36 case LEVEL(5):
37 return max_t(u64, bix, I4_BLOCKS);
38 default:
39 WARN_ON(1);
40 return bix;
41 }
42}
43
44static inline u64 maxbix(u8 height)
45{
46 return 1ULL << (LOGFS_BLOCK_BITS * height);
47}
48
49/**
50 * The inode address space is cut in two halves. Lower half belongs to data
51 * pages, upper half to indirect blocks. If the high bit (INDIRECT_BIT) is
52 * set, the actual block index (bix) and level can be derived from the page
53 * index.
54 *
55 * The lowest three bits of the block index are set to 0 after packing and
56 * unpacking. Since the lowest n bits (9 for 4KiB blocksize) are ignored
57 * anyway this is harmless.
58 */
59#define ARCH_SHIFT (BITS_PER_LONG - 32)
60#define INDIRECT_BIT (0x80000000UL << ARCH_SHIFT)
61#define LEVEL_SHIFT (28 + ARCH_SHIFT)
62static inline pgoff_t first_indirect_block(void)
63{
64 return INDIRECT_BIT | (1ULL << LEVEL_SHIFT);
65}
66
67pgoff_t logfs_pack_index(u64 bix, level_t level)
68{
69 pgoff_t index;
70
71 BUG_ON(bix >= INDIRECT_BIT);
72 if (level == 0)
73 return bix;
74
75 index = INDIRECT_BIT;
76 index |= (__force long)level << LEVEL_SHIFT;
77 index |= bix >> ((__force u8)level * LOGFS_BLOCK_BITS);
78 return index;
79}
80
81void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level)
82{
83 u8 __level;
84
85 if (!(index & INDIRECT_BIT)) {
86 *bix = index;
87 *level = 0;
88 return;
89 }
90
91 __level = (index & ~INDIRECT_BIT) >> LEVEL_SHIFT;
92 *level = LEVEL(__level);
93 *bix = (index << (__level * LOGFS_BLOCK_BITS)) & ~INDIRECT_BIT;
94 *bix = adjust_bix(*bix, *level);
95 return;
96}
97#undef ARCH_SHIFT
98#undef INDIRECT_BIT
99#undef LEVEL_SHIFT
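/*
 * Editor's sketch of a pack/unpack round trip, not part of the patch.
 * For a level-1 index the low LOGFS_BLOCK_BITS bits of bix are dropped on
 * packing, hence the adjust_bix() fixup on unpacking:
 */
static void __maybe_unused logfs_pack_index_example(void)
{
	u64 bix;
	level_t level;
	pgoff_t index = logfs_pack_index(0x1234, LEVEL(1));

	logfs_unpack_index(index, &bix, &level);
	BUG_ON(level != LEVEL(1));
	BUG_ON(bix != 0x1200);		/* low nine bits cleared */
}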
100
101/*
102 * Time is stored as nanoseconds since the epoch.
103 */
104static struct timespec be64_to_timespec(__be64 betime)
105{
106 return ns_to_timespec(be64_to_cpu(betime));
107}
108
109static __be64 timespec_to_be64(struct timespec tsp)
110{
111 return cpu_to_be64((u64)tsp.tv_sec * NSEC_PER_SEC + tsp.tv_nsec);
112}
113
114static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode *inode)
115{
116 struct logfs_inode *li = logfs_inode(inode);
117 int i;
118
119 inode->i_mode = be16_to_cpu(di->di_mode);
120 li->li_height = di->di_height;
121 li->li_flags = be32_to_cpu(di->di_flags);
122 inode->i_uid = be32_to_cpu(di->di_uid);
123 inode->i_gid = be32_to_cpu(di->di_gid);
124 inode->i_size = be64_to_cpu(di->di_size);
125 logfs_set_blocks(inode, be64_to_cpu(di->di_used_bytes));
126 inode->i_atime = be64_to_timespec(di->di_atime);
127 inode->i_ctime = be64_to_timespec(di->di_ctime);
128 inode->i_mtime = be64_to_timespec(di->di_mtime);
129 inode->i_nlink = be32_to_cpu(di->di_refcount);
130 inode->i_generation = be32_to_cpu(di->di_generation);
131
132 switch (inode->i_mode & S_IFMT) {
133 case S_IFSOCK: /* fall through */
134 case S_IFBLK: /* fall through */
135 case S_IFCHR: /* fall through */
136 case S_IFIFO:
137 inode->i_rdev = be64_to_cpu(di->di_data[0]);
138 break;
139 case S_IFDIR: /* fall through */
140 case S_IFREG: /* fall through */
141 case S_IFLNK:
142 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
143 li->li_data[i] = be64_to_cpu(di->di_data[i]);
144 break;
145 default:
146 BUG();
147 }
148}
149
150static void logfs_inode_to_disk(struct inode *inode, struct logfs_disk_inode *di)
151{
152 struct logfs_inode *li = logfs_inode(inode);
153 int i;
154
155 di->di_mode = cpu_to_be16(inode->i_mode);
156 di->di_height = li->li_height;
157 di->di_pad = 0;
158 di->di_flags = cpu_to_be32(li->li_flags);
159 di->di_uid = cpu_to_be32(inode->i_uid);
160 di->di_gid = cpu_to_be32(inode->i_gid);
161 di->di_size = cpu_to_be64(i_size_read(inode));
162 di->di_used_bytes = cpu_to_be64(li->li_used_bytes);
163 di->di_atime = timespec_to_be64(inode->i_atime);
164 di->di_ctime = timespec_to_be64(inode->i_ctime);
165 di->di_mtime = timespec_to_be64(inode->i_mtime);
166 di->di_refcount = cpu_to_be32(inode->i_nlink);
167 di->di_generation = cpu_to_be32(inode->i_generation);
168
169 switch (inode->i_mode & S_IFMT) {
170 case S_IFSOCK: /* fall through */
171 case S_IFBLK: /* fall through */
172 case S_IFCHR: /* fall through */
173 case S_IFIFO:
174 di->di_data[0] = cpu_to_be64(inode->i_rdev);
175 break;
176 case S_IFDIR: /* fall through */
177 case S_IFREG: /* fall through */
178 case S_IFLNK:
179 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
180 di->di_data[i] = cpu_to_be64(li->li_data[i]);
181 break;
182 default:
183 BUG();
184 }
185}
186
187static void __logfs_set_blocks(struct inode *inode)
188{
189 struct super_block *sb = inode->i_sb;
190 struct logfs_inode *li = logfs_inode(inode);
191
192 inode->i_blocks = ULONG_MAX;
193 if (li->li_used_bytes >> sb->s_blocksize_bits < ULONG_MAX)
194 inode->i_blocks = ALIGN(li->li_used_bytes, 512) >> 9;
195}
196
197void logfs_set_blocks(struct inode *inode, u64 bytes)
198{
199 struct logfs_inode *li = logfs_inode(inode);
200
201 li->li_used_bytes = bytes;
202 __logfs_set_blocks(inode);
203}
204
205static void prelock_page(struct super_block *sb, struct page *page, int lock)
206{
207 struct logfs_super *super = logfs_super(sb);
208
209 BUG_ON(!PageLocked(page));
210 if (lock) {
211 BUG_ON(PagePreLocked(page));
212 SetPagePreLocked(page);
213 } else {
214 /* We are in GC path. */
215 if (PagePreLocked(page))
216 super->s_lock_count++;
217 else
218 SetPagePreLocked(page);
219 }
220}
221
222static void preunlock_page(struct super_block *sb, struct page *page, int lock)
223{
224 struct logfs_super *super = logfs_super(sb);
225
226 BUG_ON(!PageLocked(page));
227 if (lock)
228 ClearPagePreLocked(page);
229 else {
230 /* We are in GC path. */
231 BUG_ON(!PagePreLocked(page));
232 if (super->s_lock_count)
233 super->s_lock_count--;
234 else
235 ClearPagePreLocked(page);
236 }
237}
238
239/*
240 * Logfs is prone to an AB-BA deadlock where one task tries to acquire
241 * s_write_mutex with a locked page and GC tries to get that page while holding
242 * s_write_mutex.
243 * To solve this issue logfs will ignore the page lock iff the page in question
244 * is waiting for s_write_mutex. We annotate this fact by setting PG_pre_locked
245 * in addition to PG_locked.
246 */
247static void logfs_get_wblocks(struct super_block *sb, struct page *page,
248 int lock)
249{
250 struct logfs_super *super = logfs_super(sb);
251
252 if (page)
253 prelock_page(sb, page, lock);
254
255 if (lock) {
256 mutex_lock(&super->s_write_mutex);
257 logfs_gc_pass(sb);
258 /* FIXME: We also have to check for shadowed space
259 * and mempool fill grade */
260 }
261}
262
263static void logfs_put_wblocks(struct super_block *sb, struct page *page,
264 int lock)
265{
266 struct logfs_super *super = logfs_super(sb);
267
268 if (page)
269 preunlock_page(sb, page, lock);
270 /* Order matters - we must clear PG_pre_locked before releasing
271 * s_write_mutex or we could race against another task. */
272 if (lock)
273 mutex_unlock(&super->s_write_mutex);
274}
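/*
 * Editor's sketch of the calling convention for the pair above; the
 * function is made up and not part of the patch.  Writers pass lock=1 and
 * hold the page lock, the GC path passes lock=0 and relies on
 * PG_pre_locked instead.
 */
static void __maybe_unused logfs_wblocks_usage_example(struct inode *inode,
		struct page *page)
{
	struct super_block *sb = inode->i_sb;

	logfs_get_wblocks(sb, page, 1);	/* pre-locks page, takes s_write_mutex */
	/* ... modify the block behind page ... */
	logfs_put_wblocks(sb, page, 1);	/* clears PG_pre_locked, drops mutex */
}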
275
276static struct page *logfs_get_read_page(struct inode *inode, u64 bix,
277 level_t level)
278{
279 return find_or_create_page(inode->i_mapping,
280 logfs_pack_index(bix, level), GFP_NOFS);
281}
282
283static void logfs_put_read_page(struct page *page)
284{
285 unlock_page(page);
286 page_cache_release(page);
287}
288
289static void logfs_lock_write_page(struct page *page)
290{
291 int loop = 0;
292
293 while (unlikely(!trylock_page(page))) {
294 if (loop++ > 0x1000) {
295 /* Has been observed once so far... */
296 printk(KERN_ERR "stack at %p\n", &loop);
297 BUG();
298 }
299 if (PagePreLocked(page)) {
300 /* Holder of page lock is waiting for us, it
301 * is safe to use this page. */
302 break;
303 }
304 /* Some other process has this page locked and has
305 * nothing to do with us. Wait for it to finish.
306 */
307 schedule();
308 }
309 BUG_ON(!PageLocked(page));
310}
311
312static struct page *logfs_get_write_page(struct inode *inode, u64 bix,
313 level_t level)
314{
315 struct address_space *mapping = inode->i_mapping;
316 pgoff_t index = logfs_pack_index(bix, level);
317 struct page *page;
318 int err;
319
320repeat:
321 page = find_get_page(mapping, index);
322 if (!page) {
323 page = __page_cache_alloc(GFP_NOFS);
324 if (!page)
325 return NULL;
326 err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS);
327 if (unlikely(err)) {
328 page_cache_release(page);
329 if (err == -EEXIST)
330 goto repeat;
331 return NULL;
332 }
333 } else logfs_lock_write_page(page);
334 BUG_ON(!PageLocked(page));
335 return page;
336}
337
338static void logfs_unlock_write_page(struct page *page)
339{
340 if (!PagePreLocked(page))
341 unlock_page(page);
342}
343
344static void logfs_put_write_page(struct page *page)
345{
346 logfs_unlock_write_page(page);
347 page_cache_release(page);
348}
349
350static struct page *logfs_get_page(struct inode *inode, u64 bix, level_t level,
351 int rw)
352{
353 if (rw == READ)
354 return logfs_get_read_page(inode, bix, level);
355 else
356 return logfs_get_write_page(inode, bix, level);
357}
358
359static void logfs_put_page(struct page *page, int rw)
360{
361 if (rw == READ)
362 logfs_put_read_page(page);
363 else
364 logfs_put_write_page(page);
365}
366
367static unsigned long __get_bits(u64 val, int skip, int no)
368{
369 u64 ret = val;
370
371 ret >>= skip * no;
372 ret <<= 64 - no;
373 ret >>= 64 - no;
374 return ret;
375}
376
377static unsigned long get_bits(u64 val, level_t skip)
378{
379 return __get_bits(val, (__force int)skip, LOGFS_BLOCK_BITS);
380}
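/*
 * Editor's sketch: get_bits() selects the 9-bit slot index for one level
 * of the tree.  With bix = (1 << 18) | (1 << 9) | 1 the path through the
 * tree is slot 1 at every level.  Illustrative only:
 */
static inline void logfs_get_bits_example(void)
{
	u64 bix = (1ULL << 18) | (1ULL << 9) | 1;

	BUG_ON(get_bits(bix, LEVEL(0)) != 1);	/* slot in the i1 block */
	BUG_ON(get_bits(bix, LEVEL(1)) != 1);	/* slot in the i2 block */
	BUG_ON(get_bits(bix, LEVEL(2)) != 1);	/* slot in the i3 block */
}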
381
382static inline void init_shadow_tree(struct super_block *sb,
383 struct shadow_tree *tree)
384{
385 struct logfs_super *super = logfs_super(sb);
386
387 btree_init_mempool64(&tree->new, super->s_btree_pool);
388 btree_init_mempool64(&tree->old, super->s_btree_pool);
389}
390
391static void indirect_write_block(struct logfs_block *block)
392{
393 struct page *page;
394 struct inode *inode;
395 int ret;
396
397 page = block->page;
398 inode = page->mapping->host;
399 logfs_lock_write_page(page);
400 ret = logfs_write_buf(inode, page, 0);
401 logfs_unlock_write_page(page);
402 /*
403 * This needs some rework. Unless you want your filesystem to run
404 * completely synchronously (you don't), the filesystem will always
405 * report writes as 'successful' before the actual work has been
406 * done. The actual work gets done here and this is where any errors
407 * will show up. And there isn't much we can do about it, really.
408 *
409 * Some attempts to fix the errors (move from bad blocks, retry io,...)
410 * have already been done, so anything left should be either a broken
411 * device or a bug somewhere in logfs itself. Being relatively new,
412 * the odds currently favor a bug, so for now the line below isn't
413 * entirely tasteless.
414 */
415 BUG_ON(ret);
416}
417
418static void inode_write_block(struct logfs_block *block)
419{
420 struct inode *inode;
421 int ret;
422
423 inode = block->inode;
424 if (inode->i_ino == LOGFS_INO_MASTER)
425 logfs_write_anchor(inode->i_sb);
426 else {
427 ret = __logfs_write_inode(inode, 0);
428 /* see indirect_write_block comment */
429 BUG_ON(ret);
430 }
431}
432
433/*
434 * This silences a false, yet annoying gcc warning. I hate it when my editor
435 * jumps into bitops.h each time I recompile this file.
436 * TODO: Complain to gcc folks about this and upgrade compiler.
437 */
438static unsigned long fnb(const unsigned long *addr,
439 unsigned long size, unsigned long offset)
440{
441 return find_next_bit(addr, size, offset);
442}
443
444static __be64 inode_val0(struct inode *inode)
445{
446 struct logfs_inode *li = logfs_inode(inode);
447 u64 val;
448
449 /*
450 * Explicit shifting generates good code, but must match the format
451 * of the structure. Add some paranoia just in case.
452 */
453 BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_mode) != 0);
454 BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_height) != 2);
455 BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_flags) != 4);
456
457 val = (u64)inode->i_mode << 48 |
458 (u64)li->li_height << 40 |
459 (u64)li->li_flags;
460 return cpu_to_be64(val);
461}
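/*
 * A standalone sketch of the packing performed by inode_val0() above:
 * i_mode lands in bits 48..63, li_height in bits 40..47 and li_flags in
 * bits 0..31, mirroring the first fields of struct logfs_disk_inode.
 * Values below are made up for illustration.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint16_t mode = 0100644;	/* S_IFREG | 0644 */
	uint8_t height = 2;
	uint32_t flags = 0;
	uint64_t val = (uint64_t)mode << 48 |
			(uint64_t)height << 40 |
			(uint64_t)flags;

	printf("packed: %#018llx\n", (unsigned long long)val);
	printf("mode=%#o height=%u flags=%#x\n",
			(unsigned)(val >> 48),
			(unsigned)(val >> 40) & 0xff,
			(unsigned)(val & 0xffffffff));
	return 0;
}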
462
463static int inode_write_alias(struct super_block *sb,
464 struct logfs_block *block, write_alias_t *write_one_alias)
465{
466 struct inode *inode = block->inode;
467 struct logfs_inode *li = logfs_inode(inode);
468 unsigned long pos;
469 u64 ino, bix;
470 __be64 val;
471 level_t level;
472 int err;
473
474 for (pos = 0; ; pos++) {
475 pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
476 if (pos >= LOGFS_EMBEDDED_FIELDS + INODE_POINTER_OFS)
477 return 0;
478
479 switch (pos) {
480 case INODE_HEIGHT_OFS:
481 val = inode_val0(inode);
482 break;
483 case INODE_USED_OFS:
484 val = cpu_to_be64(li->li_used_bytes);
485 break;
486 case INODE_SIZE_OFS:
487 val = cpu_to_be64(i_size_read(inode));
488 break;
489 case INODE_POINTER_OFS ... INODE_POINTER_OFS + LOGFS_EMBEDDED_FIELDS - 1:
490 val = cpu_to_be64(li->li_data[pos - INODE_POINTER_OFS]);
491 break;
492 default:
493 BUG();
494 }
495
496 ino = LOGFS_INO_MASTER;
497 bix = inode->i_ino;
498 level = LEVEL(0);
499 err = write_one_alias(sb, ino, bix, level, pos, val);
500 if (err)
501 return err;
502 }
503}
504
505static int indirect_write_alias(struct super_block *sb,
506 struct logfs_block *block, write_alias_t *write_one_alias)
507{
508 unsigned long pos;
509 struct page *page = block->page;
510 u64 ino, bix;
511 __be64 *child, val;
512 level_t level;
513 int err;
514
515 for (pos = 0; ; pos++) {
516 pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
517 if (pos >= LOGFS_BLOCK_FACTOR)
518 return 0;
519
520 ino = page->mapping->host->i_ino;
521 logfs_unpack_index(page->index, &bix, &level);
522 child = kmap_atomic(page, KM_USER0);
523 val = child[pos];
524 kunmap_atomic(child, KM_USER0);
525 err = write_one_alias(sb, ino, bix, level, pos, val);
526 if (err)
527 return err;
528 }
529}
530
531int logfs_write_obj_aliases_pagecache(struct super_block *sb)
532{
533 struct logfs_super *super = logfs_super(sb);
534 struct logfs_block *block;
535 int err;
536
537 list_for_each_entry(block, &super->s_object_alias, alias_list) {
538 err = block->ops->write_alias(sb, block, write_alias_journal);
539 if (err)
540 return err;
541 }
542 return 0;
543}
544
545void __free_block(struct super_block *sb, struct logfs_block *block)
546{
547 BUG_ON(!list_empty(&block->item_list));
548 list_del(&block->alias_list);
549 mempool_free(block, logfs_super(sb)->s_block_pool);
550}
551
552static void inode_free_block(struct super_block *sb, struct logfs_block *block)
553{
554 struct inode *inode = block->inode;
555
556 logfs_inode(inode)->li_block = NULL;
557 __free_block(sb, block);
558}
559
560static void indirect_free_block(struct super_block *sb,
561 struct logfs_block *block)
562{
563 ClearPagePrivate(block->page);
564 block->page->private = 0;
565 __free_block(sb, block);
566}
567
568
569static struct logfs_block_ops inode_block_ops = {
570 .write_block = inode_write_block,
571 .free_block = inode_free_block,
572 .write_alias = inode_write_alias,
573};
574
575struct logfs_block_ops indirect_block_ops = {
576 .write_block = indirect_write_block,
577 .free_block = indirect_free_block,
578 .write_alias = indirect_write_alias,
579};
580
581struct logfs_block *__alloc_block(struct super_block *sb,
582 u64 ino, u64 bix, level_t level)
583{
584 struct logfs_super *super = logfs_super(sb);
585 struct logfs_block *block;
586
587 block = mempool_alloc(super->s_block_pool, GFP_NOFS);
588 memset(block, 0, sizeof(*block));
589 INIT_LIST_HEAD(&block->alias_list);
590 INIT_LIST_HEAD(&block->item_list);
591 block->sb = sb;
592 block->ino = ino;
593 block->bix = bix;
594 block->level = level;
595 return block;
596}
597
598static void alloc_inode_block(struct inode *inode)
599{
600 struct logfs_inode *li = logfs_inode(inode);
601 struct logfs_block *block;
602
603 if (li->li_block)
604 return;
605
606 block = __alloc_block(inode->i_sb, LOGFS_INO_MASTER, inode->i_ino, 0);
607 block->inode = inode;
608 li->li_block = block;
609 block->ops = &inode_block_ops;
610}
611
612void initialize_block_counters(struct page *page, struct logfs_block *block,
613 __be64 *array, int page_is_empty)
614{
615 u64 ptr;
616 int i, start;
617
618 block->partial = 0;
619 block->full = 0;
620 start = 0;
621 if (page->index < first_indirect_block()) {
622 /* Counters are pointless on level 0 */
623 return;
624 }
625 if (page->index == first_indirect_block()) {
626 /* Skip unused pointers */
627 start = I0_BLOCKS;
628 block->full = I0_BLOCKS;
629 }
630 if (!page_is_empty) {
631 for (i = start; i < LOGFS_BLOCK_FACTOR; i++) {
632 ptr = be64_to_cpu(array[i]);
633 if (ptr)
634 block->partial++;
635 if (ptr & LOGFS_FULLY_POPULATED)
636 block->full++;
637 }
638 }
639}
640
641static void alloc_data_block(struct inode *inode, struct page *page)
642{
643 struct logfs_block *block;
644 u64 bix;
645 level_t level;
646
647 if (PagePrivate(page))
648 return;
649
650 logfs_unpack_index(page->index, &bix, &level);
651 block = __alloc_block(inode->i_sb, inode->i_ino, bix, level);
652 block->page = page;
653 SetPagePrivate(page);
654 page->private = (unsigned long)block;
655 block->ops = &indirect_block_ops;
656}
657
658static void alloc_indirect_block(struct inode *inode, struct page *page,
659 int page_is_empty)
660{
661 struct logfs_block *block;
662 __be64 *array;
663
664 if (PagePrivate(page))
665 return;
666
667 alloc_data_block(inode, page);
668
669 block = logfs_block(page);
670 array = kmap_atomic(page, KM_USER0);
671 initialize_block_counters(page, block, array, page_is_empty);
672 kunmap_atomic(array, KM_USER0);
673}
674
675static void block_set_pointer(struct page *page, int index, u64 ptr)
676{
677 struct logfs_block *block = logfs_block(page);
678 __be64 *array;
679 u64 oldptr;
680
681 BUG_ON(!block);
682 array = kmap_atomic(page, KM_USER0);
683 oldptr = be64_to_cpu(array[index]);
684 array[index] = cpu_to_be64(ptr);
685 kunmap_atomic(array, KM_USER0);
686 SetPageUptodate(page);
687
688 block->full += !!(ptr & LOGFS_FULLY_POPULATED)
689 - !!(oldptr & LOGFS_FULLY_POPULATED);
690 block->partial += !!ptr - !!oldptr;
691}
692
693static u64 block_get_pointer(struct page *page, int index)
694{
695 __be64 *block;
696 u64 ptr;
697
698 block = kmap_atomic(page, KM_USER0);
699 ptr = be64_to_cpu(block[index]);
700 kunmap_atomic(block, KM_USER0);
701 return ptr;
702}
703
704static int logfs_read_empty(struct page *page)
705{
706 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
707 return 0;
708}
709
710static int logfs_read_direct(struct inode *inode, struct page *page)
711{
712 struct logfs_inode *li = logfs_inode(inode);
713 pgoff_t index = page->index;
714 u64 block;
715
716 block = li->li_data[index];
717 if (!block)
718 return logfs_read_empty(page);
719
720 return logfs_segment_read(inode, page, block, index, 0);
721}
722
723static int logfs_read_loop(struct inode *inode, struct page *page,
724 int rw_context)
725{
726 struct logfs_inode *li = logfs_inode(inode);
727 u64 bix, bofs = li->li_data[INDIRECT_INDEX];
728 level_t level, target_level;
729 int ret;
730 struct page *ipage;
731
732 logfs_unpack_index(page->index, &bix, &target_level);
733 if (!bofs)
734 return logfs_read_empty(page);
735
736 if (bix >= maxbix(li->li_height))
737 return logfs_read_empty(page);
738
739 for (level = LEVEL(li->li_height);
740 (__force u8)level > (__force u8)target_level;
741 level = SUBLEVEL(level)) {
742 ipage = logfs_get_page(inode, bix, level, rw_context);
743 if (!ipage)
744 return -ENOMEM;
745
746 ret = logfs_segment_read(inode, ipage, bofs, bix, level);
747 if (ret) {
748 logfs_put_read_page(ipage);
749 return ret;
750 }
751
752 bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level)));
753 logfs_put_page(ipage, rw_context);
754 if (!bofs)
755 return logfs_read_empty(page);
756 }
757
758 return logfs_segment_read(inode, page, bofs, bix, 0);
759}
760
761static int logfs_read_block(struct inode *inode, struct page *page,
762 int rw_context)
763{
764 pgoff_t index = page->index;
765
766 if (index < I0_BLOCKS)
767 return logfs_read_direct(inode, page);
768 return logfs_read_loop(inode, page, rw_context);
769}
770
771static int logfs_exist_loop(struct inode *inode, u64 bix)
772{
773 struct logfs_inode *li = logfs_inode(inode);
774 u64 bofs = li->li_data[INDIRECT_INDEX];
775 level_t level;
776 int ret;
777 struct page *ipage;
778
779 if (!bofs)
780 return 0;
781 if (bix >= maxbix(li->li_height))
782 return 0;
783
784 for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
785 ipage = logfs_get_read_page(inode, bix, level);
786 if (!ipage)
787 return -ENOMEM;
788
789 ret = logfs_segment_read(inode, ipage, bofs, bix, level);
790 if (ret) {
791 logfs_put_read_page(ipage);
792 return ret;
793 }
794
795 bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level)));
796 logfs_put_read_page(ipage);
797 if (!bofs)
798 return 0;
799 }
800
801 return 1;
802}
803
804int logfs_exist_block(struct inode *inode, u64 bix)
805{
806 struct logfs_inode *li = logfs_inode(inode);
807
808 if (bix < I0_BLOCKS)
809 return !!li->li_data[bix];
810 return logfs_exist_loop(inode, bix);
811}
812
813static u64 seek_holedata_direct(struct inode *inode, u64 bix, int data)
814{
815 struct logfs_inode *li = logfs_inode(inode);
816
817 for (; bix < I0_BLOCKS; bix++)
818 if (data ^ (li->li_data[bix] == 0))
819 return bix;
820 return I0_BLOCKS;
821}
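/*
 * The `data ^ (pointer == 0)' test above folds both search directions
 * into one loop: with data=1 it stops at the first non-null pointer
 * (seek data), with data=0 at the first null pointer (seek hole).  A
 * standalone model over a plain array:
 */
#include <stdio.h>
#include <stdint.h>

#define NBLOCKS 8

static uint64_t seek_holedata(const uint64_t *ptrs, uint64_t bix, int data)
{
	for (; bix < NBLOCKS; bix++)
		if (data ^ (ptrs[bix] == 0))
			return bix;
	return NBLOCKS;
}

int main(void)
{
	/* blocks 0, 1 and 4 are allocated; the rest are holes */
	uint64_t ptrs[NBLOCKS] = { 10, 20, 0, 0, 30, 0, 0, 0 };

	printf("first hole from 0: %llu\n",
			(unsigned long long)seek_holedata(ptrs, 0, 0));
	printf("first data from 2: %llu\n",
			(unsigned long long)seek_holedata(ptrs, 2, 1));
	return 0;
}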
822
823static u64 seek_holedata_loop(struct inode *inode, u64 bix, int data)
824{
825 struct logfs_inode *li = logfs_inode(inode);
826 __be64 *rblock;
827 u64 increment, bofs = li->li_data[INDIRECT_INDEX];
828 level_t level;
829 int ret, slot;
830 struct page *page;
831
832 BUG_ON(!bofs);
833
834 for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
835 increment = 1 << (LOGFS_BLOCK_BITS * ((__force u8)level-1));
836 page = logfs_get_read_page(inode, bix, level);
837 if (!page)
838 return bix;
839
840 ret = logfs_segment_read(inode, page, bofs, bix, level);
841 if (ret) {
842 logfs_put_read_page(page);
843 return bix;
844 }
845
846 slot = get_bits(bix, SUBLEVEL(level));
847 rblock = kmap_atomic(page, KM_USER0);
848 while (slot < LOGFS_BLOCK_FACTOR) {
849 if (data && (rblock[slot] != 0))
850 break;
851 if (!data && !(be64_to_cpu(rblock[slot]) & LOGFS_FULLY_POPULATED))
852 break;
853 slot++;
854 bix += increment;
855 bix &= ~(increment - 1);
856 }
857 if (slot >= LOGFS_BLOCK_FACTOR) {
858 kunmap_atomic(rblock, KM_USER0);
859 logfs_put_read_page(page);
860 return bix;
861 }
862 bofs = be64_to_cpu(rblock[slot]);
863 kunmap_atomic(rblock, KM_USER0);
864 logfs_put_read_page(page);
865 if (!bofs) {
866 BUG_ON(data);
867 return bix;
868 }
869 }
870 return bix;
871}
872
873/**
874 * logfs_seek_hole - find next hole starting at a given block index
875 * @inode: inode to search in
876 * @bix: block index to start searching
877 *
878 * Returns next hole. If the file doesn't contain any further holes, the
879 * block address next to eof is returned instead.
880 */
881u64 logfs_seek_hole(struct inode *inode, u64 bix)
882{
883 struct logfs_inode *li = logfs_inode(inode);
884
885 if (bix < I0_BLOCKS) {
886 bix = seek_holedata_direct(inode, bix, 0);
887 if (bix < I0_BLOCKS)
888 return bix;
889 }
890
891 if (!li->li_data[INDIRECT_INDEX])
892 return bix;
893 else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED)
894 bix = maxbix(li->li_height);
895 else {
896 bix = seek_holedata_loop(inode, bix, 0);
897 if (bix < maxbix(li->li_height))
898 return bix;
899 /* Should not happen anymore. But if some port writes semi-
900 * corrupt images (as this one used to) we might run into it.
901 */
902 WARN_ON_ONCE(bix == maxbix(li->li_height));
903 }
904
905 return bix;
906}
907
908static u64 __logfs_seek_data(struct inode *inode, u64 bix)
909{
910 struct logfs_inode *li = logfs_inode(inode);
911
912 if (bix < I0_BLOCKS) {
913 bix = seek_holedata_direct(inode, bix, 1);
914 if (bix < I0_BLOCKS)
915 return bix;
916 }
917
918 if (bix < maxbix(li->li_height)) {
919 if (!li->li_data[INDIRECT_INDEX])
920 bix = maxbix(li->li_height);
921 else
922 return seek_holedata_loop(inode, bix, 1);
923 }
924
925 return bix;
926}
927
928/**
929 * logfs_seek_data - find next data block after a given block index
930 * @inode: inode to search in
931 * @bix: block index to start searching
932 *
933 * Returns next data block. If the file doesn't contain any further data
934 * blocks, the last block in the file is returned instead.
935 */
936u64 logfs_seek_data(struct inode *inode, u64 bix)
937{
938 struct super_block *sb = inode->i_sb;
939 u64 ret, end;
940
941 ret = __logfs_seek_data(inode, bix);
942 end = i_size_read(inode) >> sb->s_blocksize_bits;
943 if (ret >= end)
944 ret = max(bix, end);
945 return ret;
946}
947
948static int logfs_is_valid_direct(struct logfs_inode *li, u64 bix, u64 ofs)
949{
950 return pure_ofs(li->li_data[bix]) == ofs;
951}
952
953static int __logfs_is_valid_loop(struct inode *inode, u64 bix,
954 u64 ofs, u64 bofs)
955{
956 struct logfs_inode *li = logfs_inode(inode);
957 level_t level;
958 int ret;
959 struct page *page;
960
961 for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
962 page = logfs_get_write_page(inode, bix, level);
963 BUG_ON(!page);
964
965 ret = logfs_segment_read(inode, page, bofs, bix, level);
966 if (ret) {
967 logfs_put_write_page(page);
968 return 0;
969 }
970
971 bofs = block_get_pointer(page, get_bits(bix, SUBLEVEL(level)));
972 logfs_put_write_page(page);
973 if (!bofs)
974 return 0;
975
976 if (pure_ofs(bofs) == ofs)
977 return 1;
978 }
979 return 0;
980}
981
982static int logfs_is_valid_loop(struct inode *inode, u64 bix, u64 ofs)
983{
984 struct logfs_inode *li = logfs_inode(inode);
985 u64 bofs = li->li_data[INDIRECT_INDEX];
986
987 if (!bofs)
988 return 0;
989
990 if (bix >= maxbix(li->li_height))
991 return 0;
992
993 if (pure_ofs(bofs) == ofs)
994 return 1;
995
996 return __logfs_is_valid_loop(inode, bix, ofs, bofs);
997}
998
999static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs)
1000{
1001 struct logfs_inode *li = logfs_inode(inode);
1002
1003 if ((inode->i_nlink == 0) && atomic_read(&inode->i_count) == 1)
1004 return 0;
1005
1006 if (bix < I0_BLOCKS)
1007 return logfs_is_valid_direct(li, bix, ofs);
1008 return logfs_is_valid_loop(inode, bix, ofs);
1009}
1010
1011/**
1012 * logfs_is_valid_block - check whether this block is still valid
1013 *
1014 * @sb: superblock
1015 * @ofs: block physical offset
1016 * @ino: block inode number
1017 * @bix: block index
1018 * @gc_level: block level
1019 *
1020 * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will
1021 * become invalid once the journal is written.
1022 */
1023int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix,
1024 gc_level_t gc_level)
1025{
1026 struct logfs_super *super = logfs_super(sb);
1027 struct inode *inode;
1028 int ret, cookie;
1029
1030 /* Umount closes a segment with free blocks remaining. Those
1031 * blocks are by definition invalid. */
1032 if (ino == -1)
1033 return 0;
1034
1035 LOGFS_BUG_ON((u64)(u_long)ino != ino, sb);
1036
1037 inode = logfs_safe_iget(sb, ino, &cookie);
1038 if (IS_ERR(inode))
1039 goto invalid;
1040
1041 ret = __logfs_is_valid_block(inode, bix, ofs);
1042 logfs_safe_iput(inode, cookie);
1043 if (ret)
1044 return ret;
1045
1046invalid:
1047 /* Block is nominally invalid, but may still sit in the shadow tree,
1048 * waiting for a journal commit.
1049 */
1050 if (btree_lookup64(&super->s_shadow_tree.old, ofs))
1051 return 2;
1052 return 0;
1053}
1054
1055int logfs_readpage_nolock(struct page *page)
1056{
1057 struct inode *inode = page->mapping->host;
1058 int ret;
1059
1060 ret = logfs_read_block(inode, page, READ);
1061
1062 if (ret) {
1063 ClearPageUptodate(page);
1064 SetPageError(page);
1065 } else {
1066 SetPageUptodate(page);
1067 ClearPageError(page);
1068 }
1069 flush_dcache_page(page);
1070
1071 return ret;
1072}
1073
1074static int logfs_reserve_bytes(struct inode *inode, int bytes)
1075{
1076 struct logfs_super *super = logfs_super(inode->i_sb);
1077 u64 available = super->s_free_bytes + super->s_dirty_free_bytes
1078 - super->s_dirty_used_bytes - super->s_dirty_pages;
1079
1080 if (!bytes)
1081 return 0;
1082
1083 if (available < bytes)
1084 return -ENOSPC;
1085
1086 if (available < bytes + super->s_root_reserve &&
1087 !capable(CAP_SYS_RESOURCE))
1088 return -ENOSPC;
1089
1090 return 0;
1091}
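/*
 * A standalone model of the admission check above: writes are granted
 * against free + dirty_free - dirty_used - dirty_pages, and only a
 * CAP_SYS_RESOURCE holder may dip into the root reserve.  All numbers
 * below are made up.
 */
#include <stdio.h>
#include <stdint.h>

struct space {
	uint64_t free_bytes, dirty_free_bytes;
	uint64_t dirty_used_bytes, dirty_pages;
	uint64_t root_reserve;
};

static int reserve(const struct space *s, uint64_t bytes, int privileged)
{
	uint64_t available = s->free_bytes + s->dirty_free_bytes
			- s->dirty_used_bytes - s->dirty_pages;

	if (!bytes)
		return 0;
	if (available < bytes)
		return -1;				/* -ENOSPC */
	if (available < bytes + s->root_reserve && !privileged)
		return -1;				/* reserve is root-only */
	return 0;
}

int main(void)
{
	struct space s = { 4096, 512, 256, 128, 2500 };

	printf("unprivileged: %d\n", reserve(&s, 2000, 0));	/* -1 */
	printf("privileged:   %d\n", reserve(&s, 2000, 1));	/*  0 */
	return 0;
}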
1092
1093int get_page_reserve(struct inode *inode, struct page *page)
1094{
1095 struct logfs_super *super = logfs_super(inode->i_sb);
1096 int ret;
1097
1098 if (logfs_block(page) && logfs_block(page)->reserved_bytes)
1099 return 0;
1100
1101 logfs_get_wblocks(inode->i_sb, page, WF_LOCK);
1102 ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE);
1103 if (!ret) {
1104 alloc_data_block(inode, page);
1105 logfs_block(page)->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE;
1106 super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE;
1107 }
1108 logfs_put_wblocks(inode->i_sb, page, WF_LOCK);
1109 return ret;
1110}
1111
1112/*
1113 * We are protected by write lock. Push victims up to superblock level
1114 * and release transaction when appropriate.
1115 */
1116/* FIXME: This is currently called from the wrong spots. */
1117static void logfs_handle_transaction(struct inode *inode,
1118 struct logfs_transaction *ta)
1119{
1120 struct logfs_super *super = logfs_super(inode->i_sb);
1121
1122 if (!ta)
1123 return;
1124 logfs_inode(inode)->li_block->ta = NULL;
1125
1126 if (inode->i_ino != LOGFS_INO_MASTER) {
1127 BUG(); /* FIXME: Yes, this needs more thought */
1128 /* just remember the transaction until inode is written */
1129 //BUG_ON(logfs_inode(inode)->li_transaction);
1130 //logfs_inode(inode)->li_transaction = ta;
1131 return;
1132 }
1133
1134 switch (ta->state) {
1135 case CREATE_1: /* fall through */
1136 case UNLINK_1:
1137 BUG_ON(super->s_victim_ino);
1138 super->s_victim_ino = ta->ino;
1139 break;
1140 case CREATE_2: /* fall through */
1141 case UNLINK_2:
1142 BUG_ON(super->s_victim_ino != ta->ino);
1143 super->s_victim_ino = 0;
1144 /* transaction ends here - free it */
1145 kfree(ta);
1146 break;
1147 case CROSS_RENAME_1:
1148 BUG_ON(super->s_rename_dir);
1149 BUG_ON(super->s_rename_pos);
1150 super->s_rename_dir = ta->dir;
1151 super->s_rename_pos = ta->pos;
1152 break;
1153 case CROSS_RENAME_2:
1154 BUG_ON(super->s_rename_dir != ta->dir);
1155 BUG_ON(super->s_rename_pos != ta->pos);
1156 super->s_rename_dir = 0;
1157 super->s_rename_pos = 0;
1158 kfree(ta);
1159 break;
1160 case TARGET_RENAME_1:
1161 BUG_ON(super->s_rename_dir);
1162 BUG_ON(super->s_rename_pos);
1163 BUG_ON(super->s_victim_ino);
1164 super->s_rename_dir = ta->dir;
1165 super->s_rename_pos = ta->pos;
1166 super->s_victim_ino = ta->ino;
1167 break;
1168 case TARGET_RENAME_2:
1169 BUG_ON(super->s_rename_dir != ta->dir);
1170 BUG_ON(super->s_rename_pos != ta->pos);
1171 BUG_ON(super->s_victim_ino != ta->ino);
1172 super->s_rename_dir = 0;
1173 super->s_rename_pos = 0;
1174 break;
1175 case TARGET_RENAME_3:
1176 BUG_ON(super->s_rename_dir);
1177 BUG_ON(super->s_rename_pos);
1178 BUG_ON(super->s_victim_ino != ta->ino);
1179 super->s_victim_ino = 0;
1180 kfree(ta);
1181 break;
1182 default:
1183 BUG();
1184 }
1185}
1186
1187/*
1188 * Not strictly a reservation, but rather a check that we still have enough
1189 * space to satisfy the write.
1190 */
1191static int logfs_reserve_blocks(struct inode *inode, int blocks)
1192{
1193 return logfs_reserve_bytes(inode, blocks * LOGFS_MAX_OBJECTSIZE);
1194}
1195
1196struct write_control {
1197 u64 ofs;
1198 long flags;
1199};
1200
1201static struct logfs_shadow *alloc_shadow(struct inode *inode, u64 bix,
1202 level_t level, u64 old_ofs)
1203{
1204 struct logfs_super *super = logfs_super(inode->i_sb);
1205 struct logfs_shadow *shadow;
1206
1207 shadow = mempool_alloc(super->s_shadow_pool, GFP_NOFS);
1208 memset(shadow, 0, sizeof(*shadow));
1209 shadow->ino = inode->i_ino;
1210 shadow->bix = bix;
1211 shadow->gc_level = expand_level(inode->i_ino, level);
1212 shadow->old_ofs = old_ofs & ~LOGFS_FULLY_POPULATED;
1213 return shadow;
1214}
1215
1216static void free_shadow(struct inode *inode, struct logfs_shadow *shadow)
1217{
1218 struct logfs_super *super = logfs_super(inode->i_sb);
1219
1220 mempool_free(shadow, super->s_shadow_pool);
1221}
1222
1223static void mark_segment(struct shadow_tree *tree, u32 segno)
1224{
1225 int err;
1226
1227 if (!btree_lookup32(&tree->segment_map, segno)) {
1228 err = btree_insert32(&tree->segment_map, segno, (void *)1,
1229 GFP_NOFS);
1230 BUG_ON(err);
1231 tree->no_shadowed_segments++;
1232 }
1233}
1234
1235/**
1236 * fill_shadow_tree - Propagate shadow tree changes due to a write
1237 * @inode: Inode owning the page
1238 * @page: Struct page that was written
1239 * @shadow: Shadow for the current write
1240 *
1241 * Writes in logfs can result in two semi-valid objects. The old object
1242 * is still valid as long as it can be reached by following pointers on
1243 * the medium. Only when writes propagate all the way up to the journal
1244 * has the new object safely replaced the old one.
1245 *
1246 * To handle this problem, a struct logfs_shadow is used to represent
1247 * every single write. It is attached to the indirect block, which is
1248 * marked dirty. When the indirect block is written, its shadows are
1249 * handed up to the next indirect block (or inode). Ultimately they
1250 * will reach the master inode and be freed upon journal commit.
1251 *
1252 * This function handles a single step in the propagation. It adds the
1253 * shadow for the current write to the tree, along with any shadows in
1254 * the page's tree, in case it was an indirect block. If a page is
1255 * written, the inode parameter is left NULL; if an inode is written,
1256 * the page parameter is left NULL.
1257 */
1258static void fill_shadow_tree(struct inode *inode, struct page *page,
1259 struct logfs_shadow *shadow)
1260{
1261 struct logfs_super *super = logfs_super(inode->i_sb);
1262 struct logfs_block *block = logfs_block(page);
1263 struct shadow_tree *tree = &super->s_shadow_tree;
1264
1265 if (PagePrivate(page)) {
1266 if (block->alias_map)
1267 super->s_no_object_aliases -= bitmap_weight(
1268 block->alias_map, LOGFS_BLOCK_FACTOR);
1269 logfs_handle_transaction(inode, block->ta);
1270 block->ops->free_block(inode->i_sb, block);
1271 }
1272 if (shadow) {
1273 if (shadow->old_ofs)
1274 btree_insert64(&tree->old, shadow->old_ofs, shadow,
1275 GFP_NOFS);
1276 else
1277 btree_insert64(&tree->new, shadow->new_ofs, shadow,
1278 GFP_NOFS);
1279
1280 super->s_dirty_used_bytes += shadow->new_len;
1281 super->s_dirty_free_bytes += shadow->old_len;
1282 mark_segment(tree, shadow->old_ofs >> super->s_segshift);
1283 mark_segment(tree, shadow->new_ofs >> super->s_segshift);
1284 }
1285}
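/*
 * A tiny model of the routing rule documented above: a shadow that
 * replaces an on-medium object (old_ofs != 0) is filed under its old
 * offset, a brand-new object under its new offset.  The real code
 * inserts into mempool-backed btrees and also marks both segments.
 */
#include <stdio.h>
#include <stdint.h>

struct shadow { uint64_t old_ofs, new_ofs, old_len, new_len; };

static const char *tree_for(const struct shadow *sh)
{
	return sh->old_ofs ? "tree->old" : "tree->new";
}

int main(void)
{
	struct shadow rewrite = { 0x8000, 0xc000, 4096, 4096 };
	struct shadow fresh   = { 0,      0xd000, 0,    4096 };

	printf("rewrite -> %s\n", tree_for(&rewrite));	/* tree->old */
	printf("fresh   -> %s\n", tree_for(&fresh));	/* tree->new */
	return 0;
}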
1286
1287static void logfs_set_alias(struct super_block *sb, struct logfs_block *block,
1288 long child_no)
1289{
1290 struct logfs_super *super = logfs_super(sb);
1291
1292 if (block->inode && block->inode->i_ino == LOGFS_INO_MASTER) {
1293 /* Aliases in the master inode are pointless. */
1294 return;
1295 }
1296
1297 if (!test_bit(child_no, block->alias_map)) {
1298 set_bit(child_no, block->alias_map);
1299 super->s_no_object_aliases++;
1300 }
1301 list_move_tail(&block->alias_list, &super->s_object_alias);
1302}
1303
1304/*
1305 * Object aliases can and often do change the size and occupied space of a
1306 * file. So not only do we have to change the pointers, we also have to
1307 * change inode->i_size and li->li_used_bytes. This is done by setting
1308 * another two object aliases for the inode itself.
1309 */
1310static void set_iused(struct inode *inode, struct logfs_shadow *shadow)
1311{
1312 struct logfs_inode *li = logfs_inode(inode);
1313
1314 if (shadow->new_len == shadow->old_len)
1315 return;
1316
1317 alloc_inode_block(inode);
1318 li->li_used_bytes += shadow->new_len - shadow->old_len;
1319 __logfs_set_blocks(inode);
1320 logfs_set_alias(inode->i_sb, li->li_block, INODE_USED_OFS);
1321 logfs_set_alias(inode->i_sb, li->li_block, INODE_SIZE_OFS);
1322}
1323
1324static int logfs_write_i0(struct inode *inode, struct page *page,
1325 struct write_control *wc)
1326{
1327 struct logfs_shadow *shadow;
1328 u64 bix;
1329 level_t level;
1330 int full, err = 0;
1331
1332 logfs_unpack_index(page->index, &bix, &level);
1333 if (wc->ofs == 0)
1334 if (logfs_reserve_blocks(inode, 1))
1335 return -ENOSPC;
1336
1337 shadow = alloc_shadow(inode, bix, level, wc->ofs);
1338 if (wc->flags & WF_WRITE)
1339 err = logfs_segment_write(inode, page, shadow);
1340 if (wc->flags & WF_DELETE)
1341 logfs_segment_delete(inode, shadow);
1342 if (err) {
1343 free_shadow(inode, shadow);
1344 return err;
1345 }
1346
1347 set_iused(inode, shadow);
1348 full = 1;
1349 if (level != 0) {
1350 alloc_indirect_block(inode, page, 0);
1351 full = logfs_block(page)->full == LOGFS_BLOCK_FACTOR;
1352 }
1353 fill_shadow_tree(inode, page, shadow);
1354 wc->ofs = shadow->new_ofs;
1355 if (wc->ofs && full)
1356 wc->ofs |= LOGFS_FULLY_POPULATED;
1357 return 0;
1358}
1359
1360static int logfs_write_direct(struct inode *inode, struct page *page,
1361 long flags)
1362{
1363 struct logfs_inode *li = logfs_inode(inode);
1364 struct write_control wc = {
1365 .ofs = li->li_data[page->index],
1366 .flags = flags,
1367 };
1368 int err;
1369
1370 alloc_inode_block(inode);
1371
1372 err = logfs_write_i0(inode, page, &wc);
1373 if (err)
1374 return err;
1375
1376 li->li_data[page->index] = wc.ofs;
1377 logfs_set_alias(inode->i_sb, li->li_block,
1378 page->index + INODE_POINTER_OFS);
1379 return 0;
1380}
1381
1382static int ptr_change(u64 ofs, struct page *page)
1383{
1384 struct logfs_block *block = logfs_block(page);
1385 int empty0, empty1, full0, full1;
1386
1387 empty0 = ofs == 0;
1388 empty1 = block->partial == 0;
1389 if (empty0 != empty1)
1390 return 1;
1391
1392 /* The !! is necessary to shrink result to int */
1393 full0 = !!(ofs & LOGFS_FULLY_POPULATED);
1394 full1 = block->full == LOGFS_BLOCK_FACTOR;
1395 if (full0 != full1)
1396 return 1;
1397 return 0;
1398}
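/*
 * A note on the `!!' idiom used by ptr_change() and block_set_pointer():
 * it collapses any non-zero value to exactly 1, so a flag bit masked out
 * of a u64 can serve as a 0/1 counter delta.  Standalone illustration;
 * the flag value mirrors LOGFS_FULLY_POPULATED (assumed to be bit 0).
 */
#include <stdio.h>
#include <stdint.h>

#define FULLY_POPULATED 1ULL	/* assumed: lowest pointer bit */

int main(void)
{
	uint64_t oldptr = 0x1000 | FULLY_POPULATED;
	uint64_t newptr = 0x2000;	/* populated, but not fully */

	/* Without !!, the masked u64 would be 0 or the raw bit value,
	 * not a 0/1 suitable for +/-1 counter arithmetic. */
	int full_delta = !!(newptr & FULLY_POPULATED)
			- !!(oldptr & FULLY_POPULATED);	/* -1 */
	int partial_delta = !!newptr - !!oldptr;	/*  0 */

	printf("full %+d, partial %+d\n", full_delta, partial_delta);
	return 0;
}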
1399
1400static int __logfs_write_rec(struct inode *inode, struct page *page,
1401 struct write_control *this_wc,
1402 pgoff_t bix, level_t target_level, level_t level)
1403{
1404 int ret, page_empty = 0;
1405 int child_no = get_bits(bix, SUBLEVEL(level));
1406 struct page *ipage;
1407 struct write_control child_wc = {
1408 .flags = this_wc->flags,
1409 };
1410
1411 ipage = logfs_get_write_page(inode, bix, level);
1412 if (!ipage)
1413 return -ENOMEM;
1414
1415 if (this_wc->ofs) {
1416 ret = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level);
1417 if (ret)
1418 goto out;
1419 } else if (!PageUptodate(ipage)) {
1420 page_empty = 1;
1421 logfs_read_empty(ipage);
1422 }
1423
1424 child_wc.ofs = block_get_pointer(ipage, child_no);
1425
1426 if ((__force u8)level-1 > (__force u8)target_level)
1427 ret = __logfs_write_rec(inode, page, &child_wc, bix,
1428 target_level, SUBLEVEL(level));
1429 else
1430 ret = logfs_write_i0(inode, page, &child_wc);
1431
1432 if (ret)
1433 goto out;
1434
1435 alloc_indirect_block(inode, ipage, page_empty);
1436 block_set_pointer(ipage, child_no, child_wc.ofs);
1437 /* FIXME: first condition seems superfluous */
1438 if (child_wc.ofs || logfs_block(ipage)->partial)
1439 this_wc->flags |= WF_WRITE;
1440 /* the condition on this_wc->ofs ensures that we won't consume extra
1441 * space for indirect blocks in the future, which we cannot reserve */
1442 if (!this_wc->ofs || ptr_change(this_wc->ofs, ipage))
1443 ret = logfs_write_i0(inode, ipage, this_wc);
1444 else
1445 logfs_set_alias(inode->i_sb, logfs_block(ipage), child_no);
1446out:
1447 logfs_put_write_page(ipage);
1448 return ret;
1449}
1450
1451static int logfs_write_rec(struct inode *inode, struct page *page,
1452 pgoff_t bix, level_t target_level, long flags)
1453{
1454 struct logfs_inode *li = logfs_inode(inode);
1455 struct write_control wc = {
1456 .ofs = li->li_data[INDIRECT_INDEX],
1457 .flags = flags,
1458 };
1459 int ret;
1460
1461 alloc_inode_block(inode);
1462
1463 if (li->li_height > (__force u8)target_level)
1464 ret = __logfs_write_rec(inode, page, &wc, bix, target_level,
1465 LEVEL(li->li_height));
1466 else
1467 ret = logfs_write_i0(inode, page, &wc);
1468 if (ret)
1469 return ret;
1470
1471 if (li->li_data[INDIRECT_INDEX] != wc.ofs) {
1472 li->li_data[INDIRECT_INDEX] = wc.ofs;
1473 logfs_set_alias(inode->i_sb, li->li_block,
1474 INDIRECT_INDEX + INODE_POINTER_OFS);
1475 }
1476 return ret;
1477}
1478
1479void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta)
1480{
1481 alloc_inode_block(inode);
1482 logfs_inode(inode)->li_block->ta = ta;
1483}
1484
1485void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta)
1486{
1487 struct logfs_block *block = logfs_inode(inode)->li_block;
1488
1489 if (block && block->ta)
1490 block->ta = NULL;
1491}
1492
1493static int grow_inode(struct inode *inode, u64 bix, level_t level)
1494{
1495 struct logfs_inode *li = logfs_inode(inode);
1496 u8 height = (__force u8)level;
1497 struct page *page;
1498 struct write_control wc = {
1499 .flags = WF_WRITE,
1500 };
1501 int err;
1502
1503 BUG_ON(height > 5 || li->li_height > 5);
1504 while (height > li->li_height || bix >= maxbix(li->li_height)) {
1505 page = logfs_get_write_page(inode, I0_BLOCKS + 1,
1506 LEVEL(li->li_height + 1));
1507 if (!page)
1508 return -ENOMEM;
1509 logfs_read_empty(page);
1510 alloc_indirect_block(inode, page, 1);
1511 block_set_pointer(page, 0, li->li_data[INDIRECT_INDEX]);
1512 err = logfs_write_i0(inode, page, &wc);
1513 logfs_put_write_page(page);
1514 if (err)
1515 return err;
1516 li->li_data[INDIRECT_INDEX] = wc.ofs;
1517 wc.ofs = 0;
1518 li->li_height++;
1519 logfs_set_alias(inode->i_sb, li->li_block, INODE_HEIGHT_OFS);
1520 }
1521 return 0;
1522}
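/*
 * A sketch of when grow_inode() above must add a level, assuming
 * maxbix(h) == 1ULL << (LOGFS_BLOCK_BITS * h) as defined in logfs.h
 * and LOGFS_BLOCK_BITS == 9.  Both are assumptions for illustration.
 */
#include <stdio.h>
#include <stdint.h>

#define BLOCK_BITS 9	/* assumed */

static uint64_t maxbix(unsigned height)
{
	return 1ULL << (BLOCK_BITS * height);
}

int main(void)
{
	uint64_t bix = 1 << 20;	/* roughly a million blocks in */
	unsigned height = 0;

	while (bix >= maxbix(height))
		height++;	/* each pass adds one indirect level */

	printf("bix %llu needs height %u\n",
			(unsigned long long)bix, height);	/* 3 */
	return 0;
}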
1523
1524static int __logfs_write_buf(struct inode *inode, struct page *page, long flags)
1525{
1526 struct logfs_super *super = logfs_super(inode->i_sb);
1527 pgoff_t index = page->index;
1528 u64 bix;
1529 level_t level;
1530 int err;
1531
1532 flags |= WF_WRITE | WF_DELETE;
1533 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1534
1535 logfs_unpack_index(index, &bix, &level);
1536 if (logfs_block(page) && logfs_block(page)->reserved_bytes)
1537 super->s_dirty_pages -= logfs_block(page)->reserved_bytes;
1538
1539 if (index < I0_BLOCKS)
1540 return logfs_write_direct(inode, page, flags);
1541
1542 bix = adjust_bix(bix, level);
1543 err = grow_inode(inode, bix, level);
1544 if (err)
1545 return err;
1546 return logfs_write_rec(inode, page, bix, level, flags);
1547}
1548
1549int logfs_write_buf(struct inode *inode, struct page *page, long flags)
1550{
1551 struct super_block *sb = inode->i_sb;
1552 int ret;
1553
1554 logfs_get_wblocks(sb, page, flags & WF_LOCK);
1555 ret = __logfs_write_buf(inode, page, flags);
1556 logfs_put_wblocks(sb, page, flags & WF_LOCK);
1557 return ret;
1558}
1559
1560static int __logfs_delete(struct inode *inode, struct page *page)
1561{
1562 long flags = WF_DELETE;
1563
1564 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1565
1566 if (page->index < I0_BLOCKS)
1567 return logfs_write_direct(inode, page, flags);
1568 return logfs_write_rec(inode, page, page->index, 0, flags);
1569}
1570
1571int logfs_delete(struct inode *inode, pgoff_t index,
1572 struct shadow_tree *shadow_tree)
1573{
1574 struct super_block *sb = inode->i_sb;
1575 struct page *page;
1576 int ret;
1577
1578 page = logfs_get_read_page(inode, index, 0);
1579 if (!page)
1580 return -ENOMEM;
1581
1582 logfs_get_wblocks(sb, page, 1);
1583 ret = __logfs_delete(inode, page);
1584 logfs_put_wblocks(sb, page, 1);
1585
1586 logfs_put_read_page(page);
1587
1588 return ret;
1589}
1590
1591int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
1592 gc_level_t gc_level, long flags)
1593{
1594 level_t level = shrink_level(gc_level);
1595 struct page *page;
1596 int err;
1597
1598 page = logfs_get_write_page(inode, bix, level);
1599 if (!page)
1600 return -ENOMEM;
1601
1602 err = logfs_segment_read(inode, page, ofs, bix, level);
1603 if (!err) {
1604 if (level != 0)
1605 alloc_indirect_block(inode, page, 0);
1606 err = logfs_write_buf(inode, page, flags);
1607 if (!err && shrink_level(gc_level) == 0) {
1608 /* Rewrite cannot mark the inode dirty but has to
1609 * write it immediately.
1610 * Q: Can't we just create an alias for the inode
1611 * instead? And if not, why not?
1612 */
1613 if (inode->i_ino == LOGFS_INO_MASTER)
1614 logfs_write_anchor(inode->i_sb);
1615 else {
1616 err = __logfs_write_inode(inode, flags);
1617 }
1618 }
1619 }
1620 logfs_put_write_page(page);
1621 return err;
1622}
1623
1624static int truncate_data_block(struct inode *inode, struct page *page,
1625 u64 ofs, struct logfs_shadow *shadow, u64 size)
1626{
1627 loff_t pageofs = (loff_t)page->index << inode->i_sb->s_blocksize_bits;
1628 u64 bix;
1629 level_t level;
1630 int err;
1631
1632 /* Does truncation happen within this page? */
1633 if (size <= pageofs || size - pageofs >= PAGE_SIZE)
1634 return 0;
1635
1636 logfs_unpack_index(page->index, &bix, &level);
1637 BUG_ON(level != 0);
1638
1639 err = logfs_segment_read(inode, page, ofs, bix, level);
1640 if (err)
1641 return err;
1642
1643 zero_user_segment(page, size - pageofs, PAGE_CACHE_SIZE);
1644 return logfs_segment_write(inode, page, shadow);
1645}
1646
1647static int logfs_truncate_i0(struct inode *inode, struct page *page,
1648 struct write_control *wc, u64 size)
1649{
1650 struct logfs_shadow *shadow;
1651 u64 bix;
1652 level_t level;
1653 int err = 0;
1654
1655 logfs_unpack_index(page->index, &bix, &level);
1656 BUG_ON(level != 0);
1657 shadow = alloc_shadow(inode, bix, level, wc->ofs);
1658
1659 err = truncate_data_block(inode, page, wc->ofs, shadow, size);
1660 if (err) {
1661 free_shadow(inode, shadow);
1662 return err;
1663 }
1664
1665 logfs_segment_delete(inode, shadow);
1666 set_iused(inode, shadow);
1667 fill_shadow_tree(inode, page, shadow);
1668 wc->ofs = shadow->new_ofs;
1669 return 0;
1670}
1671
1672static int logfs_truncate_direct(struct inode *inode, u64 size)
1673{
1674 struct logfs_inode *li = logfs_inode(inode);
1675 struct write_control wc;
1676 struct page *page;
1677 int e;
1678 int err;
1679
1680 alloc_inode_block(inode);
1681
1682 for (e = I0_BLOCKS - 1; e >= 0; e--) {
1683 if (size > (e+1) * LOGFS_BLOCKSIZE)
1684 break;
1685
1686 wc.ofs = li->li_data[e];
1687 if (!wc.ofs)
1688 continue;
1689
1690 page = logfs_get_write_page(inode, e, 0);
1691 if (!page)
1692 return -ENOMEM;
1693 err = logfs_segment_read(inode, page, wc.ofs, e, 0);
1694 if (err) {
1695 logfs_put_write_page(page);
1696 return err;
1697 }
1698 err = logfs_truncate_i0(inode, page, &wc, size);
1699 logfs_put_write_page(page);
1700 if (err)
1701 return err;
1702
1703 li->li_data[e] = wc.ofs;
1704 }
1705 return 0;
1706}
1707
1708/* FIXME: these need to become per-sb once we support different blocksizes */
1709static u64 __logfs_step[] = {
1710 1,
1711 I1_BLOCKS,
1712 I2_BLOCKS,
1713 I3_BLOCKS,
1714};
1715
1716static u64 __logfs_start_index[] = {
1717 I0_BLOCKS,
1718 I1_BLOCKS,
1719 I2_BLOCKS,
1720 I3_BLOCKS
1721};
1722
1723static inline u64 logfs_step(level_t level)
1724{
1725 return __logfs_step[(__force u8)level];
1726}
1727
1728static inline u64 logfs_factor(u8 level)
1729{
1730 return __logfs_step[level] * LOGFS_BLOCKSIZE;
1731}
1732
1733static inline u64 logfs_start_index(level_t level)
1734{
1735 return __logfs_start_index[(__force u8)level];
1736}
1737
1738static void logfs_unpack_raw_index(pgoff_t index, u64 *bix, level_t *level)
1739{
1740 logfs_unpack_index(index, bix, level);
1741 if (*bix <= logfs_start_index(SUBLEVEL(*level)))
1742 *bix = 0;
1743}
1744
1745static int __logfs_truncate_rec(struct inode *inode, struct page *ipage,
1746 struct write_control *this_wc, u64 size)
1747{
1748 int truncate_happened = 0;
1749 int e, err = 0;
1750 u64 bix, child_bix, next_bix;
1751 level_t level;
1752 struct page *page;
1753 struct write_control child_wc = { /* FIXME: flags */ };
1754
1755 logfs_unpack_raw_index(ipage->index, &bix, &level);
1756 err = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level);
1757 if (err)
1758 return err;
1759
1760 for (e = LOGFS_BLOCK_FACTOR - 1; e >= 0; e--) {
1761 child_bix = bix + e * logfs_step(SUBLEVEL(level));
1762 next_bix = child_bix + logfs_step(SUBLEVEL(level));
1763 if (size > next_bix * LOGFS_BLOCKSIZE)
1764 break;
1765
1766 child_wc.ofs = pure_ofs(block_get_pointer(ipage, e));
1767 if (!child_wc.ofs)
1768 continue;
1769
1770 page = logfs_get_write_page(inode, child_bix, SUBLEVEL(level));
1771 if (!page)
1772 return -ENOMEM;
1773
1774 if ((__force u8)level > 1)
1775 err = __logfs_truncate_rec(inode, page, &child_wc, size);
1776 else
1777 err = logfs_truncate_i0(inode, page, &child_wc, size);
1778 logfs_put_write_page(page);
1779 if (err)
1780 return err;
1781
1782 truncate_happened = 1;
1783 alloc_indirect_block(inode, ipage, 0);
1784 block_set_pointer(ipage, e, child_wc.ofs);
1785 }
1786
1787 if (!truncate_happened) {
1788 printk(KERN_WARNING "ineffectual truncate (%lx, %lx, %llx)\n", inode->i_ino, ipage->index, size);
1789 return 0;
1790 }
1791
1792 this_wc->flags = WF_DELETE;
1793 if (logfs_block(ipage)->partial)
1794 this_wc->flags |= WF_WRITE;
1795
1796 return logfs_write_i0(inode, ipage, this_wc);
1797}
1798
1799static int logfs_truncate_rec(struct inode *inode, u64 size)
1800{
1801 struct logfs_inode *li = logfs_inode(inode);
1802 struct write_control wc = {
1803 .ofs = li->li_data[INDIRECT_INDEX],
1804 };
1805 struct page *page;
1806 int err;
1807
1808 alloc_inode_block(inode);
1809
1810 if (!wc.ofs)
1811 return 0;
1812
1813 page = logfs_get_write_page(inode, 0, LEVEL(li->li_height));
1814 if (!page)
1815 return -ENOMEM;
1816
1817 err = __logfs_truncate_rec(inode, page, &wc, size);
1818 logfs_put_write_page(page);
1819 if (err)
1820 return err;
1821
1822 if (li->li_data[INDIRECT_INDEX] != wc.ofs)
1823 li->li_data[INDIRECT_INDEX] = wc.ofs;
1824 return 0;
1825}
1826
1827static int __logfs_truncate(struct inode *inode, u64 size)
1828{
1829 int ret;
1830
1831 if (size >= logfs_factor(logfs_inode(inode)->li_height))
1832 return 0;
1833
1834 ret = logfs_truncate_rec(inode, size);
1835 if (ret)
1836 return ret;
1837
1838 return logfs_truncate_direct(inode, size);
1839}
1840
1841/*
1842 * Truncate, by changing the segment file, can consume a fair amount
1843 * of resources. So back off from time to time and do some GC.
1844 * 8MB, or 2048 blocks, should be well within safety limits even if
1845 * every single block resided in a different segment.
1846 */
1847#define TRUNCATE_STEP (8 * 1024 * 1024)
1848int logfs_truncate(struct inode *inode, u64 target)
1849{
1850 struct super_block *sb = inode->i_sb;
1851 u64 size = i_size_read(inode);
1852 int err = 0;
1853
1854 size = ALIGN(size, TRUNCATE_STEP);
1855 while (size > target) {
1856 if (size > TRUNCATE_STEP)
1857 size -= TRUNCATE_STEP;
1858 else
1859 size = 0;
1860 if (size < target)
1861 size = target;
1862
1863 logfs_get_wblocks(sb, NULL, 1);
1864 err = __logfs_truncate(inode, size);
1865 if (!err)
1866 err = __logfs_write_inode(inode, 0);
1867 logfs_put_wblocks(sb, NULL, 1);
1868 }
1869
1870 if (!err)
1871 err = vmtruncate(inode, target);
1872
1873 /* I don't trust error recovery yet. */
1874 WARN_ON(err);
1875 return err;
1876}
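/*
 * A standalone model of the stepping above: truncation walks from the
 * aligned current size down to the target in TRUNCATE_STEP chunks,
 * dropping the write lock between steps so GC can run.  Sizes are
 * made up.
 */
#include <stdio.h>
#include <stdint.h>

#define STEP (8ULL * 1024 * 1024)
#define ALIGN_UP(x, a) (((x) + (a) - 1) / (a) * (a))

int main(void)
{
	uint64_t size = 20ULL * 1024 * 1024 + 123;	/* current i_size */
	uint64_t target = 1ULL * 1024 * 1024;		/* new size */

	size = ALIGN_UP(size, STEP);
	while (size > target) {
		size = size > STEP ? size - STEP : 0;
		if (size < target)
			size = target;
		/* __logfs_truncate(inode, size) would run here */
		printf("truncate down to %llu bytes\n",
				(unsigned long long)size);
	}
	return 0;
}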
1877
1878static void move_page_to_inode(struct inode *inode, struct page *page)
1879{
1880 struct logfs_inode *li = logfs_inode(inode);
1881 struct logfs_block *block = logfs_block(page);
1882
1883 if (!block)
1884 return;
1885
1886 log_blockmove("move_page_to_inode(%llx, %llx, %x)\n",
1887 block->ino, block->bix, block->level);
1888 BUG_ON(li->li_block);
1889 block->ops = &inode_block_ops;
1890 block->inode = inode;
1891 li->li_block = block;
1892
1893 block->page = NULL;
1894 page->private = 0;
1895 ClearPagePrivate(page);
1896}
1897
1898static void move_inode_to_page(struct page *page, struct inode *inode)
1899{
1900 struct logfs_inode *li = logfs_inode(inode);
1901 struct logfs_block *block = li->li_block;
1902
1903 if (!block)
1904 return;
1905
1906 log_blockmove("move_inode_to_page(%llx, %llx, %x)\n",
1907 block->ino, block->bix, block->level);
1908 BUG_ON(PagePrivate(page));
1909 block->ops = &indirect_block_ops;
1910 block->page = page;
1911 page->private = (unsigned long)block;
1912 SetPagePrivate(page);
1913
1914 block->inode = NULL;
1915 li->li_block = NULL;
1916}
1917
1918int logfs_read_inode(struct inode *inode)
1919{
1920 struct super_block *sb = inode->i_sb;
1921 struct logfs_super *super = logfs_super(sb);
1922 struct inode *master_inode = super->s_master_inode;
1923 struct page *page;
1924 struct logfs_disk_inode *di;
1925 u64 ino = inode->i_ino;
1926
1927 if (ino << sb->s_blocksize_bits > i_size_read(master_inode))
1928 return -ENODATA;
1929 if (!logfs_exist_block(master_inode, ino))
1930 return -ENODATA;
1931
1932 page = read_cache_page(master_inode->i_mapping, ino,
1933 (filler_t *)logfs_readpage, NULL);
1934 if (IS_ERR(page))
1935 return PTR_ERR(page);
1936
1937 di = kmap_atomic(page, KM_USER0);
1938 logfs_disk_to_inode(di, inode);
1939 kunmap_atomic(di, KM_USER0);
1940 move_page_to_inode(inode, page);
1941 page_cache_release(page);
1942 return 0;
1943}
1944
1945/* Caller must logfs_put_write_page(page); */
1946static struct page *inode_to_page(struct inode *inode)
1947{
1948 struct inode *master_inode = logfs_super(inode->i_sb)->s_master_inode;
1949 struct logfs_disk_inode *di;
1950 struct page *page;
1951
1952 BUG_ON(inode->i_ino == LOGFS_INO_MASTER);
1953
1954 page = logfs_get_write_page(master_inode, inode->i_ino, 0);
1955 if (!page)
1956 return NULL;
1957
1958 di = kmap_atomic(page, KM_USER0);
1959 logfs_inode_to_disk(inode, di);
1960 kunmap_atomic(di, KM_USER0);
1961 move_inode_to_page(page, inode);
1962 return page;
1963}
1964
1965/* Cheaper version of write_inode. All changes are concealed in
1966 * aliases, which are moved back. No write to the medium happens.
1967 */
1968void logfs_clear_inode(struct inode *inode)
1969{
1970 struct super_block *sb = inode->i_sb;
1971 struct logfs_inode *li = logfs_inode(inode);
1972 struct logfs_block *block = li->li_block;
1973 struct page *page;
1974
1975 /* Only deleted files may be dirty at this point */
1976 BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink);
1977 if (!block)
1978 return;
1979 if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) {
1980 block->ops->free_block(inode->i_sb, block);
1981 return;
1982 }
1983
1984 BUG_ON(inode->i_ino < LOGFS_RESERVED_INOS);
1985 page = inode_to_page(inode);
1986 BUG_ON(!page); /* FIXME: Use emergency page */
1987 logfs_put_write_page(page);
1988}
1989
1990static int do_write_inode(struct inode *inode)
1991{
1992 struct super_block *sb = inode->i_sb;
1993 struct inode *master_inode = logfs_super(sb)->s_master_inode;
1994 loff_t size = (inode->i_ino + 1) << inode->i_sb->s_blocksize_bits;
1995 struct page *page;
1996 int err;
1997
1998 BUG_ON(inode->i_ino == LOGFS_INO_MASTER);
1999 /* FIXME: lock inode */
2000
2001 if (i_size_read(master_inode) < size)
2002 i_size_write(master_inode, size);
2003
2004 /* TODO: Tell vfs this inode is clean now */
2005
2006 page = inode_to_page(inode);
2007 if (!page)
2008 return -ENOMEM;
2009
2010 /* FIXME: transaction is part of logfs_block now. Is that enough? */
2011 err = logfs_write_buf(master_inode, page, 0);
2012 logfs_put_write_page(page);
2013 return err;
2014}
2015
2016static void logfs_mod_segment_entry(struct super_block *sb, u32 segno,
2017 int write,
2018 void (*change_se)(struct logfs_segment_entry *, long),
2019 long arg)
2020{
2021 struct logfs_super *super = logfs_super(sb);
2022 struct inode *inode;
2023 struct page *page;
2024 struct logfs_segment_entry *se;
2025 pgoff_t page_no;
2026 int child_no;
2027
2028 page_no = segno >> (sb->s_blocksize_bits - 3);
2029 child_no = segno & ((sb->s_blocksize >> 3) - 1);
2030
2031 inode = super->s_segfile_inode;
2032 page = logfs_get_write_page(inode, page_no, 0);
2033 BUG_ON(!page); /* FIXME: We need some reserve page for this case */
2034 if (!PageUptodate(page))
2035 logfs_read_block(inode, page, WRITE);
2036
2037 if (write)
2038 alloc_indirect_block(inode, page, 0);
2039 se = kmap_atomic(page, KM_USER0);
2040 change_se(se + child_no, arg);
2041 if (write) {
2042 logfs_set_alias(sb, logfs_block(page), child_no);
2043 BUG_ON((int)be32_to_cpu(se[child_no].valid) > super->s_segsize);
2044 }
2045 kunmap_atomic(se, KM_USER0);
2046
2047 logfs_put_write_page(page);
2048}
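/*
 * The index math above, stand-alone: each struct logfs_segment_entry is
 * 8 bytes, so a block of 1 << blocksize_bits bytes holds blocksize / 8
 * entries, and a segment number splits into a segfile page number and a
 * slot within that page.  4KiB blocks are assumed.
 */
#include <stdio.h>
#include <stdint.h>

int main(void)
{
	unsigned blocksize_bits = 12;	/* assumed 4KiB blocksize */
	uint32_t segno = 1234;
	uint64_t page_no = segno >> (blocksize_bits - 3);
	unsigned child_no = segno & ((1u << (blocksize_bits - 3)) - 1);

	/* 512 entries per block: 1234 == 2 * 512 + 210 */
	printf("segno %u -> page %llu, slot %u\n", segno,
			(unsigned long long)page_no, child_no);
	return 0;
}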
2049
2050static void __get_segment_entry(struct logfs_segment_entry *se, long _target)
2051{
2052 struct logfs_segment_entry *target = (void *)_target;
2053
2054 *target = *se;
2055}
2056
2057void logfs_get_segment_entry(struct super_block *sb, u32 segno,
2058 struct logfs_segment_entry *se)
2059{
2060 logfs_mod_segment_entry(sb, segno, 0, __get_segment_entry, (long)se);
2061}
2062
2063static void __set_segment_used(struct logfs_segment_entry *se, long increment)
2064{
2065 u32 valid;
2066
2067 valid = be32_to_cpu(se->valid);
2068 valid += increment;
2069 se->valid = cpu_to_be32(valid);
2070}
2071
2072void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment)
2073{
2074 struct logfs_super *super = logfs_super(sb);
2075 u32 segno = ofs >> super->s_segshift;
2076
2077 if (!increment)
2078 return;
2079
2080 logfs_mod_segment_entry(sb, segno, 1, __set_segment_used, increment);
2081}
2082
2083static void __set_segment_erased(struct logfs_segment_entry *se, long ec_level)
2084{
2085 se->ec_level = cpu_to_be32(ec_level);
2086}
2087
2088void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec,
2089 gc_level_t gc_level)
2090{
2091 u32 ec_level = ec << 4 | (__force u8)gc_level;
2092
2093 logfs_mod_segment_entry(sb, segno, 1, __set_segment_erased, ec_level);
2094}
2095
2096static void __set_segment_reserved(struct logfs_segment_entry *se, long ignore)
2097{
2098 se->valid = cpu_to_be32(RESERVED);
2099}
2100
2101void logfs_set_segment_reserved(struct super_block *sb, u32 segno)
2102{
2103 logfs_mod_segment_entry(sb, segno, 1, __set_segment_reserved, 0);
2104}
2105
2106static void __set_segment_unreserved(struct logfs_segment_entry *se,
2107 long ec_level)
2108{
2109 se->valid = 0;
2110 se->ec_level = cpu_to_be32(ec_level);
2111}
2112
2113void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec)
2114{
2115 u32 ec_level = ec << 4;
2116
2117 logfs_mod_segment_entry(sb, segno, 1, __set_segment_unreserved,
2118 ec_level);
2119}
2120
2121int __logfs_write_inode(struct inode *inode, long flags)
2122{
2123 struct super_block *sb = inode->i_sb;
2124 int ret;
2125
2126 logfs_get_wblocks(sb, NULL, flags & WF_LOCK);
2127 ret = do_write_inode(inode);
2128 logfs_put_wblocks(sb, NULL, flags & WF_LOCK);
2129 return ret;
2130}
2131
2132static int do_delete_inode(struct inode *inode)
2133{
2134 struct super_block *sb = inode->i_sb;
2135 struct inode *master_inode = logfs_super(sb)->s_master_inode;
2136 struct page *page;
2137 int ret;
2138
2139 page = logfs_get_write_page(master_inode, inode->i_ino, 0);
2140 if (!page)
2141 return -ENOMEM;
2142
2143 move_inode_to_page(page, inode);
2144
2145 logfs_get_wblocks(sb, page, 1);
2146 ret = __logfs_delete(master_inode, page);
2147 logfs_put_wblocks(sb, page, 1);
2148
2149 logfs_put_write_page(page);
2150 return ret;
2151}
2152
2153/*
2154 * ZOMBIE inodes have already been deleted before and would remain dead,
2155 * were it not for the validity checking. No need to kill them again here.
2156 */
2157void logfs_delete_inode(struct inode *inode)
2158{
2159 struct logfs_inode *li = logfs_inode(inode);
2160
2161 if (!(li->li_flags & LOGFS_IF_ZOMBIE)) {
2162 li->li_flags |= LOGFS_IF_ZOMBIE;
2163 if (i_size_read(inode) > 0)
2164 logfs_truncate(inode, 0);
2165 do_delete_inode(inode);
2166 }
2167 truncate_inode_pages(&inode->i_data, 0);
2168 clear_inode(inode);
2169}
2170
2171void btree_write_block(struct logfs_block *block)
2172{
2173 struct inode *inode;
2174 struct page *page;
2175 int err, cookie;
2176
2177 inode = logfs_safe_iget(block->sb, block->ino, &cookie);
2178 page = logfs_get_write_page(inode, block->bix, block->level);
2179
2180 err = logfs_readpage_nolock(page);
2181 BUG_ON(err);
2182 BUG_ON(!PagePrivate(page));
2183 BUG_ON(logfs_block(page) != block);
2184 err = __logfs_write_buf(inode, page, 0);
2185 BUG_ON(err);
2186 BUG_ON(PagePrivate(page) || page->private);
2187
2188 logfs_put_write_page(page);
2189 logfs_safe_iput(inode, cookie);
2190}
2191
2192/**
2193 * logfs_inode_write - write inode or dentry objects
2194 *
2195 * @inode: parent inode (ifile or directory)
2196 * @buf: object to write (inode or dentry)
2197 * @count: object size
2198 * @bix: object number (file position in blocks/objects)
2199 * @flags: write flags
2200 * (the write lock is taken iff @flags contains WF_LOCK)
2201 * @shadow_tree: shadow below this inode
2202 *
2203 * FIXME: All callers of this put a 200-300 byte variable on the stack,
2204 * only to call here and do a memcpy from that stack variable. A good
2205 * example of wasted performance and stack space.
2206 */
2207int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
2208 loff_t bix, long flags, struct shadow_tree *shadow_tree)
2209{
2210 loff_t pos = bix << inode->i_sb->s_blocksize_bits;
2211 int err;
2212 struct page *page;
2213 void *pagebuf;
2214
2215 BUG_ON(pos & (LOGFS_BLOCKSIZE-1));
2216 BUG_ON(count > LOGFS_BLOCKSIZE);
2217 page = logfs_get_write_page(inode, bix, 0);
2218 if (!page)
2219 return -ENOMEM;
2220
2221 pagebuf = kmap_atomic(page, KM_USER0);
2222 memcpy(pagebuf, buf, count);
2223 flush_dcache_page(page);
2224 kunmap_atomic(pagebuf, KM_USER0);
2225
2226 if (i_size_read(inode) < pos + LOGFS_BLOCKSIZE)
2227 i_size_write(inode, pos + LOGFS_BLOCKSIZE);
2228
2229 err = logfs_write_buf(inode, page, flags);
2230 logfs_put_write_page(page);
2231 return err;
2232}
2233
2234int logfs_open_segfile(struct super_block *sb)
2235{
2236 struct logfs_super *super = logfs_super(sb);
2237 struct inode *inode;
2238
2239 inode = logfs_read_meta_inode(sb, LOGFS_INO_SEGFILE);
2240 if (IS_ERR(inode))
2241 return PTR_ERR(inode);
2242 super->s_segfile_inode = inode;
2243 return 0;
2244}
2245
2246int logfs_init_rw(struct super_block *sb)
2247{
2248 struct logfs_super *super = logfs_super(sb);
2249 int min_fill = 3 * super->s_no_blocks;
2250
2251 INIT_LIST_HEAD(&super->s_object_alias);
2252 mutex_init(&super->s_write_mutex);
2253 super->s_block_pool = mempool_create_kmalloc_pool(min_fill,
2254 sizeof(struct logfs_block));
2255 super->s_shadow_pool = mempool_create_kmalloc_pool(min_fill,
2256 sizeof(struct logfs_shadow));
2257 return 0;
2258}
2259
2260void logfs_cleanup_rw(struct super_block *sb)
2261{
2262 struct logfs_super *super = logfs_super(sb);
2263
2264 destroy_meta_inode(super->s_segfile_inode);
2265 logfs_mempool_destroy(super->s_block_pool);
2266 logfs_mempool_destroy(super->s_shadow_pool);
2267}
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
new file mode 100644
index 000000000000..f77ce2b470ba
--- /dev/null
+++ b/fs/logfs/segment.c
@@ -0,0 +1,930 @@
1/*
2 * fs/logfs/segment.c - Handling the Object Store
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Object store or ostore makes up the complete device with exception of
9 * the superblock and journal areas. Apart from its own metadata it stores
10 * three kinds of objects: inodes, dentries and blocks, both data and indirect.
11 */
12#include "logfs.h"
13#include <linux/slab.h>
14
15static int logfs_mark_segment_bad(struct super_block *sb, u32 segno)
16{
17 struct logfs_super *super = logfs_super(sb);
18 struct btree_head32 *head = &super->s_reserved_segments;
19 int err;
20
21 err = btree_insert32(head, segno, (void *)1, GFP_NOFS);
22 if (err)
23 return err;
24 logfs_super(sb)->s_bad_segments++;
25 /* FIXME: write to journal */
26 return 0;
27}
28
29int logfs_erase_segment(struct super_block *sb, u32 segno, int ensure_erase)
30{
31 struct logfs_super *super = logfs_super(sb);
32
33 super->s_gec++;
34
35 return super->s_devops->erase(sb, (u64)segno << super->s_segshift,
36 super->s_segsize, ensure_erase);
37}
38
39static s64 logfs_get_free_bytes(struct logfs_area *area, size_t bytes)
40{
41 s32 ofs;
42
43 logfs_open_area(area, bytes);
44
45 ofs = area->a_used_bytes;
46 area->a_used_bytes += bytes;
47 BUG_ON(area->a_used_bytes >= logfs_super(area->a_sb)->s_segsize);
48
49 return dev_ofs(area->a_sb, area->a_segno, ofs);
50}
51
52static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
53 int use_filler)
54{
55 struct logfs_super *super = logfs_super(sb);
56 struct address_space *mapping = super->s_mapping_inode->i_mapping;
57 filler_t *filler = super->s_devops->readpage;
58 struct page *page;
59
60 BUG_ON(mapping_gfp_mask(mapping) & __GFP_FS);
61 if (use_filler)
62 page = read_cache_page(mapping, index, filler, sb);
63 else {
64 page = find_or_create_page(mapping, index, GFP_NOFS);
65 if (page) unlock_page(page);
66 }
67 return page;
68}
69
70void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
71 int use_filler)
72{
73 pgoff_t index = ofs >> PAGE_SHIFT;
74 struct page *page;
75 long offset = ofs & (PAGE_SIZE-1);
76 long copylen;
77
78 /* Only logfs_wbuf_recover may use len==0 */
79 BUG_ON(!len && !use_filler);
80 do {
81 copylen = min((ulong)len, PAGE_SIZE - offset);
82
83 page = get_mapping_page(area->a_sb, index, use_filler);
84 BUG_ON(!page); /* FIXME: reserve a pool */
85 SetPageUptodate(page);
86 memcpy(page_address(page) + offset, buf, copylen);
87 SetPagePrivate(page);
88 page_cache_release(page);
89
90 buf += copylen;
91 len -= copylen;
92 offset = 0;
93 index++;
94 } while (len);
95}
96
97static void pad_partial_page(struct logfs_area *area)
98{
99 struct super_block *sb = area->a_sb;
100 struct page *page;
101 u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
102 pgoff_t index = ofs >> PAGE_SHIFT;
103 long offset = ofs & (PAGE_SIZE-1);
104 u32 len = PAGE_SIZE - offset;
105
106 if (len % PAGE_SIZE) {
107 page = get_mapping_page(sb, index, 0);
108 BUG_ON(!page); /* FIXME: reserve a pool */
109 memset(page_address(page) + offset, 0xff, len);
110 SetPagePrivate(page);
111 page_cache_release(page);
112 }
113}
114
115static void pad_full_pages(struct logfs_area *area)
116{
117 struct super_block *sb = area->a_sb;
118 struct logfs_super *super = logfs_super(sb);
119 u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
120 u32 len = super->s_segsize - area->a_used_bytes;
121 pgoff_t index = PAGE_CACHE_ALIGN(ofs) >> PAGE_CACHE_SHIFT;
122 pgoff_t no_indizes = len >> PAGE_CACHE_SHIFT;
123 struct page *page;
124
125 while (no_indizes) {
126 page = get_mapping_page(sb, index, 0);
127 BUG_ON(!page); /* FIXME: reserve a pool */
128 SetPageUptodate(page);
129 memset(page_address(page), 0xff, PAGE_CACHE_SIZE);
130 SetPagePrivate(page);
131 page_cache_release(page);
132 index++;
133 no_indizes--;
134 }
135}
136
137/*
138 * bdev_writeseg will write full pages. Memset the tail to prevent data leaks.
139 * Also make sure we allocate (and memset) all pages for final writeout.
140 */
141static void pad_wbuf(struct logfs_area *area, int final)
142{
143 pad_partial_page(area);
144 if (final)
145 pad_full_pages(area);
146}
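/*
 * The padding arithmetic above, stand-alone: first the tail of the
 * current page is filled with 0xff up to the page boundary, then (on
 * final writeout) the remaining whole pages of the segment are filled
 * too.  Page and segment sizes are assumptions for illustration.
 */
#include <stdio.h>
#include <stdint.h>

#define PAGE_SZ 4096ULL

int main(void)
{
	uint64_t segsize = 128 * 1024;	/* assumed segment size */
	uint64_t used = 10000;		/* a_used_bytes mid-segment */
	uint64_t offset = used & (PAGE_SZ - 1);
	uint64_t tail = (PAGE_SZ - offset) % PAGE_SZ;
	uint64_t full_pages = (segsize - used - tail) / PAGE_SZ;

	printf("pad %llu bytes to finish the page, then %llu full pages\n",
			(unsigned long long)tail,
			(unsigned long long)full_pages);
	return 0;
}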
147
148/*
149 * We have to be careful with the alias tree. Since lookup is done by bix,
150 * the bix needs to be normalized, so that e.g. 14, 15 and 16 all map to the
151 * same indirect block. So always access the tree through these accessors.
152 */
153static void *alias_tree_lookup(struct super_block *sb, u64 ino, u64 bix,
154 level_t level)
155{
156 struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree;
157 pgoff_t index = logfs_pack_index(bix, level);
158
159 return btree_lookup128(head, ino, index);
160}
161
162static int alias_tree_insert(struct super_block *sb, u64 ino, u64 bix,
163 level_t level, void *val)
164{
165 struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree;
166 pgoff_t index = logfs_pack_index(bix, level);
167
168 return btree_insert128(head, ino, index, val, GFP_NOFS);
169}
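/*
 * logfs_pack_index() folds bix and level into a single pgoff_t, so the
 * btree is effectively keyed on the triple (ino, bix, level).  Assuming
 * the matching unpack helper from logfs.h, a round trip looks like:
 *
 *	index = logfs_pack_index(bix, level);
 *	logfs_unpack_index(index, &bix, &level);
 */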
170
171static int btree_write_alias(struct super_block *sb, struct logfs_block *block,
172 write_alias_t *write_one_alias)
173{
174 struct object_alias_item *item;
175 int err;
176
177 list_for_each_entry(item, &block->item_list, list) {
178 err = write_alias_journal(sb, block->ino, block->bix,
179 block->level, item->child_no, item->val);
180 if (err)
181 return err;
182 }
183 return 0;
184}
185
186static struct logfs_block_ops btree_block_ops = {
187 .write_block = btree_write_block,
188 .free_block = __free_block,
189 .write_alias = btree_write_alias,
190};
191
192int logfs_load_object_aliases(struct super_block *sb,
193 struct logfs_obj_alias *oa, int count)
194{
195 struct logfs_super *super = logfs_super(sb);
196 struct logfs_block *block;
197 struct object_alias_item *item;
198 u64 ino, bix;
199 level_t level;
200 int i, err;
201
202 super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS;
203 count /= sizeof(*oa);
204 for (i = 0; i < count; i++) {
205 item = mempool_alloc(super->s_alias_pool, GFP_NOFS);
206 if (!item)
207 return -ENOMEM;
208 memset(item, 0, sizeof(*item));
209
210 super->s_no_object_aliases++;
211 item->val = oa[i].val;
212 item->child_no = be16_to_cpu(oa[i].child_no);
213
214 ino = be64_to_cpu(oa[i].ino);
215 bix = be64_to_cpu(oa[i].bix);
216 level = LEVEL(oa[i].level);
217
218 log_aliases("logfs_load_object_aliases(%llx, %llx, %x, %x) %llx\n",
219 ino, bix, level, item->child_no,
220 be64_to_cpu(item->val));
221 block = alias_tree_lookup(sb, ino, bix, level);
222 if (!block) {
223 block = __alloc_block(sb, ino, bix, level);
224 block->ops = &btree_block_ops;
225 err = alias_tree_insert(sb, ino, bix, level, block);
226 BUG_ON(err); /* mempool empty */
227 }
228 if (test_and_set_bit(item->child_no, block->alias_map)) {
229 printk(KERN_ERR"LogFS: Alias collision detected\n");
230 return -EIO;
231 }
232 list_move_tail(&block->alias_list, &super->s_object_alias);
233 list_add(&item->list, &block->item_list);
234 }
235 return 0;
236}
237
238static void kill_alias(void *_block, unsigned long ignore0,
239 u64 ignore1, u64 ignore2, size_t ignore3)
240{
241 struct logfs_block *block = _block;
242 struct super_block *sb = block->sb;
243 struct logfs_super *super = logfs_super(sb);
244 struct object_alias_item *item;
245
246 while (!list_empty(&block->item_list)) {
247 item = list_entry(block->item_list.next, typeof(*item), list);
248 list_del(&item->list);
249 mempool_free(item, super->s_alias_pool);
250 }
251 block->ops->free_block(sb, block);
252}
253
254static int obj_type(struct inode *inode, level_t level)
255{
256 if (level == 0) {
257 if (S_ISDIR(inode->i_mode))
258 return OBJ_DENTRY;
259 if (inode->i_ino == LOGFS_INO_MASTER)
260 return OBJ_INODE;
261 }
262 return OBJ_BLOCK;
263}
264
265static int obj_len(struct super_block *sb, int obj_type)
266{
267 switch (obj_type) {
268 case OBJ_DENTRY:
269 return sizeof(struct logfs_disk_dentry);
270 case OBJ_INODE:
271 return sizeof(struct logfs_disk_inode);
272 case OBJ_BLOCK:
273 return sb->s_blocksize;
274 default:
275 BUG();
276 }
277}
278
279static int __logfs_segment_write(struct inode *inode, void *buf,
280 struct logfs_shadow *shadow, int type, int len, int compr)
281{
282 struct logfs_area *area;
283 struct super_block *sb = inode->i_sb;
284 s64 ofs;
285 struct logfs_object_header h;
286 int acc_len;
287
288 if (shadow->gc_level == 0)
289 acc_len = len;
290 else
291 acc_len = obj_len(sb, type);
292
293 area = get_area(sb, shadow->gc_level);
294 ofs = logfs_get_free_bytes(area, len + LOGFS_OBJECT_HEADERSIZE);
295 LOGFS_BUG_ON(ofs <= 0, sb);
296 /*
297 * Order is important. logfs_get_free_bytes(), by modifying the
298 * segment file, may modify the content of the very page we're about
299 * to write now. Which is fine, as long as the calculated crc and
300 * written data still match. So do the modifications _before_
301 * calculating the crc.
302 */
303
304 h.len = cpu_to_be16(len);
305 h.type = type;
306 h.compr = compr;
307 h.ino = cpu_to_be64(inode->i_ino);
308 h.bix = cpu_to_be64(shadow->bix);
309 h.crc = logfs_crc32(&h, sizeof(h) - 4, 4);
310 h.data_crc = logfs_crc32(buf, len, 0);
311
312 logfs_buf_write(area, ofs, &h, sizeof(h));
313 logfs_buf_write(area, ofs + LOGFS_OBJECT_HEADERSIZE, buf, len);
314
315 shadow->new_ofs = ofs;
316 shadow->new_len = acc_len + LOGFS_OBJECT_HEADERSIZE;
317
318 return 0;
319}
320
321static s64 logfs_segment_write_compress(struct inode *inode, void *buf,
322 struct logfs_shadow *shadow, int type, int len)
323{
324 struct super_block *sb = inode->i_sb;
325 void *compressor_buf = logfs_super(sb)->s_compressed_je;
326 ssize_t compr_len;
327 int ret;
328
329 mutex_lock(&logfs_super(sb)->s_journal_mutex);
330 compr_len = logfs_compress(buf, compressor_buf, len, len);
331
332 if (compr_len >= 0) {
333 ret = __logfs_segment_write(inode, compressor_buf, shadow,
334 type, compr_len, COMPR_ZLIB);
335 } else {
336 ret = __logfs_segment_write(inode, buf, shadow, type, len,
337 COMPR_NONE);
338 }
339 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
340 return ret;
341}
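/*
 * s_compressed_je is a single per-filesystem scratch buffer, named for
 * its other user, the compressed journal entry path; s_journal_mutex
 * serializes the compression here and the decompression in
 * __logfs_segment_read() against that other user.
 */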
342
343/**
344 * logfs_segment_write - write data block to object store
345 * @inode: inode containing data
346 * @page: page containing the data block
347 * @shadow: shadow entry for the written block; returns an errno or zero
348 */
349int logfs_segment_write(struct inode *inode, struct page *page,
350 struct logfs_shadow *shadow)
351{
352 struct super_block *sb = inode->i_sb;
353 struct logfs_super *super = logfs_super(sb);
354 int do_compress, type, len;
355 int ret;
356 void *buf;
357
358 super->s_flags |= LOGFS_SB_FLAG_DIRTY;
359 BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
360 do_compress = logfs_inode(inode)->li_flags & LOGFS_IF_COMPRESSED;
361 if (shadow->gc_level != 0) {
362 /* temporarily disable compression for indirect blocks */
363 do_compress = 0;
364 }
365
366 type = obj_type(inode, shrink_level(shadow->gc_level));
367 len = obj_len(sb, type);
368 buf = kmap(page);
369 if (do_compress)
370 ret = logfs_segment_write_compress(inode, buf, shadow, type,
371 len);
372 else
373 ret = __logfs_segment_write(inode, buf, shadow, type, len,
374 COMPR_NONE);
375 kunmap(page);
376
377 log_segment("logfs_segment_write(%llx, %llx, %x) %llx->%llx %x->%x\n",
378 shadow->ino, shadow->bix, shadow->gc_level,
379 shadow->old_ofs, shadow->new_ofs,
380 shadow->old_len, shadow->new_len);
381 /* this BUG_ON did catch a locking bug. useful */
382 BUG_ON(!(shadow->new_ofs & (super->s_segsize - 1)));
383 return ret;
384}
385
386int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf)
387{
388 pgoff_t index = ofs >> PAGE_SHIFT;
389 struct page *page;
390 long offset = ofs & (PAGE_SIZE-1);
391 long copylen;
392
393 while (len) {
394 copylen = min((ulong)len, PAGE_SIZE - offset);
395
396 page = get_mapping_page(sb, index, 1);
397 if (IS_ERR(page))
398 return PTR_ERR(page);
399 memcpy(buf, page_address(page) + offset, copylen);
400 page_cache_release(page);
401
402 buf += copylen;
403 len -= copylen;
404 offset = 0;
405 index++;
406 }
407 return 0;
408}
409
410/*
411 * The "position" of an indirect block is ambiguous. It can be the position
412 * of any data block covered by that indirect block. So we need to
413 * normalize the positions through logfs_block_mask() before comparing.
414 */
415static int check_pos(struct super_block *sb, u64 pos1, u64 pos2, level_t level)
416{
417 return (pos1 & logfs_block_mask(sb, level)) !=
418 (pos2 & logfs_block_mask(sb, level));
419}
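/*
 * Example, assuming 4KiB blocks (LOGFS_BLOCK_FACTOR == 512): a level-1
 * indirect block covers 512 data blocks, so at that level
 * logfs_block_mask() clears the low 9 bix bits and positions 14, 15
 * and 16 all compare as equal - the normalization mentioned for the
 * alias tree above.
 */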
420
421#if 0
422static int read_seg_header(struct super_block *sb, u64 ofs,
423 struct logfs_segment_header *sh)
424{
425 __be32 crc;
426 int err;
427
428 err = wbuf_read(sb, ofs, sizeof(*sh), sh);
429 if (err)
430 return err;
431 crc = logfs_crc32(sh, sizeof(*sh), 4);
432 if (crc != sh->crc) {
433 printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, "
434 "got %x\n", ofs, be32_to_cpu(sh->crc),
435 be32_to_cpu(crc));
436 return -EIO;
437 }
438 return 0;
439}
440#endif
441
442static int read_obj_header(struct super_block *sb, u64 ofs,
443 struct logfs_object_header *oh)
444{
445 __be32 crc;
446 int err;
447
448 err = wbuf_read(sb, ofs, sizeof(*oh), oh);
449 if (err)
450 return err;
451 crc = logfs_crc32(oh, sizeof(*oh) - 4, 4);
452 if (crc != oh->crc) {
453 printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, "
454 "got %x\n", ofs, be32_to_cpu(oh->crc),
455 be32_to_cpu(crc));
456 return -EIO;
457 }
458 return 0;
459}
460
461static void move_btree_to_page(struct inode *inode, struct page *page,
462 __be64 *data)
463{
464 struct super_block *sb = inode->i_sb;
465 struct logfs_super *super = logfs_super(sb);
466 struct btree_head128 *head = &super->s_object_alias_tree;
467 struct logfs_block *block;
468 struct object_alias_item *item, *next;
469
470 if (!(super->s_flags & LOGFS_SB_FLAG_OBJ_ALIAS))
471 return;
472
473 block = btree_remove128(head, inode->i_ino, page->index);
474 if (!block)
475 return;
476
477 log_blockmove("move_btree_to_page(%llx, %llx, %x)\n",
478 block->ino, block->bix, block->level);
479 list_for_each_entry_safe(item, next, &block->item_list, list) {
480 data[item->child_no] = item->val;
481 list_del(&item->list);
482 mempool_free(item, super->s_alias_pool);
483 }
484 block->page = page;
485 SetPagePrivate(page);
486 page->private = (unsigned long)block;
487 block->ops = &indirect_block_ops;
488 initialize_block_counters(page, block, data, 0);
489}
490
491/*
492 * This silences a false, yet annoying gcc warning. I hate it when my editor
493 * jumps into bitops.h each time I recompile this file.
494 * TODO: Complain to gcc folks about this and upgrade compiler.
495 */
496static unsigned long fnb(const unsigned long *addr,
497 unsigned long size, unsigned long offset)
498{
499 return find_next_bit(addr, size, offset);
500}
501
502void move_page_to_btree(struct page *page)
503{
504 struct logfs_block *block = logfs_block(page);
505 struct super_block *sb = block->sb;
506 struct logfs_super *super = logfs_super(sb);
507 struct object_alias_item *item;
508 unsigned long pos;
509 __be64 *child;
510 int err;
511
512 if (super->s_flags & LOGFS_SB_FLAG_SHUTDOWN) {
513 block->ops->free_block(sb, block);
514 return;
515 }
516 log_blockmove("move_page_to_btree(%llx, %llx, %x)\n",
517 block->ino, block->bix, block->level);
518 super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS;
519
520 for (pos = 0; ; pos++) {
521 pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
522 if (pos >= LOGFS_BLOCK_FACTOR)
523 break;
524
525 item = mempool_alloc(super->s_alias_pool, GFP_NOFS);
526 BUG_ON(!item); /* mempool empty */
527 memset(item, 0, sizeof(*item));
528
529 child = kmap_atomic(page, KM_USER0);
530 item->val = child[pos];
531 kunmap_atomic(child, KM_USER0);
532 item->child_no = pos;
533 list_add(&item->list, &block->item_list);
534 }
535 block->page = NULL;
536 ClearPagePrivate(page);
537 page->private = 0;
538 block->ops = &btree_block_ops;
539 err = alias_tree_insert(block->sb, block->ino, block->bix, block->level,
540 block);
541 BUG_ON(err); /* mempool empty */
542 ClearPageUptodate(page);
543}
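/*
 * move_page_to_btree() is the inverse of move_btree_to_page() above:
 * when an indirect page has to give up its page cache backing, every
 * set bit in alias_map becomes an object_alias_item, so the pending
 * pointer updates survive in the btree until the next journal write.
 */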
544
545static int __logfs_segment_read(struct inode *inode, void *buf,
546 u64 ofs, u64 bix, level_t level)
547{
548 struct super_block *sb = inode->i_sb;
549 void *compressor_buf = logfs_super(sb)->s_compressed_je;
550 struct logfs_object_header oh;
551 __be32 crc;
552 u16 len;
553 int err, block_len;
554
555 block_len = obj_len(sb, obj_type(inode, level));
556 err = read_obj_header(sb, ofs, &oh);
557 if (err)
558 goto out_err;
559
560 err = -EIO;
561 if (be64_to_cpu(oh.ino) != inode->i_ino
562 || check_pos(sb, be64_to_cpu(oh.bix), bix, level)) {
563 printk(KERN_ERR"LOGFS: (ino, bix) don't match at %llx: "
564 "expected (%lx, %llx), got (%llx, %llx)\n",
565 ofs, inode->i_ino, bix,
566 be64_to_cpu(oh.ino), be64_to_cpu(oh.bix));
567 goto out_err;
568 }
569
570 len = be16_to_cpu(oh.len);
571
572 switch (oh.compr) {
573 case COMPR_NONE:
574 err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, buf);
575 if (err)
576 goto out_err;
577 crc = logfs_crc32(buf, len, 0);
578 if (crc != oh.data_crc) {
579 printk(KERN_ERR"LOGFS: uncompressed data crc error at "
580 "%llx: expected %x, got %x\n", ofs,
581 be32_to_cpu(oh.data_crc),
582 be32_to_cpu(crc));
583 goto out_err;
584 }
585 break;
586 case COMPR_ZLIB:
587 mutex_lock(&logfs_super(sb)->s_journal_mutex);
588 err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len,
589 compressor_buf);
590 if (err) {
591 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
592 goto out_err;
593 }
594 crc = logfs_crc32(compressor_buf, len, 0);
595 if (crc != oh.data_crc) {
596 printk(KERN_ERR"LOGFS: compressed data crc error at "
597 "%llx: expected %x, got %x\n", ofs,
598 be32_to_cpu(oh.data_crc),
599 be32_to_cpu(crc));
600 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
601 goto out_err;
602 }
603 err = logfs_uncompress(compressor_buf, buf, len, block_len);
604 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
605 if (err) {
606 printk(KERN_ERR"LOGFS: uncompress error at %llx\n", ofs);
607 goto out_err;
608 }
609 break;
610 default:
611 LOGFS_BUG(sb);
612 err = -EIO;
613 goto out_err;
614 }
615 return 0;
616
617out_err:
618 logfs_set_ro(sb);
619 printk(KERN_ERR"LOGFS: device is read-only now\n");
620 LOGFS_BUG(sb);
621 return err;
622}
623
624/**
625 * logfs_segment_read - read data block from object store
626 * @inode: inode containing data
627 * @page: data page
628 * @ofs: physical data offset
629 * @bix: block index
630 * @level: block level
631 *
632 * Returns 0 on success or a negative errno.
633 */
634int logfs_segment_read(struct inode *inode, struct page *page,
635 u64 ofs, u64 bix, level_t level)
636{
637 int err;
638 void *buf;
639
640 if (PageUptodate(page))
641 return 0;
642
643 ofs &= ~LOGFS_FULLY_POPULATED;
644
645 buf = kmap(page);
646 err = __logfs_segment_read(inode, buf, ofs, bix, level);
647 if (!err) {
648 move_btree_to_page(inode, page, buf);
649 SetPageUptodate(page);
650 }
651 kunmap(page);
652 log_segment("logfs_segment_read(%lx, %llx, %x) %llx (%d)\n",
653 inode->i_ino, bix, level, ofs, err);
654 return err;
655}
656
657int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow)
658{
659 struct super_block *sb = inode->i_sb;
660 struct logfs_super *super = logfs_super(sb);
661 struct logfs_object_header h;
662 u16 len;
663 int err;
664
665 super->s_flags |= LOGFS_SB_FLAG_DIRTY;
666 BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
667 BUG_ON(shadow->old_ofs & LOGFS_FULLY_POPULATED);
668 if (!shadow->old_ofs)
669 return 0;
670
671 log_segment("logfs_segment_delete(%llx, %llx, %x) %llx->%llx %x->%x\n",
672 shadow->ino, shadow->bix, shadow->gc_level,
673 shadow->old_ofs, shadow->new_ofs,
674 shadow->old_len, shadow->new_len);
675 err = read_obj_header(sb, shadow->old_ofs, &h);
676 LOGFS_BUG_ON(err, sb);
677 LOGFS_BUG_ON(be64_to_cpu(h.ino) != inode->i_ino, sb);
678 LOGFS_BUG_ON(check_pos(sb, shadow->bix, be64_to_cpu(h.bix),
679 shrink_level(shadow->gc_level)), sb);
680
681 if (shadow->gc_level == 0)
682 len = be16_to_cpu(h.len);
683 else
684 len = obj_len(sb, h.type);
685 shadow->old_len = len + sizeof(h);
686 return 0;
687}
688
689void freeseg(struct super_block *sb, u32 segno)
690{
691 struct logfs_super *super = logfs_super(sb);
692 struct address_space *mapping = super->s_mapping_inode->i_mapping;
693 struct page *page;
694 u64 ofs, start, end;
695
696 start = dev_ofs(sb, segno, 0);
697 end = dev_ofs(sb, segno + 1, 0);
698 for (ofs = start; ofs < end; ofs += PAGE_SIZE) {
699 page = find_get_page(mapping, ofs >> PAGE_SHIFT);
700 if (!page)
701 continue;
702 ClearPagePrivate(page);
703 page_cache_release(page);
704 }
705}
706
707int logfs_open_area(struct logfs_area *area, size_t bytes)
708{
709 struct super_block *sb = area->a_sb;
710 struct logfs_super *super = logfs_super(sb);
711 int err, closed = 0;
712
713 if (area->a_is_open && area->a_used_bytes + bytes <= super->s_segsize)
714 return 0;
715
716 if (area->a_is_open) {
717 u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
718 u32 len = super->s_segsize - area->a_written_bytes;
719
720 log_gc("logfs_close_area(%x)\n", area->a_segno);
721 pad_wbuf(area, 1);
722 super->s_devops->writeseg(area->a_sb, ofs, len);
723 freeseg(sb, area->a_segno);
724 closed = 1;
725 }
726
727 area->a_used_bytes = 0;
728 area->a_written_bytes = 0;
729again:
730 area->a_ops->get_free_segment(area);
731 area->a_ops->get_erase_count(area);
732
733 log_gc("logfs_open_area(%x, %x)\n", area->a_segno, area->a_level);
734 err = area->a_ops->erase_segment(area);
735 if (err) {
736 printk(KERN_WARNING "LogFS: Error erasing segment %x\n",
737 area->a_segno);
738 logfs_mark_segment_bad(sb, area->a_segno);
739 goto again;
740 }
741 area->a_is_open = 1;
742 return closed;
743}
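/*
 * Returns 1 if a previously open segment had to be closed to make room
 * for the requested bytes, 0 if the area could be used as-is or was
 * not open to begin with.
 */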
744
745void logfs_sync_area(struct logfs_area *area)
746{
747 struct super_block *sb = area->a_sb;
748 struct logfs_super *super = logfs_super(sb);
749 u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
750 u32 len = (area->a_used_bytes - area->a_written_bytes);
751
752 if (super->s_writesize)
753 len &= ~(super->s_writesize - 1);
754 if (len == 0)
755 return;
756 pad_wbuf(area, 0);
757 super->s_devops->writeseg(sb, ofs, len);
758 area->a_written_bytes += len;
759}
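/*
 * Flash is typically writable only in multiples of s_writesize (e.g.
 * the NAND page size).  With s_writesize == 0x800 and 0x1234 unsynced
 * bytes, len is rounded down to 0x1000 and the remaining 0x234 bytes
 * stay buffered until more data arrives or the area is closed.
 */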
760
761void logfs_sync_segments(struct super_block *sb)
762{
763 struct logfs_super *super = logfs_super(sb);
764 int i;
765
766 for_each_area(i)
767 logfs_sync_area(super->s_area[i]);
768}
769
770/*
771 * Pick a free segment to be used for this area. Effectively takes the
772 * best candidate from the free list; once picked, it is a candidate no more.
773 */
774static void ostore_get_free_segment(struct logfs_area *area)
775{
776 struct super_block *sb = area->a_sb;
777 struct logfs_super *super = logfs_super(sb);
778
779 if (super->s_free_list.count == 0) {
780 printk(KERN_ERR"LOGFS: ran out of free segments\n");
781 LOGFS_BUG(sb);
782 }
783
784 area->a_segno = get_best_cand(sb, &super->s_free_list, NULL);
785}
786
787static void ostore_get_erase_count(struct logfs_area *area)
788{
789 struct logfs_segment_entry se;
790 u32 ec_level;
791
792 logfs_get_segment_entry(area->a_sb, area->a_segno, &se);
793 BUG_ON(se.ec_level == cpu_to_be32(BADSEG) ||
794 se.valid == cpu_to_be32(RESERVED));
795
796 ec_level = be32_to_cpu(se.ec_level);
797 area->a_erase_count = (ec_level >> 4) + 1;
798}
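/*
 * se.ec_level packs two values into one be32: the area level in the
 * low 4 bits and the erase count above it.  So (ec_level >> 4)
 * recovers the count, and the +1 accounts for the erase this segment
 * is about to receive.
 */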
799
800static int ostore_erase_segment(struct logfs_area *area)
801{
802 struct super_block *sb = area->a_sb;
803 struct logfs_segment_header sh;
804 u64 ofs;
805 int err;
806
807 err = logfs_erase_segment(sb, area->a_segno, 0);
808 if (err)
809 return err;
810
811 sh.pad = 0;
812 sh.type = SEG_OSTORE;
813 sh.level = (__force u8)area->a_level;
814 sh.segno = cpu_to_be32(area->a_segno);
815 sh.ec = cpu_to_be32(area->a_erase_count);
816 sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
817 sh.crc = logfs_crc32(&sh, sizeof(sh), 4);
818
819 logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count,
820 area->a_level);
821
822 ofs = dev_ofs(sb, area->a_segno, 0);
823 area->a_used_bytes = sizeof(sh);
824 logfs_buf_write(area, ofs, &sh, sizeof(sh));
825 return 0;
826}
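/*
 * The crc above covers the header from byte 4 on, i.e. everything but
 * the crc field itself.  Also note that a_used_bytes is set before
 * logfs_buf_write(), so the header consumes segment space like any
 * other object.
 */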
827
828static const struct logfs_area_ops ostore_area_ops = {
829 .get_free_segment = ostore_get_free_segment,
830 .get_erase_count = ostore_get_erase_count,
831 .erase_segment = ostore_erase_segment,
832};
833
834static void free_area(struct logfs_area *area)
835{
836 if (area)
837 freeseg(area->a_sb, area->a_segno);
838 kfree(area);
839}
840
841static struct logfs_area *alloc_area(struct super_block *sb)
842{
843 struct logfs_area *area;
844
845 area = kzalloc(sizeof(*area), GFP_KERNEL);
846 if (!area)
847 return NULL;
848
849 area->a_sb = sb;
850 return area;
851}
852
853static void map_invalidatepage(struct page *page, unsigned long l)
854{
855 BUG();
856}
857
858static int map_releasepage(struct page *page, gfp_t g)
859{
860 /* Don't release these pages */
861 return 0;
862}
863
864static const struct address_space_operations mapping_aops = {
865 .invalidatepage = map_invalidatepage,
866 .releasepage = map_releasepage,
867 .set_page_dirty = __set_page_dirty_nobuffers,
868};
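/*
 * Pages of the mapping inode mirror the device and are flagged private
 * once they contain buffered writes (see __logfs_buf_write()), so the
 * VM must neither invalidate nor release them behind our back - hence
 * the BUG() and the unconditional 0 above.
 */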
869
870int logfs_init_mapping(struct super_block *sb)
871{
872 struct logfs_super *super = logfs_super(sb);
873 struct address_space *mapping;
874 struct inode *inode;
875
876 inode = logfs_new_meta_inode(sb, LOGFS_INO_MAPPING);
877 if (IS_ERR(inode))
878 return PTR_ERR(inode);
879 super->s_mapping_inode = inode;
880 mapping = inode->i_mapping;
881 mapping->a_ops = &mapping_aops;
882 /* Would it be possible to use __GFP_HIGHMEM as well? */
883 mapping_set_gfp_mask(mapping, GFP_NOFS);
884 return 0;
885}
886
887int logfs_init_areas(struct super_block *sb)
888{
889 struct logfs_super *super = logfs_super(sb);
890 int i = -1;
891
892 super->s_alias_pool = mempool_create_kmalloc_pool(600,
893 sizeof(struct object_alias_item));
894 if (!super->s_alias_pool)
895 return -ENOMEM;
896
897 super->s_journal_area = alloc_area(sb);
898 if (!super->s_journal_area)
899 goto err;
900
901 for_each_area(i) {
902 super->s_area[i] = alloc_area(sb);
903 if (!super->s_area[i])
904 goto err;
905 super->s_area[i]->a_level = GC_LEVEL(i);
906 super->s_area[i]->a_ops = &ostore_area_ops;
907 }
908 btree_init_mempool128(&super->s_object_alias_tree,
909 super->s_btree_pool);
910 return 0;
911
912err:
913 for (i--; i >= 0; i--)
914 free_area(super->s_area[i]);
915 free_area(super->s_journal_area);
916 logfs_mempool_destroy(super->s_alias_pool);
917 return -ENOMEM;
918}
919
920void logfs_cleanup_areas(struct super_block *sb)
921{
922 struct logfs_super *super = logfs_super(sb);
923 int i;
924
925 btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias);
926 for_each_area(i)
927 free_area(super->s_area[i]);
928 free_area(super->s_journal_area);
929 destroy_meta_inode(super->s_mapping_inode);
930}
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
new file mode 100644
index 000000000000..d7c23ed8349a
--- /dev/null
+++ b/fs/logfs/super.c
@@ -0,0 +1,657 @@
1/*
2 * fs/logfs/super.c
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Generally contains mount/umount code and also serves as a dump area for
9 * any functions that don't fit elsewhere and don't justify a file of
10 * their own.
11 */
12#include "logfs.h"
13#include <linux/bio.h>
14#include <linux/slab.h>
15#include <linux/blkdev.h>
16#include <linux/mtd/mtd.h>
17#include <linux/statfs.h>
18#include <linux/buffer_head.h>
19
20static DEFINE_MUTEX(emergency_mutex);
21static struct page *emergency_page;
22
23struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index)
24{
25 filler_t *filler = (filler_t *)mapping->a_ops->readpage;
26 struct page *page;
27 int err;
28
29 page = read_cache_page(mapping, index, filler, NULL);
30 if (!IS_ERR(page))
31 return page;
32
33 /* No more pages available, switch to emergency page */
34 printk(KERN_INFO"Logfs: Using emergency page\n");
35 mutex_lock(&emergency_mutex);
36 err = filler(NULL, emergency_page);
37 if (err) {
38 mutex_unlock(&emergency_mutex);
39 printk(KERN_EMERG"Logfs: Error reading emergency page\n");
40 return ERR_PTR(err);
41 }
42 return emergency_page;
43}
44
45void emergency_read_end(struct page *page)
46{
47 if (page == emergency_page)
48 mutex_unlock(&emergency_mutex);
49 else
50 page_cache_release(page);
51}
52
53static void dump_segfile(struct super_block *sb)
54{
55 struct logfs_super *super = logfs_super(sb);
56 struct logfs_segment_entry se;
57 u32 segno;
58
59 for (segno = 0; segno < super->s_no_segs; segno++) {
60 logfs_get_segment_entry(sb, segno, &se);
61 printk("%3x: %6x %8x", segno, be32_to_cpu(se.ec_level),
62 be32_to_cpu(se.valid));
63 if (++segno < super->s_no_segs) {
64 logfs_get_segment_entry(sb, segno, &se);
65 printk(" %6x %8x", be32_to_cpu(se.ec_level),
66 be32_to_cpu(se.valid));
67 }
68 if (++segno < super->s_no_segs) {
69 logfs_get_segment_entry(sb, segno, &se);
70 printk(" %6x %8x", be32_to_cpu(se.ec_level),
71 be32_to_cpu(se.valid));
72 }
73 if (++segno < super->s_no_segs) {
74 logfs_get_segment_entry(sb, segno, &se);
75 printk(" %6x %8x", be32_to_cpu(se.ec_level),
76 be32_to_cpu(se.valid));
77 }
78 printk("\n");
79 }
80}
81
82/*
83 * logfs_crash_dump - dump debug information to device
84 *
85 * The LogFS superblock only occupies part of a segment. The spare space is
86 * meant for crash debug data; for now this only dumps the segment file.
87 */
88void logfs_crash_dump(struct super_block *sb)
89{
90 dump_segfile(sb);
91}
92
93/*
94 * TODO: move to lib/string.c
95 */
96/**
97 * memchr_inv - Find a character in an area of memory.
98 * @s: The memory area
99 * @c: The byte to search for
100 * @n: The size of the area.
101 *
102 * returns the address of the first character other than @c, or %NULL
103 * if the whole buffer contains just @c.
104 */
105void *memchr_inv(const void *s, int c, size_t n)
106{
107 const unsigned char *p = s;
108 while (n-- != 0)
109 if ((unsigned char)c != *p++)
110 return (void *)(p - 1);
111
112 return NULL;
113}
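/*
 * Typical use, e.g. verifying that an erased flash region really
 * contains nothing but 0xff:
 *
 *	if (memchr_inv(buf, 0xff, len))
 *		return -EIO;
 */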
114
115/*
116 * FIXME: There should be a reserve for root, similar to ext2.
117 */
118int logfs_statfs(struct dentry *dentry, struct kstatfs *stats)
119{
120 struct super_block *sb = dentry->d_sb;
121 struct logfs_super *super = logfs_super(sb);
122
123 stats->f_type = LOGFS_MAGIC_U32;
124 stats->f_bsize = sb->s_blocksize;
125 stats->f_blocks = super->s_size >> LOGFS_BLOCK_BITS >> 3;
126 stats->f_bfree = super->s_free_bytes >> sb->s_blocksize_bits;
127 stats->f_bavail = super->s_free_bytes >> sb->s_blocksize_bits;
128 stats->f_files = 0;
129 stats->f_ffree = 0;
130 stats->f_namelen = LOGFS_MAX_NAMELEN;
131 return 0;
132}
133
134static int logfs_sb_set(struct super_block *sb, void *_super)
135{
136 struct logfs_super *super = _super;
137
138 sb->s_fs_info = super;
139 sb->s_mtd = super->s_mtd;
140 sb->s_bdev = super->s_bdev;
141 if (sb->s_bdev)
142 sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info;
143 if (sb->s_mtd)
144 sb->s_bdi = sb->s_mtd->backing_dev_info;
145 return 0;
146}
147
148static int logfs_sb_test(struct super_block *sb, void *_super)
149{
150 struct logfs_super *super = _super;
151 struct mtd_info *mtd = super->s_mtd;
152
153 if (mtd && sb->s_mtd == mtd)
154 return 1;
155 if (super->s_bdev && sb->s_bdev == super->s_bdev)
156 return 1;
157 return 0;
158}
159
160static void set_segment_header(struct logfs_segment_header *sh, u8 type,
161 u8 level, u32 segno, u32 ec)
162{
163 sh->pad = 0;
164 sh->type = type;
165 sh->level = level;
166 sh->segno = cpu_to_be32(segno);
167 sh->ec = cpu_to_be32(ec);
168 sh->gec = cpu_to_be64(segno);
169 sh->crc = logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4);
170}
171
172static void logfs_write_ds(struct super_block *sb, struct logfs_disk_super *ds,
173 u32 segno, u32 ec)
174{
175 struct logfs_super *super = logfs_super(sb);
176 struct logfs_segment_header *sh = &ds->ds_sh;
177 int i;
178
179 memset(ds, 0, sizeof(*ds));
180 set_segment_header(sh, SEG_SUPER, 0, segno, ec);
181
182 ds->ds_ifile_levels = super->s_ifile_levels;
183 ds->ds_iblock_levels = super->s_iblock_levels;
184 ds->ds_data_levels = super->s_data_levels; /* XXX: Remove */
185 ds->ds_segment_shift = super->s_segshift;
186 ds->ds_block_shift = sb->s_blocksize_bits;
187 ds->ds_write_shift = super->s_writeshift;
188 ds->ds_filesystem_size = cpu_to_be64(super->s_size);
189 ds->ds_segment_size = cpu_to_be32(super->s_segsize);
190 ds->ds_bad_seg_reserve = cpu_to_be32(super->s_bad_seg_reserve);
191 ds->ds_feature_incompat = cpu_to_be64(super->s_feature_incompat);
192 ds->ds_feature_ro_compat= cpu_to_be64(super->s_feature_ro_compat);
193 ds->ds_feature_compat = cpu_to_be64(super->s_feature_compat);
194 ds->ds_feature_flags = cpu_to_be64(super->s_feature_flags);
195 ds->ds_root_reserve = cpu_to_be64(super->s_root_reserve);
196 ds->ds_speed_reserve = cpu_to_be64(super->s_speed_reserve);
197 journal_for_each(i)
198 ds->ds_journal_seg[i] = cpu_to_be32(super->s_journal_seg[i]);
199 ds->ds_magic = cpu_to_be64(LOGFS_MAGIC);
200 ds->ds_crc = logfs_crc32(ds, sizeof(*ds),
201 LOGFS_SEGMENT_HEADERSIZE + 12);
202}
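/*
 * ds_crc covers everything from byte LOGFS_SEGMENT_HEADERSIZE + 12
 * onwards: the embedded segment header carries its own crc, and the 12
 * bytes following it (ds_magic plus ds_crc itself) are excluded, which
 * is exactly what logfs_check_ds() expects.
 */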
203
204static int write_one_sb(struct super_block *sb,
205 struct page *(*find_sb)(struct super_block *sb, u64 *ofs))
206{
207 struct logfs_super *super = logfs_super(sb);
208 struct logfs_disk_super *ds;
209 struct logfs_segment_entry se;
210 struct page *page;
211 u64 ofs;
212 u32 ec, segno;
213 int err;
214
215 page = find_sb(sb, &ofs);
216 if (!page)
217 return -EIO;
218 ds = page_address(page);
219 segno = seg_no(sb, ofs);
220 logfs_get_segment_entry(sb, segno, &se);
221 ec = be32_to_cpu(se.ec_level) >> 4;
222 ec++;
223 logfs_set_segment_erased(sb, segno, ec, 0);
224 logfs_write_ds(sb, ds, segno, ec);
225 err = super->s_devops->write_sb(sb, page);
226 page_cache_release(page);
227 return err;
228}
229
230int logfs_write_sb(struct super_block *sb)
231{
232 struct logfs_super *super = logfs_super(sb);
233 int err;
234
235 /* First superblock */
236 err = write_one_sb(sb, super->s_devops->find_first_sb);
237 if (err)
238 return err;
239
240 /* Last superblock */
241 err = write_one_sb(sb, super->s_devops->find_last_sb);
242 if (err)
243 return err;
244 return 0;
245}
246
247static int ds_cmp(const void *ds0, const void *ds1)
248{
249 size_t len = sizeof(struct logfs_disk_super);
250
251 /* We know the segment headers differ, so ignore them */
252 len -= LOGFS_SEGMENT_HEADERSIZE;
253 ds0 += LOGFS_SEGMENT_HEADERSIZE;
254 ds1 += LOGFS_SEGMENT_HEADERSIZE;
255 return memcmp(ds0, ds1, len);
256}
257
258static int logfs_recover_sb(struct super_block *sb)
259{
260 struct logfs_super *super = logfs_super(sb);
261 struct logfs_disk_super _ds0, *ds0 = &_ds0;
262 struct logfs_disk_super _ds1, *ds1 = &_ds1;
263 int err, valid0, valid1;
264
265 /* read first superblock */
266 err = wbuf_read(sb, super->s_sb_ofs[0], sizeof(*ds0), ds0);
267 if (err)
268 return err;
269 /* read last superblock */
270 err = wbuf_read(sb, super->s_sb_ofs[1], sizeof(*ds1), ds1);
271 if (err)
272 return err;
273 valid0 = logfs_check_ds(ds0) == 0;
274 valid1 = logfs_check_ds(ds1) == 0;
275
276 if (!valid0 && valid1) {
277 printk(KERN_INFO"First superblock is invalid - fixing.\n");
278 return write_one_sb(sb, super->s_devops->find_first_sb);
279 }
280 if (valid0 && !valid1) {
281 printk(KERN_INFO"Last superblock is invalid - fixing.\n");
282 return write_one_sb(sb, super->s_devops->find_last_sb);
283 }
284 if (valid0 && valid1 && ds_cmp(ds0, ds1)) {
285 printk(KERN_INFO"Superblocks don't match - fixing.\n");
286 return logfs_write_sb(sb);
287 }
288 /* If neither is valid now, something's wrong. Didn't we properly
289 * check them before?!? */
290 BUG_ON(!valid0 && !valid1);
291 return 0;
292}
293
294static int logfs_make_writeable(struct super_block *sb)
295{
296 int err;
297
298 err = logfs_open_segfile(sb);
299 if (err)
300 return err;
301
302 /* Repair any broken superblock copies */
303 err = logfs_recover_sb(sb);
304 if (err)
305 return err;
306
307 /* Check areas for trailing unaccounted data */
308 err = logfs_check_areas(sb);
309 if (err)
310 return err;
311
312 /* Do one GC pass before any data gets dirtied */
313 logfs_gc_pass(sb);
314
315 /* after all initializations are done, replay the journal
316 * for rw-mounts, if necessary */
317 err = logfs_replay_journal(sb);
318 if (err)
319 return err;
320
321 return 0;
322}
323
324static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
325{
326 struct logfs_super *super = logfs_super(sb);
327 struct inode *rootdir;
328 int err;
329
330 /* root dir */
331 rootdir = logfs_iget(sb, LOGFS_INO_ROOT);
332 if (IS_ERR(rootdir))
333 goto fail;
334
335 sb->s_root = d_alloc_root(rootdir);
336 if (!sb->s_root) {
337 iput(rootdir);
338 goto fail;
339 }
340
341 super->s_erase_page = alloc_pages(GFP_KERNEL, 0);
342 if (!super->s_erase_page)
343 goto fail;
344 memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE);
345
346 /* FIXME: check for read-only mounts */
347 err = logfs_make_writeable(sb);
348 if (err)
349 goto fail1;
350
351 log_super("LogFS: Finished mounting\n");
352 simple_set_mnt(mnt, sb);
353 return 0;
354
355fail1:
356 __free_page(super->s_erase_page);
357fail:
358 iput(logfs_super(sb)->s_master_inode);
359 return -EIO;
360}
361
362int logfs_check_ds(struct logfs_disk_super *ds)
363{
364 struct logfs_segment_header *sh = &ds->ds_sh;
365
366 if (ds->ds_magic != cpu_to_be64(LOGFS_MAGIC))
367 return -EINVAL;
368 if (sh->crc != logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4))
369 return -EINVAL;
370 if (ds->ds_crc != logfs_crc32(ds, sizeof(*ds),
371 LOGFS_SEGMENT_HEADERSIZE + 12))
372 return -EINVAL;
373 return 0;
374}
375
376static struct page *find_super_block(struct super_block *sb)
377{
378 struct logfs_super *super = logfs_super(sb);
379 struct page *first, *last;
380
381 first = super->s_devops->find_first_sb(sb, &super->s_sb_ofs[0]);
382 if (!first || IS_ERR(first))
383 return NULL;
384 last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]);
385 if (!last || IS_ERR(last)) {
386 page_cache_release(first);
387 return NULL;
388 }
389
390 if (!logfs_check_ds(page_address(first))) {
391 page_cache_release(last);
392 return first;
393 }
394
395 /* First one didn't work, try the second superblock */
396 if (!logfs_check_ds(page_address(last))) {
397 page_cache_release(first);
398 return last;
399 }
400
401 /* Neither worked, sorry folks */
402 page_cache_release(first);
403 page_cache_release(last);
404 return NULL;
405}
406
407static int __logfs_read_sb(struct super_block *sb)
408{
409 struct logfs_super *super = logfs_super(sb);
410 struct page *page;
411 struct logfs_disk_super *ds;
412 int i;
413
414 page = find_super_block(sb);
415 if (!page)
416 return -EIO;
417
418 ds = page_address(page);
419 super->s_size = be64_to_cpu(ds->ds_filesystem_size);
420 super->s_root_reserve = be64_to_cpu(ds->ds_root_reserve);
421 super->s_speed_reserve = be64_to_cpu(ds->ds_speed_reserve);
422 super->s_bad_seg_reserve = be32_to_cpu(ds->ds_bad_seg_reserve);
423 super->s_segsize = 1 << ds->ds_segment_shift;
424 super->s_segmask = (1 << ds->ds_segment_shift) - 1;
425 super->s_segshift = ds->ds_segment_shift;
426 sb->s_blocksize = 1 << ds->ds_block_shift;
427 sb->s_blocksize_bits = ds->ds_block_shift;
428 super->s_writesize = 1 << ds->ds_write_shift;
429 super->s_writeshift = ds->ds_write_shift;
430 super->s_no_segs = super->s_size >> super->s_segshift;
431 super->s_no_blocks = super->s_segsize >> sb->s_blocksize_bits;
432 super->s_feature_incompat = be64_to_cpu(ds->ds_feature_incompat);
433 super->s_feature_ro_compat = be64_to_cpu(ds->ds_feature_ro_compat);
434 super->s_feature_compat = be64_to_cpu(ds->ds_feature_compat);
435 super->s_feature_flags = be64_to_cpu(ds->ds_feature_flags);
436
437 journal_for_each(i)
438 super->s_journal_seg[i] = be32_to_cpu(ds->ds_journal_seg[i]);
439
440 super->s_ifile_levels = ds->ds_ifile_levels;
441 super->s_iblock_levels = ds->ds_iblock_levels;
442 super->s_data_levels = ds->ds_data_levels;
443 super->s_total_levels = super->s_ifile_levels + super->s_iblock_levels
444 + super->s_data_levels;
445 page_cache_release(page);
446 return 0;
447}
448
449static int logfs_read_sb(struct super_block *sb, int read_only)
450{
451 struct logfs_super *super = logfs_super(sb);
452 int ret;
453
454 super->s_btree_pool = mempool_create(32, btree_alloc, btree_free, NULL);
455 if (!super->s_btree_pool)
456 return -ENOMEM;
457
458 btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool);
459 btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool);
460 btree_init_mempool32(&super->s_shadow_tree.segment_map,
461 super->s_btree_pool);
462
463 ret = logfs_init_mapping(sb);
464 if (ret)
465 return ret;
466
467 ret = __logfs_read_sb(sb);
468 if (ret)
469 return ret;
470
471 if (super->s_feature_incompat & ~LOGFS_FEATURES_INCOMPAT)
472 return -EIO;
473 if ((super->s_feature_ro_compat & ~LOGFS_FEATURES_RO_COMPAT) &&
474 !read_only)
475 return -EIO;
476
477 mutex_init(&super->s_dirop_mutex);
478 mutex_init(&super->s_object_alias_mutex);
479 INIT_LIST_HEAD(&super->s_freeing_list);
480
481 ret = logfs_init_rw(sb);
482 if (ret)
483 return ret;
484
485 ret = logfs_init_areas(sb);
486 if (ret)
487 return ret;
488
489 ret = logfs_init_gc(sb);
490 if (ret)
491 return ret;
492
493 ret = logfs_init_journal(sb);
494 if (ret)
495 return ret;
496
497 return 0;
498}
499
500static void logfs_kill_sb(struct super_block *sb)
501{
502 struct logfs_super *super = logfs_super(sb);
503
504 log_super("LogFS: Start unmounting\n");
505 /* Alias entries slow down mount, so evict as many as possible */
506 sync_filesystem(sb);
507 logfs_write_anchor(sb);
508
509 /*
510 * From this point on alias entries are simply dropped - and any
511 * writes to the object store are considered bugs.
512 */
513 super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN;
514 log_super("LogFS: Now in shutdown\n");
515 generic_shutdown_super(sb);
516
517 BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes);
518
519 logfs_cleanup_gc(sb);
520 logfs_cleanup_journal(sb);
521 logfs_cleanup_areas(sb);
522 logfs_cleanup_rw(sb);
523 if (super->s_erase_page)
524 __free_page(super->s_erase_page);
525 super->s_devops->put_device(sb);
526 logfs_mempool_destroy(super->s_btree_pool);
527 logfs_mempool_destroy(super->s_alias_pool);
528 kfree(super);
529 log_super("LogFS: Finished unmounting\n");
530}
531
532int logfs_get_sb_device(struct file_system_type *type, int flags,
533 struct mtd_info *mtd, struct block_device *bdev,
534 const struct logfs_device_ops *devops, struct vfsmount *mnt)
535{
536 struct logfs_super *super;
537 struct super_block *sb;
538 int err = -ENOMEM;
539 static int mount_count;
540
541 log_super("LogFS: Start mount %x\n", mount_count++);
542 super = kzalloc(sizeof(*super), GFP_KERNEL);
543 if (!super)
544 goto err0;
545
546 super->s_mtd = mtd;
547 super->s_bdev = bdev;
548 err = -EINVAL;
549 sb = sget(type, logfs_sb_test, logfs_sb_set, super);
550 if (IS_ERR(sb))
551 goto err0;
552
553 if (sb->s_root) {
554 /* Device is already in use */
555 err = 0;
556 simple_set_mnt(mnt, sb);
557 goto err0;
558 }
559
560 super->s_devops = devops;
561
562 /*
563 * sb->s_maxbytes is limited to 8TB. On 32bit systems, the page cache
564 * only covers 16TB and the upper 8TB are used for indirect blocks.
565 * On 64bit systems we could bump up the limit, but that would make
566 * the filesystem incompatible with 32bit systems.
567 */
568 sb->s_maxbytes = (1ull << 43) - 1;
569 sb->s_op = &logfs_super_operations;
570 sb->s_flags = flags | MS_NOATIME;
571
572 err = logfs_read_sb(sb, sb->s_flags & MS_RDONLY);
573 if (err)
574 goto err1;
575
576 sb->s_flags |= MS_ACTIVE;
577 err = logfs_get_sb_final(sb, mnt);
578 if (err)
579 goto err1;
580 return 0;
581
582err1:
583 deactivate_locked_super(sb);
584 return err;
585err0:
586 kfree(super);
587 //devops->put_device(sb);
588 return err;
589}
590
591static int logfs_get_sb(struct file_system_type *type, int flags,
592 const char *devname, void *data, struct vfsmount *mnt)
593{
594 ulong mtdnr;
595
596 if (!devname)
597 return logfs_get_sb_bdev(type, flags, devname, mnt);
598 if (strncmp(devname, "mtd", 3))
599 return logfs_get_sb_bdev(type, flags, devname, mnt);
600
601 {
602 char *garbage;
603 mtdnr = simple_strtoul(devname+3, &garbage, 0);
604 if (*garbage)
605 return -EINVAL;
606 }
607
608 return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
609}
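/*
 * Device name examples: "mtd0" selects MTD device 0 via
 * logfs_get_sb_mtd(), anything else ("/dev/sdb2", "/dev/mtdblock0",
 * ...) is passed to logfs_get_sb_bdev() as a block device path.
 */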
610
611static struct file_system_type logfs_fs_type = {
612 .owner = THIS_MODULE,
613 .name = "logfs",
614 .get_sb = logfs_get_sb,
615 .kill_sb = logfs_kill_sb,
616 .fs_flags = FS_REQUIRES_DEV,
617
618};
619
620static int __init logfs_init(void)
621{
622 int ret;
623
624 emergency_page = alloc_pages(GFP_KERNEL, 0);
625 if (!emergency_page)
626 return -ENOMEM;
627
628 ret = logfs_compr_init();
629 if (ret)
630 goto out1;
631
632 ret = logfs_init_inode_cache();
633 if (ret)
634 goto out2;
635
636 return register_filesystem(&logfs_fs_type);
637out2:
638 logfs_compr_exit();
639out1:
640 __free_pages(emergency_page, 0);
641 return ret;
642}
643
644static void __exit logfs_exit(void)
645{
646 unregister_filesystem(&logfs_fs_type);
647 logfs_destroy_inode_cache();
648 logfs_compr_exit();
649 __free_pages(emergency_page, 0);
650}
651
652module_init(logfs_init);
653module_exit(logfs_exit);
654
655MODULE_LICENSE("GPL v2");
656MODULE_AUTHOR("Joern Engel <joern@logfs.org>");
657MODULE_DESCRIPTION("scalable flash filesystem");