aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2010-03-06 16:18:03 -0500
committerLinus Torvalds <torvalds@linux-foundation.org>2010-03-06 16:18:03 -0500
commit66b89159c25a47d2177743526c61b5ada7acc39e (patch)
treeb092b859ca01d7544a666c95f940144b0ef3b35b
parent87c7ae06cc50bcbcdcc60d64a959ca0b9b71f892 (diff)
parentc2f843f03d658e9ab2a1a455f2c1851fd6a869af (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/joern/logfs
* git://git.kernel.org/pub/scm/linux/kernel/git/joern/logfs: [LogFS] Change magic number [LogFS] Remove h_version field [LogFS] Check feature flags [LogFS] Only write journal if dirty [LogFS] Fix bdev erases [LogFS] Silence gcc [LogFS] Prevent 64bit divisions in hash_index [LogFS] Plug memory leak on error paths [LogFS] Add MAINTAINERS entry [LogFS] add new flash file system Fixed up trivial conflict in lib/Kconfig, and a semantic conflict in fs/logfs/inode.c introduced by write_inode() being changed to use writeback_control' by commit a9185b41a4f84971b930c519f0c63bd450c4810d ("pass writeback_control to ->write_inode")
-rw-r--r--Documentation/filesystems/00-INDEX2
-rw-r--r--Documentation/filesystems/logfs.txt241
-rw-r--r--MAINTAINERS7
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/Makefile1
-rw-r--r--fs/logfs/Kconfig17
-rw-r--r--fs/logfs/Makefile13
-rw-r--r--fs/logfs/compr.c95
-rw-r--r--fs/logfs/dev_bdev.c327
-rw-r--r--fs/logfs/dev_mtd.c254
-rw-r--r--fs/logfs/dir.c827
-rw-r--r--fs/logfs/file.c263
-rw-r--r--fs/logfs/gc.c730
-rw-r--r--fs/logfs/inode.c417
-rw-r--r--fs/logfs/journal.c883
-rw-r--r--fs/logfs/logfs.h724
-rw-r--r--fs/logfs/logfs_abi.h629
-rw-r--r--fs/logfs/readwrite.c2246
-rw-r--r--fs/logfs/segment.c927
-rw-r--r--fs/logfs/super.c650
-rw-r--r--include/linux/btree-128.h109
-rw-r--r--include/linux/btree-type.h147
-rw-r--r--include/linux/btree.h243
-rw-r--r--lib/Kconfig3
-rw-r--r--lib/Makefile1
-rw-r--r--lib/btree.c797
26 files changed, 10554 insertions, 0 deletions
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index 875d49696b6e..5139b8c9d5af 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -62,6 +62,8 @@ jfs.txt
62 - info and mount options for the JFS filesystem. 62 - info and mount options for the JFS filesystem.
63locks.txt 63locks.txt
64 - info on file locking implementations, flock() vs. fcntl(), etc. 64 - info on file locking implementations, flock() vs. fcntl(), etc.
65logfs.txt
66 - info on the LogFS flash filesystem.
65mandatory-locking.txt 67mandatory-locking.txt
66 - info on the Linux implementation of Sys V mandatory file locking. 68 - info on the Linux implementation of Sys V mandatory file locking.
67ncpfs.txt 69ncpfs.txt
diff --git a/Documentation/filesystems/logfs.txt b/Documentation/filesystems/logfs.txt
new file mode 100644
index 000000000000..e64c94ba401a
--- /dev/null
+++ b/Documentation/filesystems/logfs.txt
@@ -0,0 +1,241 @@
1
2The LogFS Flash Filesystem
3==========================
4
5Specification
6=============
7
8Superblocks
9-----------
10
11Two superblocks exist at the beginning and end of the filesystem.
12Each superblock is 256 Bytes large, with another 3840 Bytes reserved
13for future purposes, making a total of 4096 Bytes.
14
15Superblock locations may differ for MTD and block devices. On MTD the
16first non-bad block contains a superblock in the first 4096 Bytes and
17the last non-bad block contains a superblock in the last 4096 Bytes.
18On block devices, the first 4096 Bytes of the device contain the first
19superblock and the last aligned 4096 Byte-block contains the second
20superblock.
21
22For the most part, the superblocks can be considered read-only. They
23are written only to correct errors detected within the superblocks,
24move the journal and change the filesystem parameters through tunefs.
25As a result, the superblock does not contain any fields that require
26constant updates, like the amount of free space, etc.
27
28Segments
29--------
30
31The space in the device is split up into equal-sized segments.
32Segments are the primary write unit of LogFS. Within each segment,
33writes happen from front (low addresses) to back (high addresses). If
34only a partial segment has been written, the segment number, the
35current position within and optionally a write buffer are stored in
36the journal.
37
38Segments are erased as a whole. Therefore Garbage Collection may be
39required to completely free a segment before doing so.
40
41Journal
42--------
43
44The journal contains all global information about the filesystem that
45is subject to frequent change. At mount time, it has to be scanned
46for the most recent commit entry, which contains a list of pointers to
47all currently valid entries.
48
49Object Store
50------------
51
52All space except for the superblocks and journal is part of the object
53store. Each segment contains a segment header and a number of
54objects, each consisting of the object header and the payload.
55Objects are either inodes, directory entries (dentries), file data
56blocks or indirect blocks.
57
58Levels
59------
60
61Garbage collection (GC) may fail if all data is written
62indiscriminately. One requirement of GC is that data is separated
63roughly according to the distance between the tree root and the data.
64Effectively that means all file data is on level 0, indirect blocks
65are on levels 1, 2, 3, 4 or 5 for 1x, 2x, 3x, 4x or 5x indirect blocks,
66respectively. Inode file data is on level 6 for the inodes and 7-11
67for indirect blocks.
68
69Each segment contains objects of a single level only. As a result,
70each level requires its own separate segment to be open for writing.
71
72Inode File
73----------
74
75All inodes are stored in a special file, the inode file. Single
76exception is the inode file's inode (master inode) which for obvious
77reasons is stored in the journal instead. Instead of data blocks, the
78leaf nodes of the inode files are inodes.
79
80Aliases
81-------
82
83Writes in LogFS are done by means of a wandering tree. A naïve
84implementation would require that for each write of a block, all
85parent blocks are written as well, since the block pointers have
86changed. Such an implementation would not be very efficient.
87
88In LogFS, the block pointer changes are cached in the journal by means
89of alias entries. Each alias consists of its logical address - inode
90number, block index, level and child number (index into block) - and
91the changed data. Any 8-byte word can be changed in this manner.
92
93Currently aliases are used for block pointers, file size, file used
94bytes and the height of an inode's indirect tree.
95
96Segment Aliases
97---------------
98
99Related to regular aliases, these are used to handle bad blocks.
100Initially, bad blocks are handled by moving the affected segment
101content to a spare segment and noting this move in the journal with a
102segment alias, a simple (to, from) tuple. GC will later empty this
103segment and the alias can be removed again. This is used on MTD only.
104
105Vim
106---
107
108By cleverly predicting the life time of data, it is possible to
109separate long-living data from short-living data and thereby reduce
110the GC overhead later. Each type of distinct life expectancy (vim) can
111have a separate segment open for writing. Each (level, vim) tuple can
112be open just once. If an open segment with unknown vim is encountered
113at mount time, it is closed and ignored henceforth.
114
115Indirect Tree
116-------------
117
118Inodes in LogFS are similar to FFS-style filesystems with direct and
119indirect block pointers. One difference is that LogFS uses a single
120indirect pointer that can be either a 1x, 2x, etc. indirect pointer.
121A height field in the inode defines the height of the indirect tree
122and thereby the indirection of the pointer.
123
124Another difference is the addressing of indirect blocks. In LogFS,
125the first 16 pointers in the first indirect block are left empty,
126corresponding to the 16 direct pointers in the inode. In ext2 (maybe
127others as well) the first pointer in the first indirect block
128corresponds to logical block 12, skipping the 12 direct pointers.
129So where ext2 is using arithmetic to better utilize space, LogFS keeps
130arithmetic simple and uses compression to save space.
131
132Compression
133-----------
134
135Both file data and metadata can be compressed. Compression for file
136data can be enabled with chattr +c and disabled with chattr -c. Doing
137so has no effect on existing data, but new data will be stored
138accordingly. New inodes will inherit the compression flag of the
139parent directory.
140
141Metadata is always compressed. However, the space accounting ignores
142this and charges for the uncompressed size. Failing to do so could
143result in GC failures when, after moving some data, indirect blocks
144compress worse than previously. Even on a 100% full medium, GC may
145not consume any extra space, so the compression gains are lost space
146to the user.
147
148However, they are not lost space to the filesystem internals. By
149cheating the user for those bytes, the filesystem gained some slack
150space and GC will run less often and faster.
151
152Garbage Collection and Wear Leveling
153------------------------------------
154
155Garbage collection is invoked whenever the number of free segments
156falls below a threshold. The best (known) candidate is picked based
157on the least amount of valid data contained in the segment. All
158remaining valid data is copied elsewhere, thereby invalidating it.
159
160The GC code also checks for aliases and writes them back if their
161number gets too large.
162
163Wear leveling is done by occasionally picking a suboptimal segment for
164garbage collection. If a stale segment's erase count is significantly
165lower than the active segments' erase counts, it will be picked. Wear
166leveling is rate limited, so it will never monopolize the device for
167more than one segment worth at a time.
168
169Values for "occasionally", "significantly lower" are compile time
170constants.
171
172Hashed directories
173------------------
174
175To satisfy efficient lookup(), directory entries are hashed and
176located based on the hash. In order to both support large directories
177and not be overly inefficient for small directories, several hash
178tables of increasing size are used. For each table, the hash value
179modulo the table size gives the table index.
180
181Table sizes are chosen to limit the number of indirect blocks with a
182fully populated table to 0, 1, 2 or 3 respectively. So the first
183table contains 16 entries, the second 512-16, etc.
184
185The last table is special in several ways. First its size depends on
186the effective 32bit limit on telldir/seekdir cookies. Since logfs
187uses the upper half of the address space for indirect blocks, the size
188is limited to 2^31. Secondly the table contains hash buckets with 16
189entries each.
190
191Using single-entry buckets would result in birthday "attacks". At
192just 2^16 used entries, hash collisions would be likely (P >= 0.5).
193My math skills are insufficient to do the combinatorics for the 17x
194collisions necessary to overflow a bucket, but testing showed that in
19510,000 runs the lowest directory fill before a bucket overflow was
196188,057,130 entries with an average of 315,149,915 entries. So for
197directory sizes of up to a million, bucket overflows should be
198virtually impossible under normal circumstances.
199
200With carefully chosen filenames, it is obviously possible to cause an
201overflow with just 21 entries (4 higher tables + 16 entries + 1). So
202there may be a security concern if a malicious user has write access
203to a directory.
204
205Open For Discussion
206===================
207
208Device Address Space
209--------------------
210
211A device address space is used for caching. Both block devices and
212MTD provide functions to either read a single page or write a segment.
213Partial segments may be written for data integrity, but where possible
214complete segments are written for performance on simple block device
215flash media.
216
217Meta Inodes
218-----------
219
220Inodes are stored in the inode file, which is just a regular file for
221most purposes. At umount time, however, the inode file needs to
222remain open until all dirty inodes are written. So
223generic_shutdown_super() may not close this inode, but shouldn't
224complain about remaining inodes due to the inode file either. Same
225goes for the mapping inode of the device address space.
226
227Currently logfs uses a hack that essentially copies part of fs/inode.c
228code over. A general solution would be preferred.
229
230Indirect block mapping
231----------------------
232
233With compression, the block device (or mapping inode) cannot be used
234to cache indirect blocks. Some other place is required. Currently
235logfs uses the top half of each inode's address space. The low 8TB
236(on 32bit) are filled with file data, the high 8TB are used for
237indirect blocks.
238
239One problem is that 16TB files created on 64bit systems actually have
240data in the top 8TB. But files >16TB would cause problems anyway, so
241only the limit has changed.
diff --git a/MAINTAINERS b/MAINTAINERS
index bfa4fd1f3c03..c8a8b1fd58b3 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -3450,6 +3450,13 @@ S: Maintained
3450F: Documentation/ldm.txt 3450F: Documentation/ldm.txt
3451F: fs/partitions/ldm.* 3451F: fs/partitions/ldm.*
3452 3452
3453LogFS
3454M: Joern Engel <joern@logfs.org>
3455L: logfs@logfs.org
3456W: logfs.org
3457S: Maintained
3458F: fs/logfs/
3459
3453LSILOGIC MPT FUSION DRIVERS (FC/SAS/SPI) 3460LSILOGIC MPT FUSION DRIVERS (FC/SAS/SPI)
3454M: Eric Moore <Eric.Moore@lsi.com> 3461M: Eric Moore <Eric.Moore@lsi.com>
3455M: support@lsi.com 3462M: support@lsi.com
diff --git a/fs/Kconfig b/fs/Kconfig
index 64d44efad7a5..7405f071be67 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -177,6 +177,7 @@ source "fs/efs/Kconfig"
177source "fs/jffs2/Kconfig" 177source "fs/jffs2/Kconfig"
178# UBIFS File system configuration 178# UBIFS File system configuration
179source "fs/ubifs/Kconfig" 179source "fs/ubifs/Kconfig"
180source "fs/logfs/Kconfig"
180source "fs/cramfs/Kconfig" 181source "fs/cramfs/Kconfig"
181source "fs/squashfs/Kconfig" 182source "fs/squashfs/Kconfig"
182source "fs/freevxfs/Kconfig" 183source "fs/freevxfs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index af6d04700d9c..c3633aa46911 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -99,6 +99,7 @@ obj-$(CONFIG_NTFS_FS) += ntfs/
99obj-$(CONFIG_UFS_FS) += ufs/ 99obj-$(CONFIG_UFS_FS) += ufs/
100obj-$(CONFIG_EFS_FS) += efs/ 100obj-$(CONFIG_EFS_FS) += efs/
101obj-$(CONFIG_JFFS2_FS) += jffs2/ 101obj-$(CONFIG_JFFS2_FS) += jffs2/
102obj-$(CONFIG_LOGFS) += logfs/
102obj-$(CONFIG_UBIFS_FS) += ubifs/ 103obj-$(CONFIG_UBIFS_FS) += ubifs/
103obj-$(CONFIG_AFFS_FS) += affs/ 104obj-$(CONFIG_AFFS_FS) += affs/
104obj-$(CONFIG_ROMFS_FS) += romfs/ 105obj-$(CONFIG_ROMFS_FS) += romfs/
diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig
new file mode 100644
index 000000000000..daf9a9b32dd3
--- /dev/null
+++ b/fs/logfs/Kconfig
@@ -0,0 +1,17 @@
1config LOGFS
2 tristate "LogFS file system (EXPERIMENTAL)"
3 depends on (MTD || BLOCK) && EXPERIMENTAL
4 select ZLIB_INFLATE
5 select ZLIB_DEFLATE
6 select CRC32
7 select BTREE
8 help
9 Flash filesystem aimed to scale efficiently to large devices.
10 In comparison to JFFS2 it offers significantly faster mount
11 times and potentially less RAM usage, although the latter has
12 not been measured yet.
13
14 In its current state it is still very experimental and should
15 not be used for other than testing purposes.
16
17 If unsure, say N.
diff --git a/fs/logfs/Makefile b/fs/logfs/Makefile
new file mode 100644
index 000000000000..4820027787ee
--- /dev/null
+++ b/fs/logfs/Makefile
@@ -0,0 +1,13 @@
1obj-$(CONFIG_LOGFS) += logfs.o
2
3logfs-y += compr.o
4logfs-y += dir.o
5logfs-y += file.o
6logfs-y += gc.o
7logfs-y += inode.o
8logfs-y += journal.o
9logfs-y += readwrite.o
10logfs-y += segment.o
11logfs-y += super.o
12logfs-$(CONFIG_BLOCK) += dev_bdev.o
13logfs-$(CONFIG_MTD) += dev_mtd.o
diff --git a/fs/logfs/compr.c b/fs/logfs/compr.c
new file mode 100644
index 000000000000..44bbfd249abc
--- /dev/null
+++ b/fs/logfs/compr.c
@@ -0,0 +1,95 @@
1/*
2 * fs/logfs/compr.c - compression routines
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/vmalloc.h>
10#include <linux/zlib.h>
11
12#define COMPR_LEVEL 3
13
14static DEFINE_MUTEX(compr_mutex);
15static struct z_stream_s stream;
16
17int logfs_compress(void *in, void *out, size_t inlen, size_t outlen)
18{
19 int err, ret;
20
21 ret = -EIO;
22 mutex_lock(&compr_mutex);
23 err = zlib_deflateInit(&stream, COMPR_LEVEL);
24 if (err != Z_OK)
25 goto error;
26
27 stream.next_in = in;
28 stream.avail_in = inlen;
29 stream.total_in = 0;
30 stream.next_out = out;
31 stream.avail_out = outlen;
32 stream.total_out = 0;
33
34 err = zlib_deflate(&stream, Z_FINISH);
35 if (err != Z_STREAM_END)
36 goto error;
37
38 err = zlib_deflateEnd(&stream);
39 if (err != Z_OK)
40 goto error;
41
42 if (stream.total_out >= stream.total_in)
43 goto error;
44
45 ret = stream.total_out;
46error:
47 mutex_unlock(&compr_mutex);
48 return ret;
49}
50
51int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen)
52{
53 int err, ret;
54
55 ret = -EIO;
56 mutex_lock(&compr_mutex);
57 err = zlib_inflateInit(&stream);
58 if (err != Z_OK)
59 goto error;
60
61 stream.next_in = in;
62 stream.avail_in = inlen;
63 stream.total_in = 0;
64 stream.next_out = out;
65 stream.avail_out = outlen;
66 stream.total_out = 0;
67
68 err = zlib_inflate(&stream, Z_FINISH);
69 if (err != Z_STREAM_END)
70 goto error;
71
72 err = zlib_inflateEnd(&stream);
73 if (err != Z_OK)
74 goto error;
75
76 ret = 0;
77error:
78 mutex_unlock(&compr_mutex);
79 return ret;
80}
81
82int __init logfs_compr_init(void)
83{
84 size_t size = max(zlib_deflate_workspacesize(),
85 zlib_inflate_workspacesize());
86 stream.workspace = vmalloc(size);
87 if (!stream.workspace)
88 return -ENOMEM;
89 return 0;
90}
91
92void logfs_compr_exit(void)
93{
94 vfree(stream.workspace);
95}
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
new file mode 100644
index 000000000000..9718c22f186d
--- /dev/null
+++ b/fs/logfs/dev_bdev.c
@@ -0,0 +1,327 @@
1/*
2 * fs/logfs/dev_bdev.c - Device access methods for block devices
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/bio.h>
10#include <linux/blkdev.h>
11#include <linux/buffer_head.h>
12
13#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
14
15static void request_complete(struct bio *bio, int err)
16{
17 complete((struct completion *)bio->bi_private);
18}
19
20static int sync_request(struct page *page, struct block_device *bdev, int rw)
21{
22 struct bio bio;
23 struct bio_vec bio_vec;
24 struct completion complete;
25
26 bio_init(&bio);
27 bio.bi_io_vec = &bio_vec;
28 bio_vec.bv_page = page;
29 bio_vec.bv_len = PAGE_SIZE;
30 bio_vec.bv_offset = 0;
31 bio.bi_vcnt = 1;
32 bio.bi_idx = 0;
33 bio.bi_size = PAGE_SIZE;
34 bio.bi_bdev = bdev;
35 bio.bi_sector = page->index * (PAGE_SIZE >> 9);
36 init_completion(&complete);
37 bio.bi_private = &complete;
38 bio.bi_end_io = request_complete;
39
40 submit_bio(rw, &bio);
41 generic_unplug_device(bdev_get_queue(bdev));
42 wait_for_completion(&complete);
43 return test_bit(BIO_UPTODATE, &bio.bi_flags) ? 0 : -EIO;
44}
45
46static int bdev_readpage(void *_sb, struct page *page)
47{
48 struct super_block *sb = _sb;
49 struct block_device *bdev = logfs_super(sb)->s_bdev;
50 int err;
51
52 err = sync_request(page, bdev, READ);
53 if (err) {
54 ClearPageUptodate(page);
55 SetPageError(page);
56 } else {
57 SetPageUptodate(page);
58 ClearPageError(page);
59 }
60 unlock_page(page);
61 return err;
62}
63
64static DECLARE_WAIT_QUEUE_HEAD(wq);
65
66static void writeseg_end_io(struct bio *bio, int err)
67{
68 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
69 struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1;
70 struct super_block *sb = bio->bi_private;
71 struct logfs_super *super = logfs_super(sb);
72 struct page *page;
73
74 BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */
75 BUG_ON(err);
76 BUG_ON(bio->bi_vcnt == 0);
77 do {
78 page = bvec->bv_page;
79 if (--bvec >= bio->bi_io_vec)
80 prefetchw(&bvec->bv_page->flags);
81
82 end_page_writeback(page);
83 } while (bvec >= bio->bi_io_vec);
84 bio_put(bio);
85 if (atomic_dec_and_test(&super->s_pending_writes))
86 wake_up(&wq);
87}
88
89static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
90 size_t nr_pages)
91{
92 struct logfs_super *super = logfs_super(sb);
93 struct address_space *mapping = super->s_mapping_inode->i_mapping;
94 struct bio *bio;
95 struct page *page;
96 struct request_queue *q = bdev_get_queue(sb->s_bdev);
97 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
98 int i;
99
100 bio = bio_alloc(GFP_NOFS, max_pages);
101 BUG_ON(!bio); /* FIXME: handle this */
102
103 for (i = 0; i < nr_pages; i++) {
104 if (i >= max_pages) {
105 /* Block layer cannot split bios :( */
106 bio->bi_vcnt = i;
107 bio->bi_idx = 0;
108 bio->bi_size = i * PAGE_SIZE;
109 bio->bi_bdev = super->s_bdev;
110 bio->bi_sector = ofs >> 9;
111 bio->bi_private = sb;
112 bio->bi_end_io = writeseg_end_io;
113 atomic_inc(&super->s_pending_writes);
114 submit_bio(WRITE, bio);
115
116 ofs += i * PAGE_SIZE;
117 index += i;
118 nr_pages -= i;
119 i = 0;
120
121 bio = bio_alloc(GFP_NOFS, max_pages);
122 BUG_ON(!bio);
123 }
124 page = find_lock_page(mapping, index + i);
125 BUG_ON(!page);
126 bio->bi_io_vec[i].bv_page = page;
127 bio->bi_io_vec[i].bv_len = PAGE_SIZE;
128 bio->bi_io_vec[i].bv_offset = 0;
129
130 BUG_ON(PageWriteback(page));
131 set_page_writeback(page);
132 unlock_page(page);
133 }
134 bio->bi_vcnt = nr_pages;
135 bio->bi_idx = 0;
136 bio->bi_size = nr_pages * PAGE_SIZE;
137 bio->bi_bdev = super->s_bdev;
138 bio->bi_sector = ofs >> 9;
139 bio->bi_private = sb;
140 bio->bi_end_io = writeseg_end_io;
141 atomic_inc(&super->s_pending_writes);
142 submit_bio(WRITE, bio);
143 return 0;
144}
145
146static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len)
147{
148 struct logfs_super *super = logfs_super(sb);
149 int head;
150
151 BUG_ON(super->s_flags & LOGFS_SB_FLAG_RO);
152
153 if (len == 0) {
154 /* This can happen when the object fit perfectly into a
155 * segment, the segment gets written per sync and subsequently
156 * closed.
157 */
158 return;
159 }
160 head = ofs & (PAGE_SIZE - 1);
161 if (head) {
162 ofs -= head;
163 len += head;
164 }
165 len = PAGE_ALIGN(len);
166 __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
167 generic_unplug_device(bdev_get_queue(logfs_super(sb)->s_bdev));
168}
169
170
171static void erase_end_io(struct bio *bio, int err)
172{
173 const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags);
174 struct super_block *sb = bio->bi_private;
175 struct logfs_super *super = logfs_super(sb);
176
177 BUG_ON(!uptodate); /* FIXME: Retry io or write elsewhere */
178 BUG_ON(err);
179 BUG_ON(bio->bi_vcnt == 0);
180 bio_put(bio);
181 if (atomic_dec_and_test(&super->s_pending_writes))
182 wake_up(&wq);
183}
184
185static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
186 size_t nr_pages)
187{
188 struct logfs_super *super = logfs_super(sb);
189 struct bio *bio;
190 struct request_queue *q = bdev_get_queue(sb->s_bdev);
191 unsigned int max_pages = queue_max_hw_sectors(q) >> (PAGE_SHIFT - 9);
192 int i;
193
194 bio = bio_alloc(GFP_NOFS, max_pages);
195 BUG_ON(!bio); /* FIXME: handle this */
196
197 for (i = 0; i < nr_pages; i++) {
198 if (i >= max_pages) {
199 /* Block layer cannot split bios :( */
200 bio->bi_vcnt = i;
201 bio->bi_idx = 0;
202 bio->bi_size = i * PAGE_SIZE;
203 bio->bi_bdev = super->s_bdev;
204 bio->bi_sector = ofs >> 9;
205 bio->bi_private = sb;
206 bio->bi_end_io = erase_end_io;
207 atomic_inc(&super->s_pending_writes);
208 submit_bio(WRITE, bio);
209
210 ofs += i * PAGE_SIZE;
211 index += i;
212 nr_pages -= i;
213 i = 0;
214
215 bio = bio_alloc(GFP_NOFS, max_pages);
216 BUG_ON(!bio);
217 }
218 bio->bi_io_vec[i].bv_page = super->s_erase_page;
219 bio->bi_io_vec[i].bv_len = PAGE_SIZE;
220 bio->bi_io_vec[i].bv_offset = 0;
221 }
222 bio->bi_vcnt = nr_pages;
223 bio->bi_idx = 0;
224 bio->bi_size = nr_pages * PAGE_SIZE;
225 bio->bi_bdev = super->s_bdev;
226 bio->bi_sector = ofs >> 9;
227 bio->bi_private = sb;
228 bio->bi_end_io = erase_end_io;
229 atomic_inc(&super->s_pending_writes);
230 submit_bio(WRITE, bio);
231 return 0;
232}
233
234static int bdev_erase(struct super_block *sb, loff_t to, size_t len,
235 int ensure_write)
236{
237 struct logfs_super *super = logfs_super(sb);
238
239 BUG_ON(to & (PAGE_SIZE - 1));
240 BUG_ON(len & (PAGE_SIZE - 1));
241
242 if (super->s_flags & LOGFS_SB_FLAG_RO)
243 return -EROFS;
244
245 if (ensure_write) {
246 /*
247 * Object store doesn't care whether erases happen or not.
248 * But for the journal they are required. Otherwise a scan
249 * can find an old commit entry and assume it is the current
250 * one, travelling back in time.
251 */
252 do_erase(sb, to, to >> PAGE_SHIFT, len >> PAGE_SHIFT);
253 }
254
255 return 0;
256}
257
258static void bdev_sync(struct super_block *sb)
259{
260 struct logfs_super *super = logfs_super(sb);
261
262 wait_event(wq, atomic_read(&super->s_pending_writes) == 0);
263}
264
265static struct page *bdev_find_first_sb(struct super_block *sb, u64 *ofs)
266{
267 struct logfs_super *super = logfs_super(sb);
268 struct address_space *mapping = super->s_mapping_inode->i_mapping;
269 filler_t *filler = bdev_readpage;
270
271 *ofs = 0;
272 return read_cache_page(mapping, 0, filler, sb);
273}
274
275static struct page *bdev_find_last_sb(struct super_block *sb, u64 *ofs)
276{
277 struct logfs_super *super = logfs_super(sb);
278 struct address_space *mapping = super->s_mapping_inode->i_mapping;
279 filler_t *filler = bdev_readpage;
280 u64 pos = (super->s_bdev->bd_inode->i_size & ~0xfffULL) - 0x1000;
281 pgoff_t index = pos >> PAGE_SHIFT;
282
283 *ofs = pos;
284 return read_cache_page(mapping, index, filler, sb);
285}
286
287static int bdev_write_sb(struct super_block *sb, struct page *page)
288{
289 struct block_device *bdev = logfs_super(sb)->s_bdev;
290
291 /* Nothing special to do for block devices. */
292 return sync_request(page, bdev, WRITE);
293}
294
295static void bdev_put_device(struct super_block *sb)
296{
297 close_bdev_exclusive(logfs_super(sb)->s_bdev, FMODE_READ|FMODE_WRITE);
298}
299
300static const struct logfs_device_ops bd_devops = {
301 .find_first_sb = bdev_find_first_sb,
302 .find_last_sb = bdev_find_last_sb,
303 .write_sb = bdev_write_sb,
304 .readpage = bdev_readpage,
305 .writeseg = bdev_writeseg,
306 .erase = bdev_erase,
307 .sync = bdev_sync,
308 .put_device = bdev_put_device,
309};
310
311int logfs_get_sb_bdev(struct file_system_type *type, int flags,
312 const char *devname, struct vfsmount *mnt)
313{
314 struct block_device *bdev;
315
316 bdev = open_bdev_exclusive(devname, FMODE_READ|FMODE_WRITE, type);
317 if (IS_ERR(bdev))
318 return PTR_ERR(bdev);
319
320 if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
321 int mtdnr = MINOR(bdev->bd_dev);
322 close_bdev_exclusive(bdev, FMODE_READ|FMODE_WRITE);
323 return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
324 }
325
326 return logfs_get_sb_device(type, flags, NULL, bdev, &bd_devops, mnt);
327}
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
new file mode 100644
index 000000000000..cafb6ef2e05b
--- /dev/null
+++ b/fs/logfs/dev_mtd.c
@@ -0,0 +1,254 @@
1/*
2 * fs/logfs/dev_mtd.c - Device access methods for MTD
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/completion.h>
10#include <linux/mount.h>
11#include <linux/sched.h>
12
13#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
14
15static int mtd_read(struct super_block *sb, loff_t ofs, size_t len, void *buf)
16{
17 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
18 size_t retlen;
19 int ret;
20
21 ret = mtd->read(mtd, ofs, len, &retlen, buf);
22 BUG_ON(ret == -EINVAL);
23 if (ret)
24 return ret;
25
26 /* Not sure if we should loop instead. */
27 if (retlen != len)
28 return -EIO;
29
30 return 0;
31}
32
33static int mtd_write(struct super_block *sb, loff_t ofs, size_t len, void *buf)
34{
35 struct logfs_super *super = logfs_super(sb);
36 struct mtd_info *mtd = super->s_mtd;
37 size_t retlen;
38 loff_t page_start, page_end;
39 int ret;
40
41 if (super->s_flags & LOGFS_SB_FLAG_RO)
42 return -EROFS;
43
44 BUG_ON((ofs >= mtd->size) || (len > mtd->size - ofs));
45 BUG_ON(ofs != (ofs >> super->s_writeshift) << super->s_writeshift);
46 BUG_ON(len > PAGE_CACHE_SIZE);
47 page_start = ofs & PAGE_CACHE_MASK;
48 page_end = PAGE_CACHE_ALIGN(ofs + len) - 1;
49 ret = mtd->write(mtd, ofs, len, &retlen, buf);
50 if (ret || (retlen != len))
51 return -EIO;
52
53 return 0;
54}
55
56/*
57 * For as long as I can remember (since about 2001) mtd->erase has been an
58 * asynchronous interface lacking the first driver to actually use the
59 * asynchronous properties. So just to prevent the first implementor of such
60 * a thing from breaking logfs in 2350, we do the usual pointless dance to
61 * declare a completion variable and wait for completion before returning
62 * from mtd_erase(). What an excercise in futility!
63 */
64static void logfs_erase_callback(struct erase_info *ei)
65{
66 complete((struct completion *)ei->priv);
67}
68
69static int mtd_erase_mapping(struct super_block *sb, loff_t ofs, size_t len)
70{
71 struct logfs_super *super = logfs_super(sb);
72 struct address_space *mapping = super->s_mapping_inode->i_mapping;
73 struct page *page;
74 pgoff_t index = ofs >> PAGE_SHIFT;
75
76 for (index = ofs >> PAGE_SHIFT; index < (ofs + len) >> PAGE_SHIFT; index++) {
77 page = find_get_page(mapping, index);
78 if (!page)
79 continue;
80 memset(page_address(page), 0xFF, PAGE_SIZE);
81 page_cache_release(page);
82 }
83 return 0;
84}
85
86static int mtd_erase(struct super_block *sb, loff_t ofs, size_t len,
87 int ensure_write)
88{
89 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
90 struct erase_info ei;
91 DECLARE_COMPLETION_ONSTACK(complete);
92 int ret;
93
94 BUG_ON(len % mtd->erasesize);
95 if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO)
96 return -EROFS;
97
98 memset(&ei, 0, sizeof(ei));
99 ei.mtd = mtd;
100 ei.addr = ofs;
101 ei.len = len;
102 ei.callback = logfs_erase_callback;
103 ei.priv = (long)&complete;
104 ret = mtd->erase(mtd, &ei);
105 if (ret)
106 return -EIO;
107
108 wait_for_completion(&complete);
109 if (ei.state != MTD_ERASE_DONE)
110 return -EIO;
111 return mtd_erase_mapping(sb, ofs, len);
112}
113
114static void mtd_sync(struct super_block *sb)
115{
116 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
117
118 if (mtd->sync)
119 mtd->sync(mtd);
120}
121
122static int mtd_readpage(void *_sb, struct page *page)
123{
124 struct super_block *sb = _sb;
125 int err;
126
127 err = mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
128 page_address(page));
129 if (err == -EUCLEAN) {
130 err = 0;
131 /* FIXME: force GC this segment */
132 }
133 if (err) {
134 ClearPageUptodate(page);
135 SetPageError(page);
136 } else {
137 SetPageUptodate(page);
138 ClearPageError(page);
139 }
140 unlock_page(page);
141 return err;
142}
143
144static struct page *mtd_find_first_sb(struct super_block *sb, u64 *ofs)
145{
146 struct logfs_super *super = logfs_super(sb);
147 struct address_space *mapping = super->s_mapping_inode->i_mapping;
148 filler_t *filler = mtd_readpage;
149 struct mtd_info *mtd = super->s_mtd;
150
151 if (!mtd->block_isbad)
152 return NULL;
153
154 *ofs = 0;
155 while (mtd->block_isbad(mtd, *ofs)) {
156 *ofs += mtd->erasesize;
157 if (*ofs >= mtd->size)
158 return NULL;
159 }
160 BUG_ON(*ofs & ~PAGE_MASK);
161 return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
162}
163
164static struct page *mtd_find_last_sb(struct super_block *sb, u64 *ofs)
165{
166 struct logfs_super *super = logfs_super(sb);
167 struct address_space *mapping = super->s_mapping_inode->i_mapping;
168 filler_t *filler = mtd_readpage;
169 struct mtd_info *mtd = super->s_mtd;
170
171 if (!mtd->block_isbad)
172 return NULL;
173
174 *ofs = mtd->size - mtd->erasesize;
175 while (mtd->block_isbad(mtd, *ofs)) {
176 *ofs -= mtd->erasesize;
177 if (*ofs <= 0)
178 return NULL;
179 }
180 *ofs = *ofs + mtd->erasesize - 0x1000;
181 BUG_ON(*ofs & ~PAGE_MASK);
182 return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
183}
184
185static int __mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
186 size_t nr_pages)
187{
188 struct logfs_super *super = logfs_super(sb);
189 struct address_space *mapping = super->s_mapping_inode->i_mapping;
190 struct page *page;
191 int i, err;
192
193 for (i = 0; i < nr_pages; i++) {
194 page = find_lock_page(mapping, index + i);
195 BUG_ON(!page);
196
197 err = mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
198 page_address(page));
199 unlock_page(page);
200 page_cache_release(page);
201 if (err)
202 return err;
203 }
204 return 0;
205}
206
207static void mtd_writeseg(struct super_block *sb, u64 ofs, size_t len)
208{
209 struct logfs_super *super = logfs_super(sb);
210 int head;
211
212 if (super->s_flags & LOGFS_SB_FLAG_RO)
213 return;
214
215 if (len == 0) {
216 /* This can happen when the object fit perfectly into a
217 * segment, the segment gets written per sync and subsequently
218 * closed.
219 */
220 return;
221 }
222 head = ofs & (PAGE_SIZE - 1);
223 if (head) {
224 ofs -= head;
225 len += head;
226 }
227 len = PAGE_ALIGN(len);
228 __mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
229}
230
231static void mtd_put_device(struct super_block *sb)
232{
233 put_mtd_device(logfs_super(sb)->s_mtd);
234}
235
236static const struct logfs_device_ops mtd_devops = {
237 .find_first_sb = mtd_find_first_sb,
238 .find_last_sb = mtd_find_last_sb,
239 .readpage = mtd_readpage,
240 .writeseg = mtd_writeseg,
241 .erase = mtd_erase,
242 .sync = mtd_sync,
243 .put_device = mtd_put_device,
244};
245
246int logfs_get_sb_mtd(struct file_system_type *type, int flags,
247 int mtdnr, struct vfsmount *mnt)
248{
249 struct mtd_info *mtd;
250 const struct logfs_device_ops *devops = &mtd_devops;
251
252 mtd = get_mtd_device(NULL, mtdnr);
253 return logfs_get_sb_device(type, flags, mtd, NULL, devops, mnt);
254}
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
new file mode 100644
index 000000000000..56a8bfbb0120
--- /dev/null
+++ b/fs/logfs/dir.c
@@ -0,0 +1,827 @@
1/*
2 * fs/logfs/dir.c - directory-related code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9
10
11/*
12 * Atomic dir operations
13 *
14 * Directory operations are by default not atomic. Dentries and Inodes are
15 * created/removed/altered in separate operations. Therefore we need to do
16 * a small amount of journaling.
17 *
18 * Create, link, mkdir, mknod and symlink all share the same function to do
19 * the work: __logfs_create. This function works in two atomic steps:
20 * 1. allocate inode (remember in journal)
21 * 2. allocate dentry (clear journal)
22 *
23 * As we can only get interrupted between the two, the inode we just
24 * created is simply stored in the anchor. On next mount, if we were
25 * interrupted, we delete the inode. From a user's point of view the
26 * operation never happened.
27 *
28 * Unlink and rmdir also share the same function: unlink. Again, this
29 * function works in two atomic steps
30 * 1. remove dentry (remember inode in journal)
31 * 2. unlink inode (clear journal)
32 *
33 * And again, on the next mount, if we were interrupted, we delete the inode.
34 * From a users point of view the operation succeeded.
35 *
36 * Rename is the real pain to deal with, harder than all the other methods
37 * combined. Depending on the circumstances we can run into three cases.
38 * A "target rename" where the target dentry already existed, a "local
39 * rename" where both parent directories are identical or a "cross-directory
40 * rename" in the remaining case.
41 *
42 * Local rename is atomic, as the old dentry is simply rewritten with a new
43 * name.
44 *
45 * Cross-directory rename works in two steps, similar to __logfs_create and
46 * logfs_unlink:
47 * 1. Write new dentry (remember old dentry in journal)
48 * 2. Remove old dentry (clear journal)
49 *
50 * Here we remember a dentry instead of an inode. On next mount, if we were
51 * interrupted, we delete the dentry. From a users point of view, the
52 * operation succeeded.
53 *
54 * Target rename works in three atomic steps:
55 * 1. Attach old inode to new dentry (remember old dentry and new inode)
56 * 2. Remove old dentry (still remember the new inode)
57 * 3. Remove victim inode
58 *
59 * Here we remember both an inode and a dentry. If we get interrupted
60 * between steps 1 and 2, we delete both the dentry and the inode. If
61 * we get interrupted between steps 2 and 3, we delete just the inode.
62 * In either case, the remaining objects are deleted on next mount. From
63 * a users point of view, the operation succeeded.
64 */
65
66static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd,
67 loff_t pos)
68{
69 return logfs_inode_write(dir, dd, sizeof(*dd), pos, WF_LOCK, NULL);
70}
71
72static int write_inode(struct inode *inode)
73{
74 return __logfs_write_inode(inode, WF_LOCK);
75}
76
77static s64 dir_seek_data(struct inode *inode, s64 pos)
78{
79 s64 new_pos = logfs_seek_data(inode, pos);
80
81 return max(pos, new_pos - 1);
82}
83
84static int beyond_eof(struct inode *inode, loff_t bix)
85{
86 loff_t pos = bix << inode->i_sb->s_blocksize_bits;
87 return pos >= i_size_read(inode);
88}
89
90/*
91 * Prime value was chosen to be roughly 256 + 26. r5 hash uses 11,
92 * so short names (len <= 9) don't even occupy the complete 32bit name
93 * space. A prime >256 ensures short names quickly spread the 32bit
94 * name space. Add about 26 for the estimated amount of information
95 * of each character and pick a prime nearby, preferably a bit-sparse
96 * one.
97 */
98static u32 hash_32(const char *s, int len, u32 seed)
99{
100 u32 hash = seed;
101 int i;
102
103 for (i = 0; i < len; i++)
104 hash = hash * 293 + s[i];
105 return hash;
106}
107
108/*
109 * We have to satisfy several conflicting requirements here. Small
110 * directories should stay fairly compact and not require too many
111 * indirect blocks. The number of possible locations for a given hash
112 * should be small to make lookup() fast. And we should try hard not
113 * to overflow the 32bit name space or nfs and 32bit host systems will
114 * be unhappy.
115 *
116 * So we use the following scheme. First we reduce the hash to 0..15
117 * and try a direct block. If that is occupied we reduce the hash to
118 * 16..255 and try an indirect block. Same for 2x and 3x indirect
119 * blocks. Lastly we reduce the hash to 0x800_0000 .. 0xffff_ffff,
120 * but use buckets containing eight entries instead of a single one.
121 *
122 * Using 16 entries should allow for a reasonable amount of hash
123 * collisions, so the 32bit name space can be packed fairly tight
124 * before overflowing. Oh and currently we don't overflow but return
125 * an error.
126 *
127 * How likely are collisions? Doing the appropriate math is beyond me
128 * and the Bronstein textbook. But running a test program to brute
129 * force collisions for a couple of days showed that on average the
130 * first collision occurs after 598M entries, with 290M being the
131 * smallest result. Obviously 21 entries could already cause a
132 * collision if all entries are carefully chosen.
133 */
134static pgoff_t hash_index(u32 hash, int round)
135{
136 u32 i0_blocks = I0_BLOCKS;
137 u32 i1_blocks = I1_BLOCKS;
138 u32 i2_blocks = I2_BLOCKS;
139 u32 i3_blocks = I3_BLOCKS;
140
141 switch (round) {
142 case 0:
143 return hash % i0_blocks;
144 case 1:
145 return i0_blocks + hash % (i1_blocks - i0_blocks);
146 case 2:
147 return i1_blocks + hash % (i2_blocks - i1_blocks);
148 case 3:
149 return i2_blocks + hash % (i3_blocks - i2_blocks);
150 case 4 ... 19:
151 return i3_blocks + 16 * (hash % (((1<<31) - i3_blocks) / 16))
152 + round - 4;
153 }
154 BUG();
155}
156
157static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry)
158{
159 struct qstr *name = &dentry->d_name;
160 struct page *page;
161 struct logfs_disk_dentry *dd;
162 u32 hash = hash_32(name->name, name->len, 0);
163 pgoff_t index;
164 int round;
165
166 if (name->len > LOGFS_MAX_NAMELEN)
167 return ERR_PTR(-ENAMETOOLONG);
168
169 for (round = 0; round < 20; round++) {
170 index = hash_index(hash, round);
171
172 if (beyond_eof(dir, index))
173 return NULL;
174 if (!logfs_exist_block(dir, index))
175 continue;
176 page = read_cache_page(dir->i_mapping, index,
177 (filler_t *)logfs_readpage, NULL);
178 if (IS_ERR(page))
179 return page;
180 dd = kmap_atomic(page, KM_USER0);
181 BUG_ON(dd->namelen == 0);
182
183 if (name->len != be16_to_cpu(dd->namelen) ||
184 memcmp(name->name, dd->name, name->len)) {
185 kunmap_atomic(dd, KM_USER0);
186 page_cache_release(page);
187 continue;
188 }
189
190 kunmap_atomic(dd, KM_USER0);
191 return page;
192 }
193 return NULL;
194}
195
196static int logfs_remove_inode(struct inode *inode)
197{
198 int ret;
199
200 inode->i_nlink--;
201 ret = write_inode(inode);
202 LOGFS_BUG_ON(ret, inode->i_sb);
203 return ret;
204}
205
206static void abort_transaction(struct inode *inode, struct logfs_transaction *ta)
207{
208 if (logfs_inode(inode)->li_block)
209 logfs_inode(inode)->li_block->ta = NULL;
210 kfree(ta);
211}
212
213static int logfs_unlink(struct inode *dir, struct dentry *dentry)
214{
215 struct logfs_super *super = logfs_super(dir->i_sb);
216 struct inode *inode = dentry->d_inode;
217 struct logfs_transaction *ta;
218 struct page *page;
219 pgoff_t index;
220 int ret;
221
222 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
223 if (!ta)
224 return -ENOMEM;
225
226 ta->state = UNLINK_1;
227 ta->ino = inode->i_ino;
228
229 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
230
231 page = logfs_get_dd_page(dir, dentry);
232 if (!page) {
233 kfree(ta);
234 return -ENOENT;
235 }
236 if (IS_ERR(page)) {
237 kfree(ta);
238 return PTR_ERR(page);
239 }
240 index = page->index;
241 page_cache_release(page);
242
243 mutex_lock(&super->s_dirop_mutex);
244 logfs_add_transaction(dir, ta);
245
246 ret = logfs_delete(dir, index, NULL);
247 if (!ret)
248 ret = write_inode(dir);
249
250 if (ret) {
251 abort_transaction(dir, ta);
252 printk(KERN_ERR"LOGFS: unable to delete inode\n");
253 goto out;
254 }
255
256 ta->state = UNLINK_2;
257 logfs_add_transaction(inode, ta);
258 ret = logfs_remove_inode(inode);
259out:
260 mutex_unlock(&super->s_dirop_mutex);
261 return ret;
262}
263
264static inline int logfs_empty_dir(struct inode *dir)
265{
266 u64 data;
267
268 data = logfs_seek_data(dir, 0) << dir->i_sb->s_blocksize_bits;
269 return data >= i_size_read(dir);
270}
271
272static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
273{
274 struct inode *inode = dentry->d_inode;
275
276 if (!logfs_empty_dir(inode))
277 return -ENOTEMPTY;
278
279 return logfs_unlink(dir, dentry);
280}
281
282/* FIXME: readdir currently has its own dir_walk code. I don't see a good
283 * way to combine the two copies */
284#define IMPLICIT_NODES 2
285static int __logfs_readdir(struct file *file, void *buf, filldir_t filldir)
286{
287 struct inode *dir = file->f_dentry->d_inode;
288 loff_t pos = file->f_pos - IMPLICIT_NODES;
289 struct page *page;
290 struct logfs_disk_dentry *dd;
291 int full;
292
293 BUG_ON(pos < 0);
294 for (;; pos++) {
295 if (beyond_eof(dir, pos))
296 break;
297 if (!logfs_exist_block(dir, pos)) {
298 /* deleted dentry */
299 pos = dir_seek_data(dir, pos);
300 continue;
301 }
302 page = read_cache_page(dir->i_mapping, pos,
303 (filler_t *)logfs_readpage, NULL);
304 if (IS_ERR(page))
305 return PTR_ERR(page);
306 dd = kmap_atomic(page, KM_USER0);
307 BUG_ON(dd->namelen == 0);
308
309 full = filldir(buf, (char *)dd->name, be16_to_cpu(dd->namelen),
310 pos, be64_to_cpu(dd->ino), dd->type);
311 kunmap_atomic(dd, KM_USER0);
312 page_cache_release(page);
313 if (full)
314 break;
315 }
316
317 file->f_pos = pos + IMPLICIT_NODES;
318 return 0;
319}
320
321static int logfs_readdir(struct file *file, void *buf, filldir_t filldir)
322{
323 struct inode *inode = file->f_dentry->d_inode;
324 ino_t pino = parent_ino(file->f_dentry);
325 int err;
326
327 if (file->f_pos < 0)
328 return -EINVAL;
329
330 if (file->f_pos == 0) {
331 if (filldir(buf, ".", 1, 1, inode->i_ino, DT_DIR) < 0)
332 return 0;
333 file->f_pos++;
334 }
335 if (file->f_pos == 1) {
336 if (filldir(buf, "..", 2, 2, pino, DT_DIR) < 0)
337 return 0;
338 file->f_pos++;
339 }
340
341 err = __logfs_readdir(file, buf, filldir);
342 return err;
343}
344
345static void logfs_set_name(struct logfs_disk_dentry *dd, struct qstr *name)
346{
347 dd->namelen = cpu_to_be16(name->len);
348 memcpy(dd->name, name->name, name->len);
349}
350
351static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry,
352 struct nameidata *nd)
353{
354 struct page *page;
355 struct logfs_disk_dentry *dd;
356 pgoff_t index;
357 u64 ino = 0;
358 struct inode *inode;
359
360 page = logfs_get_dd_page(dir, dentry);
361 if (IS_ERR(page))
362 return ERR_CAST(page);
363 if (!page) {
364 d_add(dentry, NULL);
365 return NULL;
366 }
367 index = page->index;
368 dd = kmap_atomic(page, KM_USER0);
369 ino = be64_to_cpu(dd->ino);
370 kunmap_atomic(dd, KM_USER0);
371 page_cache_release(page);
372
373 inode = logfs_iget(dir->i_sb, ino);
374 if (IS_ERR(inode)) {
375 printk(KERN_ERR"LogFS: Cannot read inode #%llx for dentry (%lx, %lx)n",
376 ino, dir->i_ino, index);
377 return ERR_CAST(inode);
378 }
379 return d_splice_alias(inode, dentry);
380}
381
382static void grow_dir(struct inode *dir, loff_t index)
383{
384 index = (index + 1) << dir->i_sb->s_blocksize_bits;
385 if (i_size_read(dir) < index)
386 i_size_write(dir, index);
387}
388
389static int logfs_write_dir(struct inode *dir, struct dentry *dentry,
390 struct inode *inode)
391{
392 struct page *page;
393 struct logfs_disk_dentry *dd;
394 u32 hash = hash_32(dentry->d_name.name, dentry->d_name.len, 0);
395 pgoff_t index;
396 int round, err;
397
398 for (round = 0; round < 20; round++) {
399 index = hash_index(hash, round);
400
401 if (logfs_exist_block(dir, index))
402 continue;
403 page = find_or_create_page(dir->i_mapping, index, GFP_KERNEL);
404 if (!page)
405 return -ENOMEM;
406
407 dd = kmap_atomic(page, KM_USER0);
408 memset(dd, 0, sizeof(*dd));
409 dd->ino = cpu_to_be64(inode->i_ino);
410 dd->type = logfs_type(inode);
411 logfs_set_name(dd, &dentry->d_name);
412 kunmap_atomic(dd, KM_USER0);
413
414 err = logfs_write_buf(dir, page, WF_LOCK);
415 unlock_page(page);
416 page_cache_release(page);
417 if (!err)
418 grow_dir(dir, index);
419 return err;
420 }
421 /* FIXME: Is there a better return value? In most cases neither
422 * the filesystem nor the directory are full. But we have had
423 * too many collisions for this particular hash and no fallback.
424 */
425 return -ENOSPC;
426}
427
428static int __logfs_create(struct inode *dir, struct dentry *dentry,
429 struct inode *inode, const char *dest, long destlen)
430{
431 struct logfs_super *super = logfs_super(dir->i_sb);
432 struct logfs_inode *li = logfs_inode(inode);
433 struct logfs_transaction *ta;
434 int ret;
435
436 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
437 if (!ta)
438 return -ENOMEM;
439
440 ta->state = CREATE_1;
441 ta->ino = inode->i_ino;
442 mutex_lock(&super->s_dirop_mutex);
443 logfs_add_transaction(inode, ta);
444
445 if (dest) {
446 /* symlink */
447 ret = logfs_inode_write(inode, dest, destlen, 0, WF_LOCK, NULL);
448 if (!ret)
449 ret = write_inode(inode);
450 } else {
451 /* creat/mkdir/mknod */
452 ret = write_inode(inode);
453 }
454 if (ret) {
455 abort_transaction(inode, ta);
456 li->li_flags |= LOGFS_IF_STILLBORN;
457 /* FIXME: truncate symlink */
458 inode->i_nlink--;
459 iput(inode);
460 goto out;
461 }
462
463 ta->state = CREATE_2;
464 logfs_add_transaction(dir, ta);
465 ret = logfs_write_dir(dir, dentry, inode);
466 /* sync directory */
467 if (!ret)
468 ret = write_inode(dir);
469
470 if (ret) {
471 logfs_del_transaction(dir, ta);
472 ta->state = CREATE_2;
473 logfs_add_transaction(inode, ta);
474 logfs_remove_inode(inode);
475 iput(inode);
476 goto out;
477 }
478 d_instantiate(dentry, inode);
479out:
480 mutex_unlock(&super->s_dirop_mutex);
481 return ret;
482}
483
484static int logfs_mkdir(struct inode *dir, struct dentry *dentry, int mode)
485{
486 struct inode *inode;
487
488 /*
489 * FIXME: why do we have to fill in S_IFDIR, while the mode is
490 * correct for mknod, creat, etc.? Smells like the vfs *should*
491 * do it for us but for some reason fails to do so.
492 */
493 inode = logfs_new_inode(dir, S_IFDIR | mode);
494 if (IS_ERR(inode))
495 return PTR_ERR(inode);
496
497 inode->i_op = &logfs_dir_iops;
498 inode->i_fop = &logfs_dir_fops;
499
500 return __logfs_create(dir, dentry, inode, NULL, 0);
501}
502
503static int logfs_create(struct inode *dir, struct dentry *dentry, int mode,
504 struct nameidata *nd)
505{
506 struct inode *inode;
507
508 inode = logfs_new_inode(dir, mode);
509 if (IS_ERR(inode))
510 return PTR_ERR(inode);
511
512 inode->i_op = &logfs_reg_iops;
513 inode->i_fop = &logfs_reg_fops;
514 inode->i_mapping->a_ops = &logfs_reg_aops;
515
516 return __logfs_create(dir, dentry, inode, NULL, 0);
517}
518
519static int logfs_mknod(struct inode *dir, struct dentry *dentry, int mode,
520 dev_t rdev)
521{
522 struct inode *inode;
523
524 if (dentry->d_name.len > LOGFS_MAX_NAMELEN)
525 return -ENAMETOOLONG;
526
527 inode = logfs_new_inode(dir, mode);
528 if (IS_ERR(inode))
529 return PTR_ERR(inode);
530
531 init_special_inode(inode, mode, rdev);
532
533 return __logfs_create(dir, dentry, inode, NULL, 0);
534}
535
536static int logfs_symlink(struct inode *dir, struct dentry *dentry,
537 const char *target)
538{
539 struct inode *inode;
540 size_t destlen = strlen(target) + 1;
541
542 if (destlen > dir->i_sb->s_blocksize)
543 return -ENAMETOOLONG;
544
545 inode = logfs_new_inode(dir, S_IFLNK | 0777);
546 if (IS_ERR(inode))
547 return PTR_ERR(inode);
548
549 inode->i_op = &logfs_symlink_iops;
550 inode->i_mapping->a_ops = &logfs_reg_aops;
551
552 return __logfs_create(dir, dentry, inode, target, destlen);
553}
554
555static int logfs_permission(struct inode *inode, int mask)
556{
557 return generic_permission(inode, mask, NULL);
558}
559
560static int logfs_link(struct dentry *old_dentry, struct inode *dir,
561 struct dentry *dentry)
562{
563 struct inode *inode = old_dentry->d_inode;
564
565 if (inode->i_nlink >= LOGFS_LINK_MAX)
566 return -EMLINK;
567
568 inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME;
569 atomic_inc(&inode->i_count);
570 inode->i_nlink++;
571 mark_inode_dirty_sync(inode);
572
573 return __logfs_create(dir, dentry, inode, NULL, 0);
574}
575
576static int logfs_get_dd(struct inode *dir, struct dentry *dentry,
577 struct logfs_disk_dentry *dd, loff_t *pos)
578{
579 struct page *page;
580 void *map;
581
582 page = logfs_get_dd_page(dir, dentry);
583 if (IS_ERR(page))
584 return PTR_ERR(page);
585 *pos = page->index;
586 map = kmap_atomic(page, KM_USER0);
587 memcpy(dd, map, sizeof(*dd));
588 kunmap_atomic(map, KM_USER0);
589 page_cache_release(page);
590 return 0;
591}
592
593static int logfs_delete_dd(struct inode *dir, loff_t pos)
594{
595 /*
596 * Getting called with pos somewhere beyond eof is either a goofup
597 * within this file or means someone maliciously edited the
598 * (crc-protected) journal.
599 */
600 BUG_ON(beyond_eof(dir, pos));
601 dir->i_ctime = dir->i_mtime = CURRENT_TIME;
602 log_dir(" Delete dentry (%lx, %llx)\n", dir->i_ino, pos);
603 return logfs_delete(dir, pos, NULL);
604}
605
606/*
607 * Cross-directory rename, target does not exist. Just a little nasty.
608 * Create a new dentry in the target dir, then remove the old dentry,
609 * all the while taking care to remember our operation in the journal.
610 */
/*
 * Returns 0 on success, -ENOMEM, or the error from locating the source
 * dentry or from writing the target dentry.  A failure while removing the
 * source dentry additionally triggers LOGFS_BUG_ON.
 */
611static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry,
612 struct inode *new_dir, struct dentry *new_dentry)
613{
614 struct logfs_super *super = logfs_super(old_dir->i_sb);
615 struct logfs_disk_dentry dd;
616 struct logfs_transaction *ta;
617 loff_t pos;
618 int err;
619
620 /* 1. locate source dd */
621 err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
622 if (err)
623 return err;
624
625 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
626 if (!ta)
627 return -ENOMEM;
628
 /* The transaction remembers where the old dentry lives. */
629 ta->state = CROSS_RENAME_1;
630 ta->dir = old_dir->i_ino;
631 ta->pos = pos;
632
633 /* 2. write target dd */
634 mutex_lock(&super->s_dirop_mutex);
635 logfs_add_transaction(new_dir, ta);
636 err = logfs_write_dir(new_dir, new_dentry, old_dentry->d_inode);
637 if (!err)
638 err = write_inode(new_dir);
639
640 if (err) {
 /* Nothing was renamed: forget the old dentry and free ta. */
641 super->s_rename_dir = 0;
642 super->s_rename_pos = 0;
643 abort_transaction(new_dir, ta);
644 goto out;
645 }
646
647 /* 3. remove source dd */
648 ta->state = CROSS_RENAME_2;
649 logfs_add_transaction(old_dir, ta);
650 err = logfs_delete_dd(old_dir, pos);
651 if (!err)
652 err = write_inode(old_dir);
 /* The new dentry is already written, so failing here is fatal. */
653 LOGFS_BUG_ON(err, old_dir->i_sb);
654out:
655 mutex_unlock(&super->s_dirop_mutex);
656 return err;
657}
658
659static int logfs_replace_inode(struct inode *dir, struct dentry *dentry,
660 struct logfs_disk_dentry *dd, struct inode *inode)
661{
662 loff_t pos;
663 int err;
664
665 err = logfs_get_dd(dir, dentry, dd, &pos);
666 if (err)
667 return err;
668 dd->ino = cpu_to_be64(inode->i_ino);
669 dd->type = logfs_type(inode);
670
671 err = write_dir(dir, dd, pos);
672 if (err)
673 return err;
674 log_dir("Replace dentry (%lx, %llx) %s -> %llx\n", dir->i_ino, pos,
675 dd->name, be64_to_cpu(dd->ino));
676 return write_inode(dir);
677}
678
679/* Target dentry exists - the worst case. We need to attach the source
680 * inode to the target dentry, then remove the orphaned target inode and
681 * source dentry.
682 */
/*
 * Returns 0 on success, -ENOTEMPTY when the target is a non-empty
 * directory, -ENOMEM, or the error from locating the source dentry, from
 * re-pointing the target dentry or from removing the victim inode.
 */
683static int logfs_rename_target(struct inode *old_dir, struct dentry *old_dentry,
684 struct inode *new_dir, struct dentry *new_dentry)
685{
686 struct logfs_super *super = logfs_super(old_dir->i_sb);
687 struct inode *old_inode = old_dentry->d_inode;
688 struct inode *new_inode = new_dentry->d_inode;
689 int isdir = S_ISDIR(old_inode->i_mode);
690 struct logfs_disk_dentry dd;
691 struct logfs_transaction *ta;
692 loff_t pos;
693 int err;
694
 /* Source and target must both be directories or both be non-directories. */
695 BUG_ON(isdir != S_ISDIR(new_inode->i_mode));
696 if (isdir) {
697 if (!logfs_empty_dir(new_inode))
698 return -ENOTEMPTY;
699 }
700
701 /* 1. locate source dd */
702 err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
703 if (err)
704 return err;
705
706 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
707 if (!ta)
708 return -ENOMEM;
709
 /* Remember the old dentry (dir, pos) and the victim inode. */
710 ta->state = TARGET_RENAME_1;
711 ta->dir = old_dir->i_ino;
712 ta->pos = pos;
713 ta->ino = new_inode->i_ino;
714
715 /* 2. attach source inode to target dd */
716 mutex_lock(&super->s_dirop_mutex);
717 logfs_add_transaction(new_dir, ta);
718 err = logfs_replace_inode(new_dir, new_dentry, &dd, old_inode);
719 if (err) {
 /* Nothing was changed: clear the remembered state and free ta. */
720 super->s_rename_dir = 0;
721 super->s_rename_pos = 0;
722 super->s_victim_ino = 0;
723 abort_transaction(new_dir, ta);
724 goto out;
725 }
726
727 /* 3. remove source dd */
728 ta->state = TARGET_RENAME_2;
729 logfs_add_transaction(old_dir, ta);
730 err = logfs_delete_dd(old_dir, pos);
731 if (!err)
732 err = write_inode(old_dir);
 /* The target dentry already points at the source inode; failing here is fatal. */
733 LOGFS_BUG_ON(err, old_dir->i_sb);
734
735 /* 4. remove target inode */
736 ta->state = TARGET_RENAME_3;
737 logfs_add_transaction(new_inode, ta);
738 err = logfs_remove_inode(new_inode);
739
740out:
741 mutex_unlock(&super->s_dirop_mutex);
742 return err;
743}
744
745static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry,
746 struct inode *new_dir, struct dentry *new_dentry)
747{
748 if (new_dentry->d_inode)
749 return logfs_rename_target(old_dir, old_dentry,
750 new_dir, new_dentry);
751 return logfs_rename_cross(old_dir, old_dentry, new_dir, new_dentry);
752}
753
754/* No locking done here, as this is called before .get_sb() returns. */
/*
 * Finish directory operations recorded in the superblock: remove the
 * victim inode named by s_victim_ino and delete the stale dentry named by
 * s_rename_dir/s_rename_pos, when set.
 *
 * Returns 0 on success.  Any failure triggers LOGFS_BUG and returns -EIO.
 */
755int logfs_replay_journal(struct super_block *sb)
756{
757 struct logfs_super *super = logfs_super(sb);
758 struct inode *inode;
759 u64 ino, pos;
760 int err;
761
762 if (super->s_victim_ino) {
763 /* delete victim inode */
764 ino = super->s_victim_ino;
765 printk(KERN_INFO"LogFS: delete unmapped inode #%llx\n", ino);
766 inode = logfs_iget(sb, ino);
767 if (IS_ERR(inode))
768 goto fail;
769
770 LOGFS_BUG_ON(i_size_read(inode) > 0, sb);
 /* Cleared before the removal; put back below if the removal fails. */
771 super->s_victim_ino = 0;
772 err = logfs_remove_inode(inode);
773 iput(inode);
774 if (err) {
775 super->s_victim_ino = ino;
776 goto fail;
777 }
778 }
779 if (super->s_rename_dir) {
780 /* delete old dd from rename */
781 ino = super->s_rename_dir;
782 pos = super->s_rename_pos;
783 printk(KERN_INFO"LogFS: delete unbacked dentry (%llx, %llx)\n",
784 ino, pos);
785 inode = logfs_iget(sb, ino);
786 if (IS_ERR(inode))
787 goto fail;
788
 /* Cleared before the deletion; put back below if the deletion fails. */
789 super->s_rename_dir = 0;
790 super->s_rename_pos = 0;
791 err = logfs_delete_dd(inode, pos);
792 iput(inode);
793 if (err) {
794 super->s_rename_dir = ino;
795 super->s_rename_pos = pos;
796 goto fail;
797 }
798 }
799 return 0;
800fail:
 /* The specific error is not propagated; callers always see -EIO. */
801 LOGFS_BUG(sb);
802 return -EIO;
803}
804
805const struct inode_operations logfs_symlink_iops = {
806 .readlink = generic_readlink,
807 .follow_link = page_follow_link_light,
808};
809
810const struct inode_operations logfs_dir_iops = {
811 .create = logfs_create,
812 .link = logfs_link,
813 .lookup = logfs_lookup,
814 .mkdir = logfs_mkdir,
815 .mknod = logfs_mknod,
816 .rename = logfs_rename,
817 .rmdir = logfs_rmdir,
818 .permission = logfs_permission,
819 .symlink = logfs_symlink,
820 .unlink = logfs_unlink,
821};
822const struct file_operations logfs_dir_fops = {
823 .fsync = logfs_fsync,
824 .ioctl = logfs_ioctl,
825 .readdir = logfs_readdir,
826 .read = generic_read_dir,
827};
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
new file mode 100644
index 000000000000..370f367a933e
--- /dev/null
+++ b/fs/logfs/file.c
@@ -0,0 +1,263 @@
1/*
2 * fs/logfs/file.c - prepare_write, commit_write and friends
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/sched.h>
10#include <linux/writeback.h>
11
12static int logfs_write_begin(struct file *file, struct address_space *mapping,
13 loff_t pos, unsigned len, unsigned flags,
14 struct page **pagep, void **fsdata)
15{
16 struct inode *inode = mapping->host;
17 struct page *page;
18 pgoff_t index = pos >> PAGE_CACHE_SHIFT;
19
20 page = grab_cache_page_write_begin(mapping, index, flags);
21 if (!page)
22 return -ENOMEM;
23 *pagep = page;
24
25 if ((len == PAGE_CACHE_SIZE) || PageUptodate(page))
26 return 0;
27 if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) {
28 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
29 unsigned end = start + len;
30
31 /* Reading beyond i_size is simple: memset to zero */
32 zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE);
33 return 0;
34 }
35 return logfs_readpage_nolock(page);
36}
37
38static int logfs_write_end(struct file *file, struct address_space *mapping,
39 loff_t pos, unsigned len, unsigned copied, struct page *page,
40 void *fsdata)
41{
42 struct inode *inode = mapping->host;
43 pgoff_t index = page->index;
44 unsigned start = pos & (PAGE_CACHE_SIZE - 1);
45 unsigned end = start + copied;
46 int ret = 0;
47
48 BUG_ON(PAGE_CACHE_SIZE != inode->i_sb->s_blocksize);
49 BUG_ON(page->index > I3_BLOCKS);
50
51 if (copied < len) {
52 /*
53 * Short write of a non-initialized paged. Just tell userspace
54 * to retry the entire page.
55 */
56 if (!PageUptodate(page)) {
57 copied = 0;
58 goto out;
59 }
60 }
61 if (copied == 0)
62 goto out; /* FIXME: do we need to update inode? */
63
64 if (i_size_read(inode) < (index << PAGE_CACHE_SHIFT) + end) {
65 i_size_write(inode, (index << PAGE_CACHE_SHIFT) + end);
66 mark_inode_dirty_sync(inode);
67 }
68
69 SetPageUptodate(page);
70 if (!PageDirty(page)) {
71 if (!get_page_reserve(inode, page))
72 __set_page_dirty_nobuffers(page);
73 else
74 ret = logfs_write_buf(inode, page, WF_LOCK);
75 }
76out:
77 unlock_page(page);
78 page_cache_release(page);
79 return ret ? ret : copied;
80}
81
/* Read @page via logfs_readpage_nolock(), then unlock it in every case. */
int logfs_readpage(struct file *file, struct page *page)
{
	int err = logfs_readpage_nolock(page);

	unlock_page(page);
	return err;
}
90
91/* Clear the page's dirty flag in the radix tree. */
92/* TODO: mucking with PageWriteback is silly. Add a generic function to clear
93 * the dirty bit from the radix tree for filesystems that don't have to wait
94 * for page writeback to finish (i.e. any compressing filesystem).
95 */
96static void clear_radix_tree_dirty(struct page *page)
97{
98 BUG_ON(PagePrivate(page) || page->private);
99 set_page_writeback(page);
100 end_page_writeback(page);
101}
102
103static int __logfs_writepage(struct page *page)
104{
105 struct inode *inode = page->mapping->host;
106 int err;
107
108 err = logfs_write_buf(inode, page, WF_LOCK);
109 if (err)
110 set_page_dirty(page);
111 else
112 clear_radix_tree_dirty(page);
113 unlock_page(page);
114 return err;
115}
116
117static int logfs_writepage(struct page *page, struct writeback_control *wbc)
118{
119 struct inode *inode = page->mapping->host;
120 loff_t i_size = i_size_read(inode);
121 pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT;
122 unsigned offset;
123 u64 bix;
124 level_t level;
125
126 log_file("logfs_writepage(%lx, %lx, %p)\n", inode->i_ino, page->index,
127 page);
128
129 logfs_unpack_index(page->index, &bix, &level);
130
131 /* Indirect blocks are never truncated */
132 if (level != 0)
133 return __logfs_writepage(page);
134
135 /*
136 * TODO: everything below is a near-verbatim copy of nobh_writepage().
137 * The relevant bits should be factored out after logfs is merged.
138 */
139
140 /* Is the page fully inside i_size? */
141 if (bix < end_index)
142 return __logfs_writepage(page);
143
144 /* Is the page fully outside i_size? (truncate in progress) */
145 offset = i_size & (PAGE_CACHE_SIZE-1);
146 if (bix > end_index || offset == 0) {
147 unlock_page(page);
148 return 0; /* don't care */
149 }
150
151 /*
152 * The page straddles i_size. It must be zeroed out on each and every
153 * writepage invokation because it may be mmapped. "A file is mapped
154 * in multiples of the page size. For a file that is not a multiple of
155 * the page size, the remaining memory is zeroed when mapped, and
156 * writes to that region are not written out to the file."
157 */
158 zero_user_segment(page, offset, PAGE_CACHE_SIZE);
159 return __logfs_writepage(page);
160}
161
162static void logfs_invalidatepage(struct page *page, unsigned long offset)
163{
164 move_page_to_btree(page);
165 BUG_ON(PagePrivate(page) || page->private);
166}
167
168static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this)
169{
170 return 0; /* None of these are easy to release */
171}
172
173
174int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
175 unsigned long arg)
176{
177 struct logfs_inode *li = logfs_inode(inode);
178 unsigned int oldflags, flags;
179 int err;
180
181 switch (cmd) {
182 case FS_IOC_GETFLAGS:
183 flags = li->li_flags & LOGFS_FL_USER_VISIBLE;
184 return put_user(flags, (int __user *)arg);
185 case FS_IOC_SETFLAGS:
186 if (IS_RDONLY(inode))
187 return -EROFS;
188
189 if (!is_owner_or_cap(inode))
190 return -EACCES;
191
192 err = get_user(flags, (int __user *)arg);
193 if (err)
194 return err;
195
196 mutex_lock(&inode->i_mutex);
197 oldflags = li->li_flags;
198 flags &= LOGFS_FL_USER_MODIFIABLE;
199 flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE;
200 li->li_flags = flags;
201 mutex_unlock(&inode->i_mutex);
202
203 inode->i_ctime = CURRENT_TIME;
204 mark_inode_dirty_sync(inode);
205 return 0;
206
207 default:
208 return -ENOTTY;
209 }
210}
211
212int logfs_fsync(struct file *file, struct dentry *dentry, int datasync)
213{
214 struct super_block *sb = dentry->d_inode->i_sb;
215 struct logfs_super *super = logfs_super(sb);
216
217 /* FIXME: write anchor */
218 super->s_devops->sync(sb);
219 return 0;
220}
221
222static int logfs_setattr(struct dentry *dentry, struct iattr *attr)
223{
224 struct inode *inode = dentry->d_inode;
225 int err = 0;
226
227 if (attr->ia_valid & ATTR_SIZE)
228 err = logfs_truncate(inode, attr->ia_size);
229 attr->ia_valid &= ~ATTR_SIZE;
230
231 if (!err)
232 err = inode_change_ok(inode, attr);
233 if (!err)
234 err = inode_setattr(inode, attr);
235 return err;
236}
237
238const struct inode_operations logfs_reg_iops = {
239 .setattr = logfs_setattr,
240};
241
242const struct file_operations logfs_reg_fops = {
243 .aio_read = generic_file_aio_read,
244 .aio_write = generic_file_aio_write,
245 .fsync = logfs_fsync,
246 .ioctl = logfs_ioctl,
247 .llseek = generic_file_llseek,
248 .mmap = generic_file_readonly_mmap,
249 .open = generic_file_open,
250 .read = do_sync_read,
251 .write = do_sync_write,
252};
253
254const struct address_space_operations logfs_reg_aops = {
255 .invalidatepage = logfs_invalidatepage,
256 .readpage = logfs_readpage,
257 .releasepage = logfs_releasepage,
258 .set_page_dirty = __set_page_dirty_nobuffers,
259 .writepage = logfs_writepage,
260 .writepages = generic_writepages,
261 .write_begin = logfs_write_begin,
262 .write_end = logfs_write_end,
263};
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
new file mode 100644
index 000000000000..92949f95a901
--- /dev/null
+++ b/fs/logfs/gc.c
@@ -0,0 +1,730 @@
1/*
2 * fs/logfs/gc.c - garbage collection code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/sched.h>
10
11/*
12 * Wear leveling needs to kick in when the difference between low erase
13 * counts and high erase counts gets too big. A good value for "too big"
14 * may be somewhat below 10% of maximum erase count for the device.
15 * Why not 397, to pick a nice round number with no specific meaning? :)
16 *
17 * WL_RATELIMIT is the minimum time between two wear level events. A huge
18 * number of segments may fulfil the requirements for wear leveling at the
19 * same time. If that happens we don't want to cause a latency from hell,
20 * but just gently pick one segment every so often and minimize overhead.
21 */
#define WL_DELTA		397	/* erase count gap used by wear leveling */
#define WL_RATELIMIT		100	/* added to s_gec to schedule the next wear level event */
#define MAX_OBJ_ALIASES		2600	/* alias count from which GC writes aliases back */
#define SCAN_RATIO		512	/* number of scanned segments per gc'd segment */
#define LIST_SIZE		64	/* base size of candidate lists */
#define SCAN_ROUNDS		128	/* maximum number of complete medium scans */
#define SCAN_ROUNDS_HIGH	4	/* maximum number of higher-level scans */
29
30static int no_free_segments(struct super_block *sb)
31{
32 struct logfs_super *super = logfs_super(sb);
33
34 return super->s_free_list.count;
35}
36
37/* journal has distance -1, top-most ifile layer distance 0 */
38static u8 root_distance(struct super_block *sb, gc_level_t __gc_level)
39{
40 struct logfs_super *super = logfs_super(sb);
41 u8 gc_level = (__force u8)__gc_level;
42
43 switch (gc_level) {
44 case 0: /* fall through */
45 case 1: /* fall through */
46 case 2: /* fall through */
47 case 3:
48 /* file data or indirect blocks */
49 return super->s_ifile_levels + super->s_iblock_levels - gc_level;
50 case 6: /* fall through */
51 case 7: /* fall through */
52 case 8: /* fall through */
53 case 9:
54 /* inode file data or indirect blocks */
55 return super->s_ifile_levels - (gc_level - 6);
56 default:
57 printk(KERN_ERR"LOGFS: segment of unknown level %x found\n",
58 gc_level);
59 WARN_ON(1);
60 return super->s_ifile_levels + super->s_iblock_levels;
61 }
62}
63
64static int segment_is_reserved(struct super_block *sb, u32 segno)
65{
66 struct logfs_super *super = logfs_super(sb);
67 struct logfs_area *area;
68 void *reserved;
69 int i;
70
71 /* Some segments are reserved. Just pretend they were all valid */
72 reserved = btree_lookup32(&super->s_reserved_segments, segno);
73 if (reserved)
74 return 1;
75
76 /* Currently open segments */
77 for_each_area(i) {
78 area = super->s_area[i];
79 if (area->a_is_open && area->a_segno == segno)
80 return 1;
81 }
82
83 return 0;
84}
85
86static void logfs_mark_segment_bad(struct super_block *sb, u32 segno)
87{
88 BUG();
89}
90
91/*
92 * Returns the bytes consumed by valid objects in this segment. Object headers
93 * are counted, the segment header is not.
94 */
95static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec,
96 gc_level_t *gc_level)
97{
98 struct logfs_segment_entry se;
99 u32 ec_level;
100
101 logfs_get_segment_entry(sb, segno, &se);
102 if (se.ec_level == cpu_to_be32(BADSEG) ||
103 se.valid == cpu_to_be32(RESERVED))
104 return RESERVED;
105
106 ec_level = be32_to_cpu(se.ec_level);
107 *ec = ec_level >> 4;
108 *gc_level = GC_LEVEL(ec_level & 0xf);
109 return be32_to_cpu(se.valid);
110}
111
112static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
113 u64 bix, gc_level_t gc_level)
114{
115 struct inode *inode;
116 int err, cookie;
117
118 inode = logfs_safe_iget(sb, ino, &cookie);
119 err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0);
120 BUG_ON(err);
121 logfs_safe_iput(inode, cookie);
122}
123
124static u32 logfs_gc_segment(struct super_block *sb, u32 segno, u8 dist)
125{
126 struct logfs_super *super = logfs_super(sb);
127 struct logfs_segment_header sh;
128 struct logfs_object_header oh;
129 u64 ofs, ino, bix;
130 u32 seg_ofs, logical_segno, cleaned = 0;
131 int err, len, valid;
132 gc_level_t gc_level;
133
134 LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb);
135
136 btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS);
137 err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
138 BUG_ON(err);
139 gc_level = GC_LEVEL(sh.level);
140 logical_segno = be32_to_cpu(sh.segno);
141 if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) {
142 logfs_mark_segment_bad(sb, segno);
143 cleaned = -1;
144 goto out;
145 }
146
147 for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE;
148 seg_ofs + sizeof(oh) < super->s_segsize; ) {
149 ofs = dev_ofs(sb, logical_segno, seg_ofs);
150 err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh),
151 &oh);
152 BUG_ON(err);
153
154 if (!memchr_inv(&oh, 0xff, sizeof(oh)))
155 break;
156
157 if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) {
158 logfs_mark_segment_bad(sb, segno);
159 cleaned = super->s_segsize - 1;
160 goto out;
161 }
162
163 ino = be64_to_cpu(oh.ino);
164 bix = be64_to_cpu(oh.bix);
165 len = sizeof(oh) + be16_to_cpu(oh.len);
166 valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level);
167 if (valid == 1) {
168 logfs_cleanse_block(sb, ofs, ino, bix, gc_level);
169 cleaned += len;
170 } else if (valid == 2) {
171 /* Will be invalid upon journal commit */
172 cleaned += len;
173 }
174 seg_ofs += len;
175 }
176out:
177 btree_remove32(&super->s_reserved_segments, segno);
178 return cleaned;
179}
180
181static struct gc_candidate *add_list(struct gc_candidate *cand,
182 struct candidate_list *list)
183{
184 struct rb_node **p = &list->rb_tree.rb_node;
185 struct rb_node *parent = NULL;
186 struct gc_candidate *cur;
187 int comp;
188
189 cand->list = list;
190 while (*p) {
191 parent = *p;
192 cur = rb_entry(parent, struct gc_candidate, rb_node);
193
194 if (list->sort_by_ec)
195 comp = cand->erase_count < cur->erase_count;
196 else
197 comp = cand->valid < cur->valid;
198
199 if (comp)
200 p = &parent->rb_left;
201 else
202 p = &parent->rb_right;
203 }
204 rb_link_node(&cand->rb_node, parent, p);
205 rb_insert_color(&cand->rb_node, &list->rb_tree);
206
207 if (list->count <= list->maxcount) {
208 list->count++;
209 return NULL;
210 }
211 cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node);
212 rb_erase(&cand->rb_node, &list->rb_tree);
213 cand->list = NULL;
214 return cand;
215}
216
217static void remove_from_list(struct gc_candidate *cand)
218{
219 struct candidate_list *list = cand->list;
220
221 rb_erase(&cand->rb_node, &list->rb_tree);
222 list->count--;
223}
224
225static void free_candidate(struct super_block *sb, struct gc_candidate *cand)
226{
227 struct logfs_super *super = logfs_super(sb);
228
229 btree_remove32(&super->s_cand_tree, cand->segno);
230 kfree(cand);
231}
232
233u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec)
234{
235 struct gc_candidate *cand;
236 u32 segno;
237
238 BUG_ON(list->count == 0);
239
240 cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
241 remove_from_list(cand);
242 segno = cand->segno;
243 if (ec)
244 *ec = cand->erase_count;
245 free_candidate(sb, cand);
246 return segno;
247}
248
249/*
250 * We have several lists to manage segments with. The reserve_list is used to
251 * deal with bad blocks. We try to keep the best (lowest ec) segments on this
252 * list.
253 * The free_list contains free segments for normal usage. It usually gets the
254 * second pick after the reserve_list. But when the free_list is running short
255 * it is more important to keep the free_list full than to keep a reserve.
256 *
257 * Segments that are not free are put onto a per-level low_list. If we have
258 * to run garbage collection, we pick a candidate from there. All segments on
259 * those lists should have at least some free space so GC will make progress.
260 *
261 * And last we have the ec_list, which is used to pick segments for wear
262 * leveling.
263 *
264 * If all appropriate lists are full, we simply free the candidate and forget
265 * about that segment for a while. We have better candidates for each purpose.
266 */
267static void __add_candidate(struct super_block *sb, struct gc_candidate *cand)
268{
269 struct logfs_super *super = logfs_super(sb);
270 u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE;
271
272 if (cand->valid == 0) {
273 /* 100% free segments */
274 log_gc_noisy("add reserve segment %x (ec %x) at %llx\n",
275 cand->segno, cand->erase_count,
276 dev_ofs(sb, cand->segno, 0));
277 cand = add_list(cand, &super->s_reserve_list);
278 if (cand) {
279 log_gc_noisy("add free segment %x (ec %x) at %llx\n",
280 cand->segno, cand->erase_count,
281 dev_ofs(sb, cand->segno, 0));
282 cand = add_list(cand, &super->s_free_list);
283 }
284 } else {
285 /* good candidates for Garbage Collection */
286 if (cand->valid < full)
287 cand = add_list(cand, &super->s_low_list[cand->dist]);
288 /* good candidates for wear leveling,
289 * segments that were recently written get ignored */
290 if (cand)
291 cand = add_list(cand, &super->s_ec_list);
292 }
293 if (cand)
294 free_candidate(sb, cand);
295}
296
297static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec,
298 u8 dist)
299{
300 struct logfs_super *super = logfs_super(sb);
301 struct gc_candidate *cand;
302
303 cand = kmalloc(sizeof(*cand), GFP_NOFS);
304 if (!cand)
305 return -ENOMEM;
306
307 cand->segno = segno;
308 cand->valid = valid;
309 cand->erase_count = ec;
310 cand->dist = dist;
311
312 btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS);
313 __add_candidate(sb, cand);
314 return 0;
315}
316
317static void remove_segment_from_lists(struct super_block *sb, u32 segno)
318{
319 struct logfs_super *super = logfs_super(sb);
320 struct gc_candidate *cand;
321
322 cand = btree_lookup32(&super->s_cand_tree, segno);
323 if (cand) {
324 remove_from_list(cand);
325 free_candidate(sb, cand);
326 }
327}
328
329static void scan_segment(struct super_block *sb, u32 segno)
330{
331 u32 valid, ec = 0;
332 gc_level_t gc_level = 0;
333 u8 dist;
334
335 if (segment_is_reserved(sb, segno))
336 return;
337
338 remove_segment_from_lists(sb, segno);
339 valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
340 if (valid == RESERVED)
341 return;
342
343 dist = root_distance(sb, gc_level);
344 add_candidate(sb, segno, valid, ec, dist);
345}
346
347static struct gc_candidate *first_in_list(struct candidate_list *list)
348{
349 if (list->count == 0)
350 return NULL;
351 return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
352}
353
354/*
355 * Find the best segment for garbage collection. Main criterion is
356 * the segment requiring the least effort to clean. Secondary
357 * criterion is to GC on the lowest level available.
358 *
359 * So we search the least effort segment on the lowest level first,
360 * then move up and pick another segment iff is requires significantly
361 * less effort. Hence the LOGFS_MAX_OBJECTSIZE in the comparison.
362 */
363static struct gc_candidate *get_candidate(struct super_block *sb)
364{
365 struct logfs_super *super = logfs_super(sb);
366 int i, max_dist;
367 struct gc_candidate *cand = NULL, *this;
368
369 max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS);
370
371 for (i = max_dist; i >= 0; i--) {
372 this = first_in_list(&super->s_low_list[i]);
373 if (!this)
374 continue;
375 if (!cand)
376 cand = this;
377 if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid)
378 cand = this;
379 }
380 return cand;
381}
382
383static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
384{
385 struct logfs_super *super = logfs_super(sb);
386 gc_level_t gc_level;
387 u32 cleaned, valid, segno, ec;
388 u8 dist;
389
390 if (!cand) {
391 log_gc("GC attempted, but no candidate found\n");
392 return 0;
393 }
394
395 segno = cand->segno;
396 dist = cand->dist;
397 valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
398 free_candidate(sb, cand);
399 log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n",
400 segno, (u64)segno << super->s_segshift,
401 dist, no_free_segments(sb), valid,
402 super->s_free_bytes);
403 cleaned = logfs_gc_segment(sb, segno, dist);
404 log_gc("GC segment #%02x complete - now %x valid\n", segno,
405 valid - cleaned);
406 BUG_ON(cleaned != valid);
407 return 1;
408}
409
/*
 * Pick the best GC candidate, unlink it from its list and collect it.
 * Returns 1 if a segment was collected, 0 if there was no candidate.
 */
static int logfs_gc_once(struct super_block *sb)
{
	struct gc_candidate *victim = get_candidate(sb);

	if (victim)
		remove_from_list(victim);
	return __logfs_gc_once(sb, victim);
}
419
420/* returns 1 if a wrap occurs, 0 otherwise */
421static int logfs_scan_some(struct super_block *sb)
422{
423 struct logfs_super *super = logfs_super(sb);
424 u32 segno;
425 int i, ret = 0;
426
427 segno = super->s_sweeper;
428 for (i = SCAN_RATIO; i > 0; i--) {
429 segno++;
430 if (segno >= super->s_no_segs) {
431 segno = 0;
432 ret = 1;
433 /* Break out of the loop. We want to read a single
434 * block from the segment size on next invocation if
435 * SCAN_RATIO is set to match block size
436 */
437 break;
438 }
439
440 scan_segment(sb, segno);
441 }
442 super->s_sweeper = segno;
443 return ret;
444}
445
446/*
447 * In principle, this function should loop forever, looking for GC candidates
448 * and moving data. LogFS is designed in such a way that this loop is
449 * guaranteed to terminate.
450 *
451 * Limiting the loop to some iterations serves purely to catch cases when
452 * these guarantees have failed. An actual endless loop is an obvious bug
453 * and should be reported as such.
454 */
455static void __logfs_gc_pass(struct super_block *sb, int target)
456{
457 struct logfs_super *super = logfs_super(sb);
458 struct logfs_block *block;
459 int round, progress, last_progress = 0;
460
461 if (no_free_segments(sb) >= target &&
462 super->s_no_object_aliases < MAX_OBJ_ALIASES)
463 return;
464
465 log_gc("__logfs_gc_pass(%x)\n", target);
466 for (round = 0; round < SCAN_ROUNDS; ) {
467 if (no_free_segments(sb) >= target)
468 goto write_alias;
469
470 /* Sync in-memory state with on-medium state in case they
471 * diverged */
472 logfs_write_anchor(sb);
473 round += logfs_scan_some(sb);
474 if (no_free_segments(sb) >= target)
475 goto write_alias;
476 progress = logfs_gc_once(sb);
477 if (progress)
478 last_progress = round;
479 else if (round - last_progress > 2)
480 break;
481 continue;
482
483 /*
484 * The goto logic is nasty, I just don't know a better way to
485 * code it. GC is supposed to ensure two things:
486 * 1. Enough free segments are available.
487 * 2. The number of aliases is bounded.
488 * When 1. is achieved, we take a look at 2. and write back
489 * some alias-containing blocks, if necessary. However, after
490 * each such write we need to go back to 1., as writes can
491 * consume free segments.
492 */
493write_alias:
494 if (super->s_no_object_aliases < MAX_OBJ_ALIASES)
495 return;
496 if (list_empty(&super->s_object_alias)) {
497 /* All aliases are still in btree */
498 return;
499 }
500 log_gc("Write back one alias\n");
501 block = list_entry(super->s_object_alias.next,
502 struct logfs_block, alias_list);
503 block->ops->write_block(block);
504 /*
505 * To round off the nasty goto logic, we reset round here. It
506 * is a safety-net for GC not making any progress and limited
507 * to something reasonably small. If incremented it for every
508 * single alias, the loop could terminate rather quickly.
509 */
510 round = 0;
511 }
512 LOGFS_BUG(sb);
513}
514
515static int wl_ratelimit(struct super_block *sb, u64 *next_event)
516{
517 struct logfs_super *super = logfs_super(sb);
518
519 if (*next_event < super->s_gec) {
520 *next_event = super->s_gec + WL_RATELIMIT;
521 return 0;
522 }
523 return 1;
524}
525
526static void logfs_wl_pass(struct super_block *sb)
527{
528 struct logfs_super *super = logfs_super(sb);
529 struct gc_candidate *wl_cand, *free_cand;
530
531 if (wl_ratelimit(sb, &super->s_wl_gec_ostore))
532 return;
533
534 wl_cand = first_in_list(&super->s_ec_list);
535 if (!wl_cand)
536 return;
537 free_cand = first_in_list(&super->s_free_list);
538 if (!free_cand)
539 return;
540
541 if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) {
542 remove_from_list(wl_cand);
543 __logfs_gc_once(sb, wl_cand);
544 }
545}
546
547/*
548 * The journal needs wear leveling as well. But moving the journal is an
549 * expensive operation so we try to avoid it as much as possible. And if we
550 * have to do it, we move the whole journal, not individual segments.
551 *
552 * Ratelimiting is not strictly necessary here, it mainly serves to avoid the
553 * calculations. First we check whether moving the journal would be a
554 * significant improvement. That means that a) the current journal segments
555 * have more wear than the future journal segments and b) the current journal
556 * segments have more wear than normal ostore segments.
557 * Rationale for b) is that we don't have to move the journal if it is aging
558 * less than the ostore, even if the reserve segments age even less (they are
559 * excluded from wear leveling, after all).
560 * Next we check that the superblocks have less wear than the journal. Since
561 * moving the journal requires writing the superblocks, we have to protect the
562 * superblocks even more than the journal.
563 *
564 * Also we double the acceptable wear difference, compared to ostore wear
565 * leveling. Journal data is read and rewritten rapidly, comparatively. So
566 * soft errors have much less time to accumulate and we allow the journal to
567 * be a bit worse than the ostore.
568 */
569static void logfs_journal_wl_pass(struct super_block *sb)
570{
571 struct logfs_super *super = logfs_super(sb);
572 struct gc_candidate *cand;
573 u32 min_journal_ec = -1, max_reserve_ec = 0;
574 int i;
575
576 if (wl_ratelimit(sb, &super->s_wl_gec_journal))
577 return;
578
579 if (super->s_reserve_list.count < super->s_no_journal_segs) {
580 /* Reserve is not full enough to move complete journal */
581 return;
582 }
583
584 journal_for_each(i)
585 if (super->s_journal_seg[i])
586 min_journal_ec = min(min_journal_ec,
587 super->s_journal_ec[i]);
588 cand = rb_entry(rb_first(&super->s_free_list.rb_tree),
589 struct gc_candidate, rb_node);
590 max_reserve_ec = cand->erase_count;
591 for (i = 0; i < 2; i++) {
592 struct logfs_segment_entry se;
593 u32 segno = seg_no(sb, super->s_sb_ofs[i]);
594 u32 ec;
595
596 logfs_get_segment_entry(sb, segno, &se);
597 ec = be32_to_cpu(se.ec_level) >> 4;
598 max_reserve_ec = max(max_reserve_ec, ec);
599 }
600
601 if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) {
602 do_logfs_journal_wl_pass(sb);
603 }
604}
605
606void logfs_gc_pass(struct super_block *sb)
607{
608 struct logfs_super *super = logfs_super(sb);
609
610 //BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex));
611 /* Write journal before free space is getting saturated with dirty
612 * objects.
613 */
614 if (super->s_dirty_used_bytes + super->s_dirty_free_bytes
615 + LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes)
616 logfs_write_anchor(sb);
617 __logfs_gc_pass(sb, super->s_total_levels);
618 logfs_wl_pass(sb);
619 logfs_journal_wl_pass(sb);
620}
621
622static int check_area(struct super_block *sb, int i)
623{
624 struct logfs_super *super = logfs_super(sb);
625 struct logfs_area *area = super->s_area[i];
626 struct logfs_object_header oh;
627 u32 segno = area->a_segno;
628 u32 ofs = area->a_used_bytes;
629 __be32 crc;
630 int err;
631
632 if (!area->a_is_open)
633 return 0;
634
635 for (ofs = area->a_used_bytes;
636 ofs <= super->s_segsize - sizeof(oh);
637 ofs += (u32)be16_to_cpu(oh.len) + sizeof(oh)) {
638 err = wbuf_read(sb, dev_ofs(sb, segno, ofs), sizeof(oh), &oh);
639 if (err)
640 return err;
641
642 if (!memchr_inv(&oh, 0xff, sizeof(oh)))
643 break;
644
645 crc = logfs_crc32(&oh, sizeof(oh) - 4, 4);
646 if (crc != oh.crc) {
647 printk(KERN_INFO "interrupted header at %llx\n",
648 dev_ofs(sb, segno, ofs));
649 return 0;
650 }
651 }
652 if (ofs != area->a_used_bytes) {
653 printk(KERN_INFO "%x bytes unaccounted data found at %llx\n",
654 ofs - area->a_used_bytes,
655 dev_ofs(sb, segno, area->a_used_bytes));
656 area->a_used_bytes = ofs;
657 }
658 return 0;
659}
660
661int logfs_check_areas(struct super_block *sb)
662{
663 int i, err;
664
665 for_each_area(i) {
666 err = check_area(sb, i);
667 if (err)
668 return err;
669 }
670 return 0;
671}
672
673static void logfs_init_candlist(struct candidate_list *list, int maxcount,
674 int sort_by_ec)
675{
676 list->count = 0;
677 list->maxcount = maxcount;
678 list->sort_by_ec = sort_by_ec;
679 list->rb_tree = RB_ROOT;
680}
681
682int logfs_init_gc(struct super_block *sb)
683{
684 struct logfs_super *super = logfs_super(sb);
685 int i;
686
687 btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool);
688 logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1);
689 logfs_init_candlist(&super->s_reserve_list,
690 super->s_bad_seg_reserve, 1);
691 for_each_area(i)
692 logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0);
693 logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1);
694 return 0;
695}
696
697static void logfs_cleanup_list(struct super_block *sb,
698 struct candidate_list *list)
699{
700 struct gc_candidate *cand;
701
702 while (list->count) {
703 cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate,
704 rb_node);
705 remove_from_list(cand);
706 free_candidate(sb, cand);
707 }
708 BUG_ON(list->rb_tree.rb_node);
709}
710
711void logfs_cleanup_gc(struct super_block *sb)
712{
713 struct logfs_super *super = logfs_super(sb);
714 int i;
715
716 if (!super->s_free_list.count)
717 return;
718
719 /*
720 * FIXME: The btree may still contain a single empty node. So we
721 * call the grim visitor to clean up that mess. Btree code should
722 * do it for us, really.
723 */
724 btree_grim_visitor32(&super->s_cand_tree, 0, NULL);
725 logfs_cleanup_list(sb, &super->s_free_list);
726 logfs_cleanup_list(sb, &super->s_reserve_list);
727 for_each_area(i)
728 logfs_cleanup_list(sb, &super->s_low_list[i]);
729 logfs_cleanup_list(sb, &super->s_ec_list);
730}
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
new file mode 100644
index 000000000000..33ec1aeaeec4
--- /dev/null
+++ b/fs/logfs/inode.c
@@ -0,0 +1,417 @@
1/*
2 * fs/logfs/inode.c - inode handling code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/writeback.h>
10#include <linux/backing-dev.h>
11
12/*
13 * How soon to reuse old inode numbers? LogFS doesn't store deleted inodes
14 * on the medium. It therefore also lacks a method to store the previous
15 * generation number for deleted inodes. Instead a single generation number
16 * is stored which will be used for new inodes. Being just a 32bit counter,
17 * this can obvious wrap relatively quickly. So we only reuse inodes if we
18 * know that a fair number of inodes can be created before we have to increment
19 * the generation again - effectively adding some bits to the counter.
20 * But being too aggressive here means we keep a very large and very sparse
21 * inode file, wasting space on indirect blocks.
22 * So what is a good value? Beats me. 64k seems moderately bad on both
23 * fronts, so let's use that for now...
24 *
25 * NFS sucks, as everyone already knows.
26 */
#define INOS_PER_WRAP	0x10000	/* inodes handed out per generation bump */
28
29/*
30 * Logfs' requirement to read inodes for garbage collection makes life a bit
31 * harder. GC may have to read inodes that are in I_FREEING state, when they
32 * are being written out - and waiting for GC to make progress, naturally.
33 *
34 * So we cannot just call iget() or some variant of it, but first have to check
35 * wether the inode in question might be in I_FREEING state. Therefore we
36 * maintain our own per-sb list of "almost deleted" inodes and check against
37 * that list first. Normally this should be at most 1-2 entries long.
38 *
39 * Also, inodes have logfs-specific reference counting on top of what the vfs
40 * does. When .destroy_inode is called, normally the reference count will drop
41 * to zero and the inode gets deleted. But if GC accessed the inode, its
42 * refcount will remain nonzero and final deletion will have to wait.
43 *
44 * As a result we have two sets of functions to get/put inodes:
45 * logfs_safe_iget/logfs_safe_iput - safe to call from GC context
46 * logfs_iget/iput - normal version
47 */
/* Slab cache all struct logfs_inode objects are allocated from. */
static struct kmem_cache *logfs_inode_cache;

/* Guards the per-sb s_freeing_list and the li_refcount of inodes on it. */
static DEFINE_SPINLOCK(logfs_inode_lock);
51
52static void logfs_inode_setops(struct inode *inode)
53{
54 switch (inode->i_mode & S_IFMT) {
55 case S_IFDIR:
56 inode->i_op = &logfs_dir_iops;
57 inode->i_fop = &logfs_dir_fops;
58 inode->i_mapping->a_ops = &logfs_reg_aops;
59 break;
60 case S_IFREG:
61 inode->i_op = &logfs_reg_iops;
62 inode->i_fop = &logfs_reg_fops;
63 inode->i_mapping->a_ops = &logfs_reg_aops;
64 break;
65 case S_IFLNK:
66 inode->i_op = &logfs_symlink_iops;
67 inode->i_mapping->a_ops = &logfs_reg_aops;
68 break;
69 case S_IFSOCK: /* fall through */
70 case S_IFBLK: /* fall through */
71 case S_IFCHR: /* fall through */
72 case S_IFIFO:
73 init_special_inode(inode, inode->i_mode, inode->i_rdev);
74 break;
75 default:
76 BUG();
77 }
78}
79
80static struct inode *__logfs_iget(struct super_block *sb, ino_t ino)
81{
82 struct inode *inode = iget_locked(sb, ino);
83 int err;
84
85 if (!inode)
86 return ERR_PTR(-ENOMEM);
87 if (!(inode->i_state & I_NEW))
88 return inode;
89
90 err = logfs_read_inode(inode);
91 if (err || inode->i_nlink == 0) {
92 /* inode->i_nlink == 0 can be true when called from
93 * block validator */
94 /* set i_nlink to 0 to prevent caching */
95 inode->i_nlink = 0;
96 logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE;
97 iget_failed(inode);
98 if (!err)
99 err = -ENOENT;
100 return ERR_PTR(err);
101 }
102
103 logfs_inode_setops(inode);
104 unlock_new_inode(inode);
105 return inode;
106}
107
108struct inode *logfs_iget(struct super_block *sb, ino_t ino)
109{
110 BUG_ON(ino == LOGFS_INO_MASTER);
111 BUG_ON(ino == LOGFS_INO_SEGFILE);
112 return __logfs_iget(sb, ino);
113}
114
115/*
116 * is_cached is set to 1 if we hand out a cached inode, 0 otherwise.
117 * this allows logfs_iput to do the right thing later
118 */
119struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached)
120{
121 struct logfs_super *super = logfs_super(sb);
122 struct logfs_inode *li;
123
124 if (ino == LOGFS_INO_MASTER)
125 return super->s_master_inode;
126 if (ino == LOGFS_INO_SEGFILE)
127 return super->s_segfile_inode;
128
129 spin_lock(&logfs_inode_lock);
130 list_for_each_entry(li, &super->s_freeing_list, li_freeing_list)
131 if (li->vfs_inode.i_ino == ino) {
132 li->li_refcount++;
133 spin_unlock(&logfs_inode_lock);
134 *is_cached = 1;
135 return &li->vfs_inode;
136 }
137 spin_unlock(&logfs_inode_lock);
138
139 *is_cached = 0;
140 return __logfs_iget(sb, ino);
141}
142
143static void __logfs_destroy_inode(struct inode *inode)
144{
145 struct logfs_inode *li = logfs_inode(inode);
146
147 BUG_ON(li->li_block);
148 list_del(&li->li_freeing_list);
149 kmem_cache_free(logfs_inode_cache, li);
150}
151
152static void logfs_destroy_inode(struct inode *inode)
153{
154 struct logfs_inode *li = logfs_inode(inode);
155
156 BUG_ON(list_empty(&li->li_freeing_list));
157 spin_lock(&logfs_inode_lock);
158 li->li_refcount--;
159 if (li->li_refcount == 0)
160 __logfs_destroy_inode(inode);
161 spin_unlock(&logfs_inode_lock);
162}
163
164void logfs_safe_iput(struct inode *inode, int is_cached)
165{
166 if (inode->i_ino == LOGFS_INO_MASTER)
167 return;
168 if (inode->i_ino == LOGFS_INO_SEGFILE)
169 return;
170
171 if (is_cached) {
172 logfs_destroy_inode(inode);
173 return;
174 }
175
176 iput(inode);
177}
178
179static void logfs_init_inode(struct super_block *sb, struct inode *inode)
180{
181 struct logfs_inode *li = logfs_inode(inode);
182 int i;
183
184 li->li_flags = 0;
185 li->li_height = 0;
186 li->li_used_bytes = 0;
187 li->li_block = NULL;
188 inode->i_uid = 0;
189 inode->i_gid = 0;
190 inode->i_size = 0;
191 inode->i_blocks = 0;
192 inode->i_ctime = CURRENT_TIME;
193 inode->i_mtime = CURRENT_TIME;
194 inode->i_nlink = 1;
195 INIT_LIST_HEAD(&li->li_freeing_list);
196
197 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
198 li->li_data[i] = 0;
199
200 return;
201}
202
203static struct inode *logfs_alloc_inode(struct super_block *sb)
204{
205 struct logfs_inode *li;
206
207 li = kmem_cache_alloc(logfs_inode_cache, GFP_NOFS);
208 if (!li)
209 return NULL;
210 logfs_init_inode(sb, &li->vfs_inode);
211 return &li->vfs_inode;
212}
213
214/*
215 * In logfs inodes are written to an inode file. The inode file, like any
216 * other file, is managed with a inode. The inode file's inode, aka master
217 * inode, requires special handling in several respects. First, it cannot be
218 * written to the inode file, so it is stored in the journal instead.
219 *
220 * Secondly, this inode cannot be written back and destroyed before all other
221 * inodes have been written. The ordering is important. Linux' VFS is happily
222 * unaware of the ordering constraint and would ordinarily destroy the master
223 * inode at umount time while other inodes are still in use and dirty. Not
224 * good.
225 *
226 * So logfs makes sure the master inode is not written until all other inodes
227 * have been destroyed. Sadly, this method has another side-effect. The VFS
228 * will notice one remaining inode and print a frightening warning message.
229 * Worse, it is impossible to judge whether such a warning was caused by the
230 * master inode or any other inodes have leaked as well.
231 *
232 * Our attempt of solving this is with logfs_new_meta_inode() below. Its
233 * purpose is to create a new inode that will not trigger the warning if such
234 * an inode is still in use. An ugly hack, no doubt. Suggestions for
235 * improvement are welcome.
236 */
237struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino)
238{
239 struct inode *inode;
240
241 inode = logfs_alloc_inode(sb);
242 if (!inode)
243 return ERR_PTR(-ENOMEM);
244
245 inode->i_mode = S_IFREG;
246 inode->i_ino = ino;
247 inode->i_sb = sb;
248
249 /* This is a blatant copy of alloc_inode code. We'd need alloc_inode
250 * to be nonstatic, alas. */
251 {
252 struct address_space * const mapping = &inode->i_data;
253
254 mapping->a_ops = &logfs_reg_aops;
255 mapping->host = inode;
256 mapping->flags = 0;
257 mapping_set_gfp_mask(mapping, GFP_NOFS);
258 mapping->assoc_mapping = NULL;
259 mapping->backing_dev_info = &default_backing_dev_info;
260 inode->i_mapping = mapping;
261 inode->i_nlink = 1;
262 }
263
264 return inode;
265}
266
267struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino)
268{
269 struct inode *inode;
270 int err;
271
272 inode = logfs_new_meta_inode(sb, ino);
273 if (IS_ERR(inode))
274 return inode;
275
276 err = logfs_read_inode(inode);
277 if (err) {
278 destroy_meta_inode(inode);
279 return ERR_PTR(err);
280 }
281 logfs_inode_setops(inode);
282 return inode;
283}
284
285static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
286{
287 int ret;
288 long flags = WF_LOCK;
289
290 /* Can only happen if creat() failed. Safe to skip. */
291 if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN)
292 return 0;
293
294 ret = __logfs_write_inode(inode, flags);
295 LOGFS_BUG_ON(ret, inode->i_sb);
296 return ret;
297}
298
299void destroy_meta_inode(struct inode *inode)
300{
301 if (inode) {
302 if (inode->i_data.nrpages)
303 truncate_inode_pages(&inode->i_data, 0);
304 logfs_clear_inode(inode);
305 kmem_cache_free(logfs_inode_cache, logfs_inode(inode));
306 }
307}
308
309/* called with inode_lock held */
310static void logfs_drop_inode(struct inode *inode)
311{
312 struct logfs_super *super = logfs_super(inode->i_sb);
313 struct logfs_inode *li = logfs_inode(inode);
314
315 spin_lock(&logfs_inode_lock);
316 list_move(&li->li_freeing_list, &super->s_freeing_list);
317 spin_unlock(&logfs_inode_lock);
318 generic_drop_inode(inode);
319}
320
321static void logfs_set_ino_generation(struct super_block *sb,
322 struct inode *inode)
323{
324 struct logfs_super *super = logfs_super(sb);
325 u64 ino;
326
327 mutex_lock(&super->s_journal_mutex);
328 ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino);
329 super->s_last_ino = ino;
330 super->s_inos_till_wrap--;
331 if (super->s_inos_till_wrap < 0) {
332 super->s_last_ino = LOGFS_RESERVED_INOS;
333 super->s_generation++;
334 super->s_inos_till_wrap = INOS_PER_WRAP;
335 }
336 inode->i_ino = ino;
337 inode->i_generation = super->s_generation;
338 mutex_unlock(&super->s_journal_mutex);
339}
340
341struct inode *logfs_new_inode(struct inode *dir, int mode)
342{
343 struct super_block *sb = dir->i_sb;
344 struct inode *inode;
345
346 inode = new_inode(sb);
347 if (!inode)
348 return ERR_PTR(-ENOMEM);
349
350 logfs_init_inode(sb, inode);
351
352 /* inherit parent flags */
353 logfs_inode(inode)->li_flags |=
354 logfs_inode(dir)->li_flags & LOGFS_FL_INHERITED;
355
356 inode->i_mode = mode;
357 logfs_set_ino_generation(sb, inode);
358
359 inode->i_uid = current_fsuid();
360 inode->i_gid = current_fsgid();
361 if (dir->i_mode & S_ISGID) {
362 inode->i_gid = dir->i_gid;
363 if (S_ISDIR(mode))
364 inode->i_mode |= S_ISGID;
365 }
366
367 logfs_inode_setops(inode);
368 insert_inode_hash(inode);
369
370 return inode;
371}
372
373static void logfs_init_once(void *_li)
374{
375 struct logfs_inode *li = _li;
376 int i;
377
378 li->li_flags = 0;
379 li->li_used_bytes = 0;
380 li->li_refcount = 1;
381 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
382 li->li_data[i] = 0;
383 inode_init_once(&li->vfs_inode);
384}
385
386static int logfs_sync_fs(struct super_block *sb, int wait)
387{
388 /* FIXME: write anchor */
389 logfs_super(sb)->s_devops->sync(sb);
390 return 0;
391}
392
393const struct super_operations logfs_super_operations = {
394 .alloc_inode = logfs_alloc_inode,
395 .clear_inode = logfs_clear_inode,
396 .delete_inode = logfs_delete_inode,
397 .destroy_inode = logfs_destroy_inode,
398 .drop_inode = logfs_drop_inode,
399 .write_inode = logfs_write_inode,
400 .statfs = logfs_statfs,
401 .sync_fs = logfs_sync_fs,
402};
403
404int logfs_init_inode_cache(void)
405{
406 logfs_inode_cache = kmem_cache_create("logfs_inode_cache",
407 sizeof(struct logfs_inode), 0, SLAB_RECLAIM_ACCOUNT,
408 logfs_init_once);
409 if (!logfs_inode_cache)
410 return -ENOMEM;
411 return 0;
412}
413
414void logfs_destroy_inode_cache(void)
415{
416 kmem_cache_destroy(logfs_inode_cache);
417}
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
new file mode 100644
index 000000000000..6ad30a4c9052
--- /dev/null
+++ b/fs/logfs/journal.c
@@ -0,0 +1,883 @@
1/*
2 * fs/logfs/journal.c - journal handling code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9
10static void logfs_calc_free(struct super_block *sb)
11{
12 struct logfs_super *super = logfs_super(sb);
13 u64 reserve, no_segs = super->s_no_segs;
14 s64 free;
15 int i;
16
17 /* superblock segments */
18 no_segs -= 2;
19 super->s_no_journal_segs = 0;
20 /* journal */
21 journal_for_each(i)
22 if (super->s_journal_seg[i]) {
23 no_segs--;
24 super->s_no_journal_segs++;
25 }
26
27 /* open segments plus one extra per level for GC */
28 no_segs -= 2 * super->s_total_levels;
29
30 free = no_segs * (super->s_segsize - LOGFS_SEGMENT_RESERVE);
31 free -= super->s_used_bytes;
32 /* just a bit extra */
33 free -= super->s_total_levels * 4096;
34
35 /* Bad blocks are 'paid' for with speed reserve - the filesystem
36 * simply gets slower as bad blocks accumulate. Until the bad blocks
37 * exceed the speed reserve - then the filesystem gets smaller.
38 */
39 reserve = super->s_bad_segments + super->s_bad_seg_reserve;
40 reserve *= super->s_segsize - LOGFS_SEGMENT_RESERVE;
41 reserve = max(reserve, super->s_speed_reserve);
42 free -= reserve;
43 if (free < 0)
44 free = 0;
45
46 super->s_free_bytes = free;
47}
48
49static void reserve_sb_and_journal(struct super_block *sb)
50{
51 struct logfs_super *super = logfs_super(sb);
52 struct btree_head32 *head = &super->s_reserved_segments;
53 int i, err;
54
55 err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[0]), (void *)1,
56 GFP_KERNEL);
57 BUG_ON(err);
58
59 err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[1]), (void *)1,
60 GFP_KERNEL);
61 BUG_ON(err);
62
63 journal_for_each(i) {
64 if (!super->s_journal_seg[i])
65 continue;
66 err = btree_insert32(head, super->s_journal_seg[i], (void *)1,
67 GFP_KERNEL);
68 BUG_ON(err);
69 }
70}
71
72static void read_dynsb(struct super_block *sb,
73 struct logfs_je_dynsb *dynsb)
74{
75 struct logfs_super *super = logfs_super(sb);
76
77 super->s_gec = be64_to_cpu(dynsb->ds_gec);
78 super->s_sweeper = be64_to_cpu(dynsb->ds_sweeper);
79 super->s_victim_ino = be64_to_cpu(dynsb->ds_victim_ino);
80 super->s_rename_dir = be64_to_cpu(dynsb->ds_rename_dir);
81 super->s_rename_pos = be64_to_cpu(dynsb->ds_rename_pos);
82 super->s_used_bytes = be64_to_cpu(dynsb->ds_used_bytes);
83 super->s_generation = be32_to_cpu(dynsb->ds_generation);
84}
85
86static void read_anchor(struct super_block *sb,
87 struct logfs_je_anchor *da)
88{
89 struct logfs_super *super = logfs_super(sb);
90 struct inode *inode = super->s_master_inode;
91 struct logfs_inode *li = logfs_inode(inode);
92 int i;
93
94 super->s_last_ino = be64_to_cpu(da->da_last_ino);
95 li->li_flags = 0;
96 li->li_height = da->da_height;
97 i_size_write(inode, be64_to_cpu(da->da_size));
98 li->li_used_bytes = be64_to_cpu(da->da_used_bytes);
99
100 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
101 li->li_data[i] = be64_to_cpu(da->da_data[i]);
102}
103
104static void read_erasecount(struct super_block *sb,
105 struct logfs_je_journal_ec *ec)
106{
107 struct logfs_super *super = logfs_super(sb);
108 int i;
109
110 journal_for_each(i)
111 super->s_journal_ec[i] = be32_to_cpu(ec->ec[i]);
112}
113
114static int read_area(struct super_block *sb, struct logfs_je_area *a)
115{
116 struct logfs_super *super = logfs_super(sb);
117 struct logfs_area *area = super->s_area[a->gc_level];
118 u64 ofs;
119 u32 writemask = ~(super->s_writesize - 1);
120
121 if (a->gc_level >= LOGFS_NO_AREAS)
122 return -EIO;
123 if (a->vim != VIM_DEFAULT)
124 return -EIO; /* TODO: close area and continue */
125
126 area->a_used_bytes = be32_to_cpu(a->used_bytes);
127 area->a_written_bytes = area->a_used_bytes & writemask;
128 area->a_segno = be32_to_cpu(a->segno);
129 if (area->a_segno)
130 area->a_is_open = 1;
131
132 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
133 if (super->s_writesize > 1)
134 logfs_buf_recover(area, ofs, a + 1, super->s_writesize);
135 else
136 logfs_buf_recover(area, ofs, NULL, 0);
137 return 0;
138}
139
140static void *unpack(void *from, void *to)
141{
142 struct logfs_journal_header *jh = from;
143 void *data = from + sizeof(struct logfs_journal_header);
144 int err;
145 size_t inlen, outlen;
146
147 inlen = be16_to_cpu(jh->h_len);
148 outlen = be16_to_cpu(jh->h_datalen);
149
150 if (jh->h_compr == COMPR_NONE)
151 memcpy(to, data, inlen);
152 else {
153 err = logfs_uncompress(data, to, inlen, outlen);
154 BUG_ON(err);
155 }
156 return to;
157}
158
159static int __read_je_header(struct super_block *sb, u64 ofs,
160 struct logfs_journal_header *jh)
161{
162 struct logfs_super *super = logfs_super(sb);
163 size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize)
164 + MAX_JOURNAL_HEADER;
165 u16 type, len, datalen;
166 int err;
167
168 /* read header only */
169 err = wbuf_read(sb, ofs, sizeof(*jh), jh);
170 if (err)
171 return err;
172 type = be16_to_cpu(jh->h_type);
173 len = be16_to_cpu(jh->h_len);
174 datalen = be16_to_cpu(jh->h_datalen);
175 if (len > sb->s_blocksize)
176 return -EIO;
177 if ((type < JE_FIRST) || (type > JE_LAST))
178 return -EIO;
179 if (datalen > bufsize)
180 return -EIO;
181 return 0;
182}
183
184static int __read_je_payload(struct super_block *sb, u64 ofs,
185 struct logfs_journal_header *jh)
186{
187 u16 len;
188 int err;
189
190 len = be16_to_cpu(jh->h_len);
191 err = wbuf_read(sb, ofs + sizeof(*jh), len, jh + 1);
192 if (err)
193 return err;
194 if (jh->h_crc != logfs_crc32(jh, len + sizeof(*jh), 4)) {
195 /* Old code was confused. It forgot about the header length
196 * and stopped calculating the crc 16 bytes before the end
197 * of data - ick!
198 * FIXME: Remove this hack once the old code is fixed.
199 */
200 if (jh->h_crc == logfs_crc32(jh, len, 4))
201 WARN_ON_ONCE(1);
202 else
203 return -EIO;
204 }
205 return 0;
206}
207
208/*
209 * jh needs to be large enough to hold the complete entry, not just the header
210 */
211static int __read_je(struct super_block *sb, u64 ofs,
212 struct logfs_journal_header *jh)
213{
214 int err;
215
216 err = __read_je_header(sb, ofs, jh);
217 if (err)
218 return err;
219 return __read_je_payload(sb, ofs, jh);
220}
221
222static int read_je(struct super_block *sb, u64 ofs)
223{
224 struct logfs_super *super = logfs_super(sb);
225 struct logfs_journal_header *jh = super->s_compressed_je;
226 void *scratch = super->s_je;
227 u16 type, datalen;
228 int err;
229
230 err = __read_je(sb, ofs, jh);
231 if (err)
232 return err;
233 type = be16_to_cpu(jh->h_type);
234 datalen = be16_to_cpu(jh->h_datalen);
235
236 switch (type) {
237 case JE_DYNSB:
238 read_dynsb(sb, unpack(jh, scratch));
239 break;
240 case JE_ANCHOR:
241 read_anchor(sb, unpack(jh, scratch));
242 break;
243 case JE_ERASECOUNT:
244 read_erasecount(sb, unpack(jh, scratch));
245 break;
246 case JE_AREA:
247 read_area(sb, unpack(jh, scratch));
248 break;
249 case JE_OBJ_ALIAS:
250 err = logfs_load_object_aliases(sb, unpack(jh, scratch),
251 datalen);
252 break;
253 default:
254 WARN_ON_ONCE(1);
255 return -EIO;
256 }
257 return err;
258}
259
260static int logfs_read_segment(struct super_block *sb, u32 segno)
261{
262 struct logfs_super *super = logfs_super(sb);
263 struct logfs_journal_header *jh = super->s_compressed_je;
264 u64 ofs, seg_ofs = dev_ofs(sb, segno, 0);
265 u32 h_ofs, last_ofs = 0;
266 u16 len, datalen, last_len = 0;
267 int i, err;
268
269 /* search for most recent commit */
270 for (h_ofs = 0; h_ofs < super->s_segsize; h_ofs += sizeof(*jh)) {
271 ofs = seg_ofs + h_ofs;
272 err = __read_je_header(sb, ofs, jh);
273 if (err)
274 continue;
275 if (jh->h_type != cpu_to_be16(JE_COMMIT))
276 continue;
277 err = __read_je_payload(sb, ofs, jh);
278 if (err)
279 continue;
280 len = be16_to_cpu(jh->h_len);
281 datalen = be16_to_cpu(jh->h_datalen);
282 if ((datalen > sizeof(super->s_je_array)) ||
283 (datalen % sizeof(__be64)))
284 continue;
285 last_ofs = h_ofs;
286 last_len = datalen;
287 h_ofs += ALIGN(len, sizeof(*jh)) - sizeof(*jh);
288 }
289 /* read commit */
290 if (last_ofs == 0)
291 return -ENOENT;
292 ofs = seg_ofs + last_ofs;
293 log_journal("Read commit from %llx\n", ofs);
294 err = __read_je(sb, ofs, jh);
295 BUG_ON(err); /* We should have caught it in the scan loop already */
296 if (err)
297 return err;
298 /* uncompress */
299 unpack(jh, super->s_je_array);
300 super->s_no_je = last_len / sizeof(__be64);
301 /* iterate over array */
302 for (i = 0; i < super->s_no_je; i++) {
303 err = read_je(sb, be64_to_cpu(super->s_je_array[i]));
304 if (err)
305 return err;
306 }
307 super->s_journal_area->a_segno = segno;
308 return 0;
309}
310
311static u64 read_gec(struct super_block *sb, u32 segno)
312{
313 struct logfs_segment_header sh;
314 __be32 crc;
315 int err;
316
317 if (!segno)
318 return 0;
319 err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
320 if (err)
321 return 0;
322 crc = logfs_crc32(&sh, sizeof(sh), 4);
323 if (crc != sh.crc) {
324 WARN_ON(sh.gec != cpu_to_be64(0xffffffffffffffffull));
325 /* Most likely it was just erased */
326 return 0;
327 }
328 return be64_to_cpu(sh.gec);
329}
330
331static int logfs_read_journal(struct super_block *sb)
332{
333 struct logfs_super *super = logfs_super(sb);
334 u64 gec[LOGFS_JOURNAL_SEGS], max;
335 u32 segno;
336 int i, max_i;
337
338 max = 0;
339 max_i = -1;
340 journal_for_each(i) {
341 segno = super->s_journal_seg[i];
342 gec[i] = read_gec(sb, super->s_journal_seg[i]);
343 if (gec[i] > max) {
344 max = gec[i];
345 max_i = i;
346 }
347 }
348 if (max_i == -1)
349 return -EIO;
350 /* FIXME: Try older segments in case of error */
351 return logfs_read_segment(sb, super->s_journal_seg[max_i]);
352}
353
354/*
355 * First search the current segment (outer loop), then pick the next segment
356 * in the array, skipping any zero entries (inner loop).
357 */
358static void journal_get_free_segment(struct logfs_area *area)
359{
360 struct logfs_super *super = logfs_super(area->a_sb);
361 int i;
362
363 journal_for_each(i) {
364 if (area->a_segno != super->s_journal_seg[i])
365 continue;
366
367 do {
368 i++;
369 if (i == LOGFS_JOURNAL_SEGS)
370 i = 0;
371 } while (!super->s_journal_seg[i]);
372
373 area->a_segno = super->s_journal_seg[i];
374 area->a_erase_count = ++(super->s_journal_ec[i]);
375 log_journal("Journal now at %x (ec %x)\n", area->a_segno,
376 area->a_erase_count);
377 return;
378 }
379 BUG();
380}
381
/*
 * get_erase_count hook of journal_area_ops.  Intentionally empty; see the
 * comment in the body.
 */
382static void journal_get_erase_count(struct logfs_area *area)
383{
384 /* erase count is stored globally and incremented in
385 * journal_get_free_segment() - nothing to do here */
386}
387
388static int journal_erase_segment(struct logfs_area *area)
389{
390 struct super_block *sb = area->a_sb;
391 struct logfs_segment_header sh;
392 u64 ofs;
393 int err;
394
395 err = logfs_erase_segment(sb, area->a_segno, 1);
396 if (err)
397 return err;
398
399 sh.pad = 0;
400 sh.type = SEG_JOURNAL;
401 sh.level = 0;
402 sh.segno = cpu_to_be32(area->a_segno);
403 sh.ec = cpu_to_be32(area->a_erase_count);
404 sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
405 sh.crc = logfs_crc32(&sh, sizeof(sh), 4);
406
407 /* This causes a bug in segment.c. Not yet. */
408 //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0);
409
410 ofs = dev_ofs(sb, area->a_segno, 0);
411 area->a_used_bytes = ALIGN(sizeof(sh), 16);
412 logfs_buf_write(area, ofs, &sh, sizeof(sh));
413 return 0;
414}
415
416static size_t __logfs_write_header(struct logfs_super *super,
417 struct logfs_journal_header *jh, size_t len, size_t datalen,
418 u16 type, u8 compr)
419{
420 jh->h_len = cpu_to_be16(len);
421 jh->h_type = cpu_to_be16(type);
422 jh->h_datalen = cpu_to_be16(datalen);
423 jh->h_compr = compr;
424 jh->h_pad[0] = 'H';
425 jh->h_pad[1] = 'E';
426 jh->h_pad[2] = 'A';
427 jh->h_pad[3] = 'D';
428 jh->h_pad[4] = 'R';
429 jh->h_crc = logfs_crc32(jh, len + sizeof(*jh), 4);
430 return ALIGN(len, 16) + sizeof(*jh);
431}
432
433static size_t logfs_write_header(struct logfs_super *super,
434 struct logfs_journal_header *jh, size_t datalen, u16 type)
435{
436 size_t len = datalen;
437
438 return __logfs_write_header(super, jh, len, datalen, type, COMPR_NONE);
439}
440
441static inline size_t logfs_journal_erasecount_size(struct logfs_super *super)
442{
443 return LOGFS_JOURNAL_SEGS * sizeof(__be32);
444}
445
446static void *logfs_write_erasecount(struct super_block *sb, void *_ec,
447 u16 *type, size_t *len)
448{
449 struct logfs_super *super = logfs_super(sb);
450 struct logfs_je_journal_ec *ec = _ec;
451 int i;
452
453 journal_for_each(i)
454 ec->ec[i] = cpu_to_be32(super->s_journal_ec[i]);
455 *type = JE_ERASECOUNT;
456 *len = logfs_journal_erasecount_size(super);
457 return ec;
458}
459
460static void account_shadow(void *_shadow, unsigned long _sb, u64 ignore,
461 size_t ignore2)
462{
463 struct logfs_shadow *shadow = _shadow;
464 struct super_block *sb = (void *)_sb;
465 struct logfs_super *super = logfs_super(sb);
466
467 /* consume new space */
468 super->s_free_bytes -= shadow->new_len;
469 super->s_used_bytes += shadow->new_len;
470 super->s_dirty_used_bytes -= shadow->new_len;
471
472 /* free up old space */
473 super->s_free_bytes += shadow->old_len;
474 super->s_used_bytes -= shadow->old_len;
475 super->s_dirty_free_bytes -= shadow->old_len;
476
477 logfs_set_segment_used(sb, shadow->old_ofs, -shadow->old_len);
478 logfs_set_segment_used(sb, shadow->new_ofs, shadow->new_len);
479
480 log_journal("account_shadow(%llx, %llx, %x) %llx->%llx %x->%x\n",
481 shadow->ino, shadow->bix, shadow->gc_level,
482 shadow->old_ofs, shadow->new_ofs,
483 shadow->old_len, shadow->new_len);
484 mempool_free(shadow, super->s_shadow_pool);
485}
486
487static void account_shadows(struct super_block *sb)
488{
489 struct logfs_super *super = logfs_super(sb);
490 struct inode *inode = super->s_master_inode;
491 struct logfs_inode *li = logfs_inode(inode);
492 struct shadow_tree *tree = &super->s_shadow_tree;
493
494 btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow);
495 btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow);
496
497 if (li->li_block) {
498 /*
499 * We never actually use the structure, when attached to the
500 * master inode. But it is easier to always free it here than
501 * to have checks in several places elsewhere when allocating
502 * it.
503 */
504 li->li_block->ops->free_block(sb, li->li_block);
505 }
506 BUG_ON((s64)li->li_used_bytes < 0);
507}
508
509static void *__logfs_write_anchor(struct super_block *sb, void *_da,
510 u16 *type, size_t *len)
511{
512 struct logfs_super *super = logfs_super(sb);
513 struct logfs_je_anchor *da = _da;
514 struct inode *inode = super->s_master_inode;
515 struct logfs_inode *li = logfs_inode(inode);
516 int i;
517
518 da->da_height = li->li_height;
519 da->da_last_ino = cpu_to_be64(super->s_last_ino);
520 da->da_size = cpu_to_be64(i_size_read(inode));
521 da->da_used_bytes = cpu_to_be64(li->li_used_bytes);
522 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
523 da->da_data[i] = cpu_to_be64(li->li_data[i]);
524 *type = JE_ANCHOR;
525 *len = sizeof(*da);
526 return da;
527}
528
529static void *logfs_write_dynsb(struct super_block *sb, void *_dynsb,
530 u16 *type, size_t *len)
531{
532 struct logfs_super *super = logfs_super(sb);
533 struct logfs_je_dynsb *dynsb = _dynsb;
534
535 dynsb->ds_gec = cpu_to_be64(super->s_gec);
536 dynsb->ds_sweeper = cpu_to_be64(super->s_sweeper);
537 dynsb->ds_victim_ino = cpu_to_be64(super->s_victim_ino);
538 dynsb->ds_rename_dir = cpu_to_be64(super->s_rename_dir);
539 dynsb->ds_rename_pos = cpu_to_be64(super->s_rename_pos);
540 dynsb->ds_used_bytes = cpu_to_be64(super->s_used_bytes);
541 dynsb->ds_generation = cpu_to_be32(super->s_generation);
542 *type = JE_DYNSB;
543 *len = sizeof(*dynsb);
544 return dynsb;
545}
546
547static void write_wbuf(struct super_block *sb, struct logfs_area *area,
548 void *wbuf)
549{
550 struct logfs_super *super = logfs_super(sb);
551 struct address_space *mapping = super->s_mapping_inode->i_mapping;
552 u64 ofs;
553 pgoff_t index;
554 int page_ofs;
555 struct page *page;
556
557 ofs = dev_ofs(sb, area->a_segno,
558 area->a_used_bytes & ~(super->s_writesize - 1));
559 index = ofs >> PAGE_SHIFT;
560 page_ofs = ofs & (PAGE_SIZE - 1);
561
562 page = find_lock_page(mapping, index);
563 BUG_ON(!page);
564 memcpy(wbuf, page_address(page) + page_ofs, super->s_writesize);
565 unlock_page(page);
566}
567
568static void *logfs_write_area(struct super_block *sb, void *_a,
569 u16 *type, size_t *len)
570{
571 struct logfs_super *super = logfs_super(sb);
572 struct logfs_area *area = super->s_area[super->s_sum_index];
573 struct logfs_je_area *a = _a;
574
575 a->vim = VIM_DEFAULT;
576 a->gc_level = super->s_sum_index;
577 a->used_bytes = cpu_to_be32(area->a_used_bytes);
578 a->segno = cpu_to_be32(area->a_segno);
579 if (super->s_writesize > 1)
580 write_wbuf(sb, area, a + 1);
581
582 *type = JE_AREA;
583 *len = sizeof(*a) + super->s_writesize;
584 return a;
585}
586
587static void *logfs_write_commit(struct super_block *sb, void *h,
588 u16 *type, size_t *len)
589{
590 struct logfs_super *super = logfs_super(sb);
591
592 *type = JE_COMMIT;
593 *len = super->s_no_je * sizeof(__be64);
594 return super->s_je_array;
595}
596
597static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type,
598 size_t len)
599{
600 struct logfs_super *super = logfs_super(sb);
601 void *header = super->s_compressed_je;
602 void *data = header + sizeof(struct logfs_journal_header);
603 ssize_t compr_len, pad_len;
604 u8 compr = COMPR_ZLIB;
605
606 if (len == 0)
607 return logfs_write_header(super, header, 0, type);
608
609 compr_len = logfs_compress(buf, data, len, sb->s_blocksize);
610 if (compr_len < 0 || type == JE_ANCHOR) {
611 BUG_ON(len > sb->s_blocksize);
612 memcpy(data, buf, len);
613 compr_len = len;
614 compr = COMPR_NONE;
615 }
616
617 pad_len = ALIGN(compr_len, 16);
618 memset(data + compr_len, 0, pad_len - compr_len);
619
620 return __logfs_write_header(super, header, compr_len, len, type, compr);
621}
622
623static s64 logfs_get_free_bytes(struct logfs_area *area, size_t *bytes,
624 int must_pad)
625{
626 u32 writesize = logfs_super(area->a_sb)->s_writesize;
627 s32 ofs;
628 int ret;
629
630 ret = logfs_open_area(area, *bytes);
631 if (ret)
632 return -EAGAIN;
633
634 ofs = area->a_used_bytes;
635 area->a_used_bytes += *bytes;
636
637 if (must_pad) {
638 area->a_used_bytes = ALIGN(area->a_used_bytes, writesize);
639 *bytes = area->a_used_bytes - ofs;
640 }
641
642 return dev_ofs(area->a_sb, area->a_segno, ofs);
643}
644
645static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type,
646 size_t buf_len)
647{
648 struct logfs_super *super = logfs_super(sb);
649 struct logfs_area *area = super->s_journal_area;
650 struct logfs_journal_header *jh = super->s_compressed_je;
651 size_t len;
652 int must_pad = 0;
653 s64 ofs;
654
655 len = __logfs_write_je(sb, buf, type, buf_len);
656 if (jh->h_type == cpu_to_be16(JE_COMMIT))
657 must_pad = 1;
658
659 ofs = logfs_get_free_bytes(area, &len, must_pad);
660 if (ofs < 0)
661 return ofs;
662 logfs_buf_write(area, ofs, super->s_compressed_je, len);
663 super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs);
664 return 0;
665}
666
667static int logfs_write_je(struct super_block *sb,
668 void* (*write)(struct super_block *sb, void *scratch,
669 u16 *type, size_t *len))
670{
671 void *buf;
672 size_t len;
673 u16 type;
674
675 buf = write(sb, logfs_super(sb)->s_je, &type, &len);
676 return logfs_write_je_buf(sb, buf, type, len);
677}
678
679int write_alias_journal(struct super_block *sb, u64 ino, u64 bix,
680 level_t level, int child_no, __be64 val)
681{
682 struct logfs_super *super = logfs_super(sb);
683 struct logfs_obj_alias *oa = super->s_je;
684 int err = 0, fill = super->s_je_fill;
685
686 log_aliases("logfs_write_obj_aliases #%x(%llx, %llx, %x, %x) %llx\n",
687 fill, ino, bix, level, child_no, be64_to_cpu(val));
688 oa[fill].ino = cpu_to_be64(ino);
689 oa[fill].bix = cpu_to_be64(bix);
690 oa[fill].val = val;
691 oa[fill].level = (__force u8)level;
692 oa[fill].child_no = cpu_to_be16(child_no);
693 fill++;
694 if (fill >= sb->s_blocksize / sizeof(*oa)) {
695 err = logfs_write_je_buf(sb, oa, JE_OBJ_ALIAS, sb->s_blocksize);
696 fill = 0;
697 }
698
699 super->s_je_fill = fill;
700 return err;
701}
702
703static int logfs_write_obj_aliases(struct super_block *sb)
704{
705 struct logfs_super *super = logfs_super(sb);
706 int err;
707
708 log_journal("logfs_write_obj_aliases: %d aliases to write\n",
709 super->s_no_object_aliases);
710 super->s_je_fill = 0;
711 err = logfs_write_obj_aliases_pagecache(sb);
712 if (err)
713 return err;
714
715 if (super->s_je_fill)
716 err = logfs_write_je_buf(sb, super->s_je, JE_OBJ_ALIAS,
717 super->s_je_fill
718 * sizeof(struct logfs_obj_alias));
719 return err;
720}
721
722/*
723 * Write all journal entries. The goto logic ensures that all journal entries
724 * are written whenever a new segment is used. It is ugly and potentially a
725 * bit wasteful, but robustness is more important. With this we can *always*
726 * erase all journal segments except the one containing the most recent commit.
727 */
728void logfs_write_anchor(struct super_block *sb)
729{
730 struct logfs_super *super = logfs_super(sb);
731 struct logfs_area *area = super->s_journal_area;
732 int i, err;
733
734 if (!(super->s_flags & LOGFS_SB_FLAG_DIRTY))
735 return;
736 super->s_flags &= ~LOGFS_SB_FLAG_DIRTY;
737
738 BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
739 mutex_lock(&super->s_journal_mutex);
740
741 /* Do this first or suffer corruption */
742 logfs_sync_segments(sb);
743 account_shadows(sb);
744
745again:
746 super->s_no_je = 0;
747 for_each_area(i) {
748 if (!super->s_area[i]->a_is_open)
749 continue;
750 super->s_sum_index = i;
751 err = logfs_write_je(sb, logfs_write_area);
752 if (err)
753 goto again;
754 }
755 err = logfs_write_obj_aliases(sb);
756 if (err)
757 goto again;
758 err = logfs_write_je(sb, logfs_write_erasecount);
759 if (err)
760 goto again;
761 err = logfs_write_je(sb, __logfs_write_anchor);
762 if (err)
763 goto again;
764 err = logfs_write_je(sb, logfs_write_dynsb);
765 if (err)
766 goto again;
767 /*
768 * Order is imperative. First we sync all writes, including the
769 * non-committed journal writes. Then we write the final commit and
770 * sync the current journal segment.
771 * There is a theoretical bug here. Syncing the journal segment will
772 * write a number of journal entries and the final commit. All these
773 * are written in a single operation. If the device layer writes the
774 * data back-to-front, the commit will precede the other journal
775 * entries, leaving a race window.
776 * Two fixes are possible. Preferred is to fix the device layer to
777 * ensure writes happen front-to-back. Alternatively we can insert
778 * another logfs_sync_area() super->s_devops->sync() combo before
779 * writing the commit.
780 */
781 /*
782 * On another subject, super->s_devops->sync is usually not necessary.
783 * Unless called from sys_sync or friends, a barrier would suffice.
784 */
785 super->s_devops->sync(sb);
786 err = logfs_write_je(sb, logfs_write_commit);
787 if (err)
788 goto again;
789 log_journal("Write commit to %llx\n",
790 be64_to_cpu(super->s_je_array[super->s_no_je - 1]));
791 logfs_sync_area(area);
792 BUG_ON(area->a_used_bytes != area->a_written_bytes);
793 super->s_devops->sync(sb);
794
795 mutex_unlock(&super->s_journal_mutex);
796 return;
797}
798
799void do_logfs_journal_wl_pass(struct super_block *sb)
800{
801 struct logfs_super *super = logfs_super(sb);
802 struct logfs_area *area = super->s_journal_area;
803 u32 segno, ec;
804 int i, err;
805
806 log_journal("Journal requires wear-leveling.\n");
807 /* Drop old segments */
808 journal_for_each(i)
809 if (super->s_journal_seg[i]) {
810 logfs_set_segment_unreserved(sb,
811 super->s_journal_seg[i],
812 super->s_journal_ec[i]);
813 super->s_journal_seg[i] = 0;
814 super->s_journal_ec[i] = 0;
815 }
816 /* Get new segments */
817 for (i = 0; i < super->s_no_journal_segs; i++) {
818 segno = get_best_cand(sb, &super->s_reserve_list, &ec);
819 super->s_journal_seg[i] = segno;
820 super->s_journal_ec[i] = ec;
821 logfs_set_segment_reserved(sb, segno);
822 }
823 /* Manually move journal_area */
824 area->a_segno = super->s_journal_seg[0];
825 area->a_is_open = 0;
826 area->a_used_bytes = 0;
827 /* Write journal */
828 logfs_write_anchor(sb);
829 /* Write superblocks */
830 err = logfs_write_sb(sb);
831 BUG_ON(err);
832}
833
834static const struct logfs_area_ops journal_area_ops = {
835 .get_free_segment = journal_get_free_segment,
836 .get_erase_count = journal_get_erase_count,
837 .erase_segment = journal_erase_segment,
838};
839
840int logfs_init_journal(struct super_block *sb)
841{
842 struct logfs_super *super = logfs_super(sb);
843 size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize)
844 + MAX_JOURNAL_HEADER;
845 int ret = -ENOMEM;
846
847 mutex_init(&super->s_journal_mutex);
848 btree_init_mempool32(&super->s_reserved_segments, super->s_btree_pool);
849
850 super->s_je = kzalloc(bufsize, GFP_KERNEL);
851 if (!super->s_je)
852 return ret;
853
854 super->s_compressed_je = kzalloc(bufsize, GFP_KERNEL);
855 if (!super->s_compressed_je)
856 return ret;
857
858 super->s_master_inode = logfs_new_meta_inode(sb, LOGFS_INO_MASTER);
859 if (IS_ERR(super->s_master_inode))
860 return PTR_ERR(super->s_master_inode);
861
862 ret = logfs_read_journal(sb);
863 if (ret)
864 return -EIO;
865
866 reserve_sb_and_journal(sb);
867 logfs_calc_free(sb);
868
869 super->s_journal_area->a_ops = &journal_area_ops;
870 return 0;
871}
872
873void logfs_cleanup_journal(struct super_block *sb)
874{
875 struct logfs_super *super = logfs_super(sb);
876
877 btree_grim_visitor32(&super->s_reserved_segments, 0, NULL);
878 destroy_meta_inode(super->s_master_inode);
879 super->s_master_inode = NULL;
880
881 kfree(super->s_compressed_je);
882 kfree(super->s_je);
883}
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
new file mode 100644
index 000000000000..129779431373
--- /dev/null
+++ b/fs/logfs/logfs.h
@@ -0,0 +1,724 @@
1/*
2 * fs/logfs/logfs.h
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Private header for logfs.
9 */
10#ifndef FS_LOGFS_LOGFS_H
11#define FS_LOGFS_LOGFS_H
12
13#undef __CHECK_ENDIAN__
14#define __CHECK_ENDIAN__
15
16#include <linux/btree.h>
17#include <linux/crc32.h>
18#include <linux/fs.h>
19#include <linux/kernel.h>
20#include <linux/mempool.h>
21#include <linux/pagemap.h>
22#include <linux/mtd/mtd.h>
23#include "logfs_abi.h"
24
/*
 * Bit flags selecting which groups of debug messages are compiled in; each
 * bit is tested against LOGFS_DEBUG by the matching log_*() macro.
 */
25#define LOGFS_DEBUG_SUPER (0x0001)
26#define LOGFS_DEBUG_SEGMENT (0x0002)
27#define LOGFS_DEBUG_JOURNAL (0x0004)
28#define LOGFS_DEBUG_DIR (0x0008)
29#define LOGFS_DEBUG_FILE (0x0010)
30#define LOGFS_DEBUG_INODE (0x0020)
31#define LOGFS_DEBUG_READWRITE (0x0040)
32#define LOGFS_DEBUG_GC (0x0080)
33#define LOGFS_DEBUG_GC_NOISY (0x0100)
34#define LOGFS_DEBUG_ALIASES (0x0200)
35#define LOGFS_DEBUG_BLOCKMOVE (0x0400)
36#define LOGFS_DEBUG_ALL (0xffffffff)
37
/* Current selection: 0x01 equals LOGFS_DEBUG_SUPER, so only log_super() prints. */
38#define LOGFS_DEBUG (0x01)
39/*
40 * To enable specific log messages, simply define LOGFS_DEBUG to match any
41 * or all of the above.
 *
 * NOTE(review): LOGFS_DEBUG is defined unconditionally just above, so the
 * #ifndef fallback below can never take effect as the file stands.
42 */
43#ifndef LOGFS_DEBUG
44#define LOGFS_DEBUG (0)
45#endif
46
/*
 * log_cond() - printk() the message at KERN_DEBUG level when @cond is true.
 *
 * The log_<group>() wrappers below each pass "LOGFS_DEBUG & <group bit>" as
 * the condition, so a group prints only if its bit is set in LOGFS_DEBUG.
 */
47#define log_cond(cond, fmt, arg...) do { \
48 if (cond) \
49 printk(KERN_DEBUG fmt, ##arg); \
50} while (0)
51
52#define log_super(fmt, arg...) \
53 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SUPER, fmt, ##arg)
54#define log_segment(fmt, arg...) \
55 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SEGMENT, fmt, ##arg)
56#define log_journal(fmt, arg...) \
57 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_JOURNAL, fmt, ##arg)
58#define log_dir(fmt, arg...) \
59 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_DIR, fmt, ##arg)
60#define log_file(fmt, arg...) \
61 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_FILE, fmt, ##arg)
62#define log_inode(fmt, arg...) \
63 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_INODE, fmt, ##arg)
64#define log_readwrite(fmt, arg...) \
65 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_READWRITE, fmt, ##arg)
66#define log_gc(fmt, arg...) \
67 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC, fmt, ##arg)
68#define log_gc_noisy(fmt, arg...) \
69 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC_NOISY, fmt, ##arg)
70#define log_aliases(fmt, arg...) \
71 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_ALIASES, fmt, ##arg)
72#define log_blockmove(fmt, arg...) \
73 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_BLOCKMOVE, fmt, ##arg)
74
/*
 * logfs-private page flag, aliased onto PG_owner_priv_1, with test/set/clear
 * helpers operating on page->flags.
 * NOTE(review): the name suggests it marks pages already locked by the
 * caller; the exact meaning is defined by its users, not visible here.
 */
75#define PG_pre_locked PG_owner_priv_1
76#define PagePreLocked(page) test_bit(PG_pre_locked, &(page)->flags)
77#define SetPagePreLocked(page) set_bit(PG_pre_locked, &(page)->flags)
78#define ClearPagePreLocked(page) clear_bit(PG_pre_locked, &(page)->flags)
79
80/* FIXME: This should really be somewhere in the 64bit area. */
81#define LOGFS_LINK_MAX (1<<30)
82
83/*
 * Superblock state flags.  LOGFS_SB_FLAG_RO (read-only filesystem) is OR-ed
 * into logfs_super.s_flags by logfs_set_ro() and LOGFS_BUG(); presumably the
 * other LOGFS_SB_FLAG_* bits live in the same field - confirm.
 */
84#define LOGFS_SB_FLAG_RO 0x0001
85#define LOGFS_SB_FLAG_DIRTY 0x0002
86#define LOGFS_SB_FLAG_OBJ_ALIAS 0x0004
87#define LOGFS_SB_FLAG_SHUTDOWN 0x0008
88
89/* Write Control Flags */
90#define WF_LOCK 0x01 /* take write lock */
91#define WF_WRITE 0x02 /* write block */
92#define WF_DELETE 0x04 /* delete old block */
93
/*
 * Two distinct __bitwise u8 types for block levels.  expand_level() turns a
 * level_t into a gc_level_t (adding LOGFS_MAX_LEVELS for the master inode)
 * and shrink_level() converts back.
 */
94typedef u8 __bitwise level_t;
95typedef u8 __bitwise gc_level_t;
96
/* Force a plain integer into the respective level type. */
97#define LEVEL(level) ((__force level_t)(level))
98#define GC_LEVEL(gc_level) ((__force gc_level_t)(gc_level))
99
/*
 * SUBLEVEL(level) yields level - 1 as a level_t.  The discarded comparison
 * against LEVEL(1) has no runtime effect; presumably it is there so that a
 * type checker complains if @level is not a level_t - confirm.
 */
100#define SUBLEVEL(level) ( (void)((level) == LEVEL(1)), \
101 (__force level_t)((__force u8)(level) - 1) )
102
103/**
104 * struct logfs_area - area management information
105 *
106 * @a_sb: the superblock this area belongs to
107 * @a_is_open: 1 if the area is currently open, else 0
108 * @a_segno: segment number of area
109 * @a_written_bytes: number of bytes already written back
110 * @a_used_bytes: number of used bytes
111 * @a_ops: area operations (either journal or ostore)
112 * @a_erase_count: erase count
113 * @a_level: GC level (a gc_level_t, i.e. the value used to index
 *	logfs_super.s_area[] in get_area())
114 */
115struct logfs_area { /* a segment open for writing */
116 struct super_block *a_sb;
117 int a_is_open;
118 u32 a_segno;
119 u32 a_written_bytes;
120 u32 a_used_bytes;
121 const struct logfs_area_ops *a_ops;
122 u32 a_erase_count;
123 gc_level_t a_level;
124};
125
126/**
127 * struct logfs_area_ops - area operations
128 *
129 * @get_free_segment: fill area->ofs with the offset of a free segment
130 * @get_erase_count: fill area->erase_count (needs area->ofs)
131 * @erase_segment: erase and setup segment
 *
 * NOTE(review): struct logfs_area has no "ofs" or "erase_count" members;
 * presumably a_segno and a_erase_count are meant - confirm against the
 * journal and ostore implementations of these callbacks.
132 */
133struct logfs_area_ops {
134 void (*get_free_segment)(struct logfs_area *area);
135 void (*get_erase_count)(struct logfs_area *area);
136 int (*erase_segment)(struct logfs_area *area);
137};
138
139/**
140 * struct logfs_device_ops - device access operations
141 *
 * @find_first_sb: returns a page and reports an offset through @ofs;
 *	presumably locates the first superblock copy - confirm
 * @find_last_sb: as above, presumably for the last superblock copy - confirm
 * @write_sb: presumably writes the given superblock page to the device
 *	- confirm
142 * @readpage: read one page (mm page)
143 * @writeseg: write one segment. may be a partial segment
144 * @erase: erase one segment
145 * @read: read from the device
146 * @erase: erase part of the device
 * @sync: undocumented in the original; presumably waits for outstanding
 *	device I/O - confirm
 * @put_device: undocumented in the original; presumably releases the
 *	underlying device - confirm
 *
 * NOTE(review): there is no "read" member in the structure below, and
 * @erase is described twice above; both look like leftovers.
147 */
148struct logfs_device_ops {
149 struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs);
150 struct page *(*find_last_sb)(struct super_block *sb, u64 *ofs);
151 int (*write_sb)(struct super_block *sb, struct page *page);
152 int (*readpage)(void *_sb, struct page *page);
153 void (*writeseg)(struct super_block *sb, u64 ofs, size_t len);
154 int (*erase)(struct super_block *sb, loff_t ofs, size_t len,
155 int ensure_write);
156 void (*sync)(struct super_block *sb);
157 void (*put_device)(struct super_block *sb);
158};
159
160/**
161 * struct candidate_list - list of similar candidates
 *
 * @rb_tree: root of the rb-tree holding the candidates (struct gc_candidate
 *	embeds the matching rb_node)
 * @count: presumably the number of candidates currently on the list - confirm
 * @maxcount: presumably the upper bound for @count - confirm
 * @sort_by_ec: presumably non-zero if the tree is ordered by erase count
 *	- confirm
162 */
163struct candidate_list {
164 struct rb_root rb_tree;
165 int count;
166 int maxcount;
167 int sort_by_ec;
168};
169
170/**
171 * struct gc_candidate - "candidate" segment to be garbage collected next
172 *
 * @rb_node: node linking the candidate into its list's rb-tree
173 * @list: list (either free or low)
174 * @segno: segment number
175 * @valid: number of valid bytes
176 * @erase_count: erase count of segment
177 * @dist: distance from tree root
178 *
179 * Candidates can be on two lists. The free list contains electees rather
180 * than candidates - segments that no longer contain any valid data. The
181 * low list contains candidates to be picked for GC. It should be kept
182 * short. It is not required to always pick a perfect candidate. In the
183 * worst case GC will have to move more data than absolutely necessary.
184 */
185struct gc_candidate {
186 struct rb_node rb_node;
187 struct candidate_list *list;
188 u32 segno;
189 u32 valid;
190 u32 erase_count;
191 u8 dist;
192};
193
194/**
195 * struct logfs_journal_entry - temporary structure used during journal scan
196 *
197 * @used: (left undocumented in the original; presumably non-zero once the
 *	slot holds a valid entry - confirm)
198 * @version: normalized version
199 * @len: length
 * @datalen: (undocumented in the original; presumably the payload length as
 *	opposed to @len - confirm)
200 * @offset: offset
201 */
202struct logfs_journal_entry {
203 int used;
204 s16 version;
205 u16 len;
206 u16 datalen;
207 u64 offset;
208};
209
/*
 * Steps of the multi-stage directory operations; the current step is kept in
 * logfs_transaction.state.  Values start at 1, so 0 is not a valid state.
 */
210enum transaction_state {
211 CREATE_1 = 1,
212 CREATE_2,
213 UNLINK_1,
214 UNLINK_2,
215 CROSS_RENAME_1,
216 CROSS_RENAME_2,
217 TARGET_RENAME_1,
218 TARGET_RENAME_2,
219 TARGET_RENAME_3
220};
221
222/**
223 * struct logfs_transaction - essential fields to support atomic dirops
224 *
 * @state: current step of the operation (enum transaction_state)
225 * @ino: target inode
226 * @dir: inode of directory containing dentry
227 * @pos: pos of dentry in directory
228 */
229struct logfs_transaction {
230 enum transaction_state state;
231 u64 ino;
232 u64 dir;
233 u64 pos;
234};
235
236/**
237 * struct logfs_shadow - old block in the shadow of a not-yet-committed new one
238 * @old_ofs: offset of old block on medium
239 * @new_ofs: offset of new block on medium
240 * @ino: inode number
241 * @bix: block index
242 * @old_len: size of old block, including header
243 * @new_len: size of new block, including header
244 * @gc_level: block level (stored as a gc_level_t)
245 */
246struct logfs_shadow {
247 u64 old_ofs;
248 u64 new_ofs;
249 u64 ino;
250 u64 bix;
251 int old_len;
252 int new_len;
253 gc_level_t gc_level;
254};
255
256/**
257 * struct shadow_tree - pair of 64bit-keyed btrees holding struct logfs_shadow
258 * @new: shadows where old_ofs==0, indexed by new_ofs
259 * @old: shadows where old_ofs!=0, indexed by old_ofs
260 */
261struct shadow_tree {
262 struct btree_head64 new;
263 struct btree_head64 old;
264};
265
/*
 * One list entry carrying a big-endian 64bit value and a child number.
 * NOTE(review): presumably these hang off logfs_block.item_list and record
 * one remapped child pointer of an indirect block - confirm against the
 * users of this structure.
 */
266struct object_alias_item {
267 struct list_head list;
268 __be64 val;
269 int child_no;
270};
271
272/**
273 * struct logfs_block - contains any block state
274 * @type: indirect block or inode
275 * @full: number of fully populated children
276 * @partial: number of partially populated children
277 *
278 * Most blocks are directly represented by page cache pages. But when a block
279 * becomes dirty, is part of a transaction, contains aliases or is otherwise
280 * special, a struct logfs_block is allocated to track the additional state.
281 * Inodes are very similar to indirect blocks, so they can also get one of
282 * these structures added when appropriate.
 *
 * NOTE(review): the structure below has no "type" member even though @type
 * is documented above and BLOCK_INDIRECT/BLOCK_INODE are defined; presumably
 * the @ops table now distinguishes the two kinds - confirm.
 * The remaining members (alias_list, item_list, sb, ino, bix, level, page,
 * inode, ta, alias_map, ops, reserved_bytes) are not documented in the
 * original.
283 */
284#define BLOCK_INDIRECT 1 /* Indirect block */
285#define BLOCK_INODE 2 /* Inode */
286struct logfs_block_ops;
287struct logfs_block {
288 struct list_head alias_list;
289 struct list_head item_list;
290 struct super_block *sb;
291 u64 ino;
292 u64 bix;
293 level_t level;
294 struct page *page;
295 struct inode *inode;
296 struct logfs_transaction *ta;
297 unsigned long alias_map[LOGFS_BLOCK_FACTOR / BITS_PER_LONG];
298 struct logfs_block_ops *ops;
299 int full;
300 int partial;
301 int reserved_bytes;
302};
303
/*
 * Callback type for writing out a single alias, identified by inode number,
 * block index, level and child number, together with its big-endian value.
 * write_alias_journal() in journal.c has this signature.
 */
304typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix,
305 level_t level, int child_no, __be64 val);
/*
 * Per-kind operations of a struct logfs_block (reached through block->ops).
 * @write_alias is handed a write_alias_t callback; presumably it invokes the
 * callback once per alias recorded for the block - confirm.
 */
306struct logfs_block_ops {
307 void (*write_block)(struct logfs_block *block);
308 gc_level_t (*block_level)(struct logfs_block *block);
309 void (*free_block)(struct super_block *sb, struct logfs_block*block);
310 int (*write_alias)(struct super_block *sb,
311 struct logfs_block *block,
312 write_alias_t *write_one_alias);
313};
314
/*
 * struct logfs_super - in-memory, per-filesystem logfs state.
 *
 * Stored in super_block.s_fs_info and retrieved with logfs_super().  The
 * members are grouped by the source file that mainly uses them.
 */
315struct logfs_super {
316 struct mtd_info *s_mtd; /* underlying device */
317 struct block_device *s_bdev; /* underlying device */
318 const struct logfs_device_ops *s_devops;/* device access */
319 struct inode *s_master_inode; /* inode file */
320 struct inode *s_segfile_inode; /* segment file */
321 struct inode *s_mapping_inode; /* device mapping */
322 atomic_t s_pending_writes; /* outstanding bios */
323 long s_flags;
324 mempool_t *s_btree_pool; /* for btree nodes */
325 mempool_t *s_alias_pool; /* aliases in segment.c */
326 u64 s_feature_incompat;
327 u64 s_feature_ro_compat;
328 u64 s_feature_compat;
329 u64 s_feature_flags;
330 u64 s_sb_ofs[2];
331 struct page *s_erase_page; /* for dev_bdev.c */
332 /* alias.c fields */
333 struct btree_head32 s_segment_alias; /* remapped segments */
334 int s_no_object_aliases;
335 struct list_head s_object_alias; /* remapped objects */
336 struct btree_head128 s_object_alias_tree; /* remapped objects */
337 struct mutex s_object_alias_mutex;
338 /* dir.c fields */
339 struct mutex s_dirop_mutex; /* for creat/unlink/rename */
340 u64 s_victim_ino; /* used for atomic dir-ops */
341 u64 s_rename_dir; /* source directory ino */
342 u64 s_rename_pos; /* position of source dd */
343 /* gc.c fields */
344 long s_segsize; /* size of a segment */
345 int s_segshift; /* log2 of segment size */
346 long s_segmask; /* (1 << s_segshift) - 1 */
347 long s_no_segs; /* segments on device */
348 long s_no_journal_segs; /* segments used for journal */
349 long s_no_blocks; /* blocks per segment */
350 long s_writesize; /* minimum write size */
351 int s_writeshift; /* log2 of write size */
352 u64 s_size; /* filesystem size */
353 struct logfs_area *s_area[LOGFS_NO_AREAS]; /* open segment array */
354 u64 s_gec; /* global erase count */
355 u64 s_wl_gec_ostore; /* time of last wl event */
356 u64 s_wl_gec_journal; /* time of last wl event */
357 u64 s_sweeper; /* current sweeper pos */
358 u8 s_ifile_levels; /* max level of ifile */
359 u8 s_iblock_levels; /* max level of regular files */
360 u8 s_data_levels; /* # of segments to leaf block*/
361 u8 s_total_levels; /* sum of above three */
362 struct btree_head32 s_cand_tree; /* all candidates */
363 struct candidate_list s_free_list; /* 100% free segments */
364 struct candidate_list s_reserve_list; /* Bad segment reserve */
365 struct candidate_list s_low_list[LOGFS_NO_AREAS];/* good candidates */
366 struct candidate_list s_ec_list; /* wear level candidates */
367 struct btree_head32 s_reserved_segments;/* sb, journal, bad, etc. */
368 /* inode.c fields */
369 u64 s_last_ino; /* highest ino used */
370 long s_inos_till_wrap;
371 u32 s_generation; /* i_generation for new files */
372 struct list_head s_freeing_list; /* inodes being freed */
373 /* journal.c fields */
374 struct mutex s_journal_mutex;
375 void *s_je; /* journal entry to compress */
376 void *s_compressed_je; /* block to write to journal */
377 u32 s_journal_seg[LOGFS_JOURNAL_SEGS]; /* journal segments */
378 u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */
379 u64 s_last_version;
380 struct logfs_area *s_journal_area; /* open journal segment */
381 __be64 s_je_array[64];
382 int s_no_je;
383
384 int s_sum_index; /* for the 12 summaries */
385 struct shadow_tree s_shadow_tree;
386 int s_je_fill; /* index of current je */
387 /* readwrite.c fields */
388 struct mutex s_write_mutex;
389 int s_lock_count;
390 mempool_t *s_block_pool; /* struct logfs_block pool */
391 mempool_t *s_shadow_pool; /* struct logfs_shadow pool */
392 /*
393 * Space accounting:
394 * - s_used_bytes specifies space used to store valid data objects.
395 * - s_dirty_used_bytes is space used to store non-committed data
396 * objects. Those objects have already been written themselves,
397 * but they don't become valid until all indirect blocks up to the
398 * journal have been written as well.
399 * - s_dirty_free_bytes is space used to store the old copy of a
400 * replaced object, as long as the replacement is non-committed.
401 * In other words, it is the amount of space freed when all dirty
402 * blocks are written back.
403 * - s_free_bytes is the amount of free space available for any
404 * purpose.
405 * - s_root_reserve is the amount of free space available only to
406 * the root user. Non-privileged users can no longer write once
407 * this watermark has been reached.
408 * - s_speed_reserve is space which remains unused to speed up
409 * garbage collection performance.
410 * - s_dirty_pages is the space reserved for currently dirty pages.
411 * It is a pessimistic estimate, so some/most will get freed on
412 * page writeback.
413 *
414 * s_used_bytes + s_free_bytes + s_speed_reserve = total usable size
415 */
416 u64 s_free_bytes;
417 u64 s_used_bytes;
418 u64 s_dirty_free_bytes;
419 u64 s_dirty_used_bytes;
420 u64 s_root_reserve;
421 u64 s_speed_reserve;
422 u64 s_dirty_pages;
423 /* Bad block handling:
424 * - s_bad_seg_reserve is a number of segments usually kept
425 * free. When encountering bad blocks, the affected segment's data
426 * is _temporarily_ moved to a reserved segment.
427 * - s_bad_segments is the number of known bad segments.
428 */
429 u32 s_bad_seg_reserve;
430 u32 s_bad_segments;
431};
432
433/**
434 * struct logfs_inode - in-memory inode
435 *
436 * @vfs_inode: struct inode (embedded; logfs_inode() maps back from it via
 *	container_of())
437 * @li_data: data pointers
438 * @li_used_bytes: number of used bytes
439 * @li_freeing_list: used to track inodes currently being freed
 * @li_block: undocumented in the original; presumably the struct logfs_block
 *	attached to this inode, if any - confirm
440 * @li_flags: inode flags
 * @li_height: undocumented in the original; presumably the height of the
 *	inode's indirect block tree - confirm
441 * @li_refcount: number of internal (GC-induced) references
442 */
443struct logfs_inode {
444 struct inode vfs_inode;
445 u64 li_data[LOGFS_EMBEDDED_FIELDS];
446 u64 li_used_bytes;
447 struct list_head li_freeing_list;
448 struct logfs_block *li_block;
449 u32 li_flags;
450 u8 li_height;
451 int li_refcount;
452};
453
/*
 * Index loops over the journal segments and over the areas.
 * for_each_area_down() terminates on "__i >= 0", so its index variable must
 * be of a signed type or the loop never ends.
 */
454#define journal_for_each(__i) for (__i = 0; __i < LOGFS_JOURNAL_SEGS; __i++)
455#define for_each_area(__i) for (__i = 0; __i < LOGFS_NO_AREAS; __i++)
456#define for_each_area_down(__i) for (__i = LOGFS_NO_AREAS - 1; __i >= 0; __i--)
457
458/* compr.c */
459int logfs_compress(void *in, void *out, size_t inlen, size_t outlen);
460int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen);
461int __init logfs_compr_init(void);
462void logfs_compr_exit(void);
463
464/* dev_bdev.c */
/*
 * With CONFIG_BLOCK the real function is declared here; without it an inline
 * stub takes its place and always fails with -ENODEV.
 */
465#ifdef CONFIG_BLOCK
466int logfs_get_sb_bdev(struct file_system_type *type, int flags,
467 const char *devname, struct vfsmount *mnt);
468#else
469static inline int logfs_get_sb_bdev(struct file_system_type *type, int flags,
470 const char *devname, struct vfsmount *mnt)
471{
472 return -ENODEV;
473}
474#endif
475
476/* dev_mtd.c */
/*
 * With CONFIG_MTD the real function is declared here; without it an inline
 * stub takes its place and always fails with -ENODEV.
 */
477#ifdef CONFIG_MTD
478int logfs_get_sb_mtd(struct file_system_type *type, int flags,
479 int mtdnr, struct vfsmount *mnt);
480#else
481static inline int logfs_get_sb_mtd(struct file_system_type *type, int flags,
482 int mtdnr, struct vfsmount *mnt)
483{
484 return -ENODEV;
485}
486#endif
487
488/* dir.c */
489extern const struct inode_operations logfs_symlink_iops;
490extern const struct inode_operations logfs_dir_iops;
491extern const struct file_operations logfs_dir_fops;
492int logfs_replay_journal(struct super_block *sb);
493
494/* file.c */
495extern const struct inode_operations logfs_reg_iops;
496extern const struct file_operations logfs_reg_fops;
497extern const struct address_space_operations logfs_reg_aops;
498int logfs_readpage(struct file *file, struct page *page);
499int logfs_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
500 unsigned long arg);
501int logfs_fsync(struct file *file, struct dentry *dentry, int datasync);
502
503/* gc.c */
504u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec);
505void logfs_gc_pass(struct super_block *sb);
506int logfs_check_areas(struct super_block *sb);
507int logfs_init_gc(struct super_block *sb);
508void logfs_cleanup_gc(struct super_block *sb);
509
510/* inode.c */
511extern const struct super_operations logfs_super_operations;
512struct inode *logfs_iget(struct super_block *sb, ino_t ino);
513struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *cookie);
514void logfs_safe_iput(struct inode *inode, int cookie);
515struct inode *logfs_new_inode(struct inode *dir, int mode);
516struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino);
517struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino);
518int logfs_init_inode_cache(void);
519void logfs_destroy_inode_cache(void);
520void destroy_meta_inode(struct inode *inode);
521void logfs_set_blocks(struct inode *inode, u64 no);
522/* these logically belong into inode.c but actually reside in readwrite.c */
523int logfs_read_inode(struct inode *inode);
524int __logfs_write_inode(struct inode *inode, long flags);
525void logfs_delete_inode(struct inode *inode);
526void logfs_clear_inode(struct inode *inode);
527
528/* journal.c */
529void logfs_write_anchor(struct super_block *sb);
530int logfs_init_journal(struct super_block *sb);
531void logfs_cleanup_journal(struct super_block *sb);
532int write_alias_journal(struct super_block *sb, u64 ino, u64 bix,
533 level_t level, int child_no, __be64 val);
534void do_logfs_journal_wl_pass(struct super_block *sb);
535
536/* readwrite.c */
537pgoff_t logfs_pack_index(u64 bix, level_t level);
538void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level);
539int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
540 loff_t bix, long flags, struct shadow_tree *shadow_tree);
541int logfs_readpage_nolock(struct page *page);
542int logfs_write_buf(struct inode *inode, struct page *page, long flags);
543int logfs_delete(struct inode *inode, pgoff_t index,
544 struct shadow_tree *shadow_tree);
545int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
546 gc_level_t gc_level, long flags);
547int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix,
548 gc_level_t gc_level);
549int logfs_truncate(struct inode *inode, u64 size);
550u64 logfs_seek_hole(struct inode *inode, u64 bix);
551u64 logfs_seek_data(struct inode *inode, u64 bix);
552int logfs_open_segfile(struct super_block *sb);
553int logfs_init_rw(struct super_block *sb);
554void logfs_cleanup_rw(struct super_block *sb);
555void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta);
556void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta);
557void logfs_write_block(struct logfs_block *block, long flags);
558int logfs_write_obj_aliases_pagecache(struct super_block *sb);
559void logfs_get_segment_entry(struct super_block *sb, u32 segno,
560 struct logfs_segment_entry *se);
561void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment);
562void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec,
563 gc_level_t gc_level);
564void logfs_set_segment_reserved(struct super_block *sb, u32 segno);
565void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec);
566struct logfs_block *__alloc_block(struct super_block *sb,
567 u64 ino, u64 bix, level_t level);
568void __free_block(struct super_block *sb, struct logfs_block *block);
569void btree_write_block(struct logfs_block *block);
570void initialize_block_counters(struct page *page, struct logfs_block *block,
571 __be64 *array, int page_is_empty);
572int logfs_exist_block(struct inode *inode, u64 bix);
573int get_page_reserve(struct inode *inode, struct page *page);
574extern struct logfs_block_ops indirect_block_ops;
575
576/* segment.c */
577int logfs_erase_segment(struct super_block *sb, u32 ofs, int ensure_erase);
578int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf);
579int logfs_segment_read(struct inode *inode, struct page *page, u64 ofs, u64 bix,
580 level_t level);
581int logfs_segment_write(struct inode *inode, struct page *page,
582 struct logfs_shadow *shadow);
583int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow);
584int logfs_load_object_aliases(struct super_block *sb,
585 struct logfs_obj_alias *oa, int count);
586void move_page_to_btree(struct page *page);
587int logfs_init_mapping(struct super_block *sb);
588void logfs_sync_area(struct logfs_area *area);
589void logfs_sync_segments(struct super_block *sb);
590
591/* area handling */
592int logfs_init_areas(struct super_block *sb);
593void logfs_cleanup_areas(struct super_block *sb);
594int logfs_open_area(struct logfs_area *area, size_t bytes);
595void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
596 int use_filler);
597
598static inline void logfs_buf_write(struct logfs_area *area, u64 ofs,
599 void *buf, size_t len)
600{
601 __logfs_buf_write(area, ofs, buf, len, 0);
602}
603
604static inline void logfs_buf_recover(struct logfs_area *area, u64 ofs,
605 void *buf, size_t len)
606{
607 __logfs_buf_write(area, ofs, buf, len, 1);
608}
609
610/* super.c */
611struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index);
612void emergency_read_end(struct page *page);
613void logfs_crash_dump(struct super_block *sb);
614void *memchr_inv(const void *s, int c, size_t n);
615int logfs_statfs(struct dentry *dentry, struct kstatfs *stats);
616int logfs_get_sb_device(struct file_system_type *type, int flags,
617 struct mtd_info *mtd, struct block_device *bdev,
618 const struct logfs_device_ops *devops, struct vfsmount *mnt);
619int logfs_check_ds(struct logfs_disk_super *ds);
620int logfs_write_sb(struct super_block *sb);
621
622static inline struct logfs_super *logfs_super(struct super_block *sb)
623{
624 return sb->s_fs_info;
625}
626
627static inline struct logfs_inode *logfs_inode(struct inode *inode)
628{
629 return container_of(inode, struct logfs_inode, vfs_inode);
630}
631
632static inline void logfs_set_ro(struct super_block *sb)
633{
634 logfs_super(sb)->s_flags |= LOGFS_SB_FLAG_RO;
635}
636
/*
 * LOGFS_BUG(sb) - dump filesystem state via logfs_crash_dump(), set
 * LOGFS_SB_FLAG_RO in the superblock flags, then BUG().  @sb is evaluated
 * once, into a local.
 *
 * LOGFS_BUG_ON(condition, sb) - LOGFS_BUG(sb) if @condition holds; the
 * condition is marked unlikely().
 */
637#define LOGFS_BUG(sb) do { \
638 struct super_block *__sb = sb; \
639 logfs_crash_dump(__sb); \
640 logfs_super(__sb)->s_flags |= LOGFS_SB_FLAG_RO; \
641 BUG(); \
642} while (0)
643
644#define LOGFS_BUG_ON(condition, sb) \
645 do { if (unlikely(condition)) LOGFS_BUG((sb)); } while (0)
646
647static inline __be32 logfs_crc32(void *data, size_t len, size_t skip)
648{
649 return cpu_to_be32(crc32(~0, data+skip, len-skip));
650}
651
652static inline u8 logfs_type(struct inode *inode)
653{
654 return (inode->i_mode >> 12) & 15;
655}
656
657static inline pgoff_t logfs_index(struct super_block *sb, u64 pos)
658{
659 return pos >> sb->s_blocksize_bits;
660}
661
662static inline u64 dev_ofs(struct super_block *sb, u32 segno, u32 ofs)
663{
664 return ((u64)segno << logfs_super(sb)->s_segshift) + ofs;
665}
666
667static inline u32 seg_no(struct super_block *sb, u64 ofs)
668{
669 return ofs >> logfs_super(sb)->s_segshift;
670}
671
672static inline u32 seg_ofs(struct super_block *sb, u64 ofs)
673{
674 return ofs & logfs_super(sb)->s_segmask;
675}
676
677static inline u64 seg_align(struct super_block *sb, u64 ofs)
678{
679 return ofs & ~logfs_super(sb)->s_segmask;
680}
681
682static inline struct logfs_block *logfs_block(struct page *page)
683{
684 return (void *)page->private;
685}
686
687static inline level_t shrink_level(gc_level_t __level)
688{
689 u8 level = (__force u8)__level;
690
691 if (level >= LOGFS_MAX_LEVELS)
692 level -= LOGFS_MAX_LEVELS;
693 return (__force level_t)level;
694}
695
696static inline gc_level_t expand_level(u64 ino, level_t __level)
697{
698 u8 level = (__force u8)__level;
699
700 if (ino == LOGFS_INO_MASTER) {
701 /* ifile has seperate areas */
702 level += LOGFS_MAX_LEVELS;
703 }
704 return (__force gc_level_t)level;
705}
706
707static inline int logfs_block_shift(struct super_block *sb, level_t level)
708{
709 level = shrink_level((__force gc_level_t)level);
710 return (__force int)level * (sb->s_blocksize_bits - 3);
711}
712
713static inline u64 logfs_block_mask(struct super_block *sb, level_t level)
714{
715 return ~0ull << logfs_block_shift(sb, level);
716}
717
718static inline struct logfs_area *get_area(struct super_block *sb,
719 gc_level_t gc_level)
720{
721 return logfs_super(sb)->s_area[(__force u8)gc_level];
722}
723
724#endif
diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h
new file mode 100644
index 000000000000..f674725663fe
--- /dev/null
+++ b/fs/logfs/logfs_abi.h
@@ -0,0 +1,629 @@
1/*
2 * fs/logfs/logfs_abi.h
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Public header for logfs.
9 */
10#ifndef FS_LOGFS_LOGFS_ABI_H
11#define FS_LOGFS_LOGFS_ABI_H
12
13/* For out-of-kernel compiles */
14#ifndef BUILD_BUG_ON
15#define BUILD_BUG_ON(condition) /**/
16#endif
17
/*
 * SIZE_CHECK(type, size) - define a static inline helper check_<type>()
 * whose only statement is a BUILD_BUG_ON() on sizeof(struct type) != size.
 * With the empty out-of-kernel BUILD_BUG_ON fallback defined above, the
 * helper body is empty.
 */
#define SIZE_CHECK(type, size)					\
static inline void check_##type(void)				\
{								\
	BUILD_BUG_ON(sizeof(struct type) != (size));		\
}
23
24/*
25 * Throughout the logfs code, we're constantly dealing with blocks at
 * various positions or offsets.  To remove confusion, we strictly
27 * distinguish between a "position" - the logical position within a
28 * file and an "offset" - the physical location within the device.
29 *
30 * Any usage of the term offset for a logical location or position for
31 * a physical one is a bug and should get fixed.
32 */
33
34/*
 * Blocks are allocated in one of several segments depending on their
36 * level. The following levels are used:
37 * 0 - regular data block
38 * 1 - i1 indirect blocks
39 * 2 - i2 indirect blocks
40 * 3 - i3 indirect blocks
41 * 4 - i4 indirect blocks
42 * 5 - i5 indirect blocks
43 * 6 - ifile data blocks
44 * 7 - ifile i1 indirect blocks
45 * 8 - ifile i2 indirect blocks
46 * 9 - ifile i3 indirect blocks
47 * 10 - ifile i4 indirect blocks
48 * 11 - ifile i5 indirect blocks
49 * Potential levels to be used in the future:
50 * 12 - gc recycled blocks, long-lived data
51 * 13 - replacement blocks, short-lived data
52 *
 * Levels 1-11 are necessary for robust gc operations and help separate
 * short-lived metadata from longer-lived file data.  In the future,
 * file data should get separated into several segments based on simple
56 * heuristics. Old data recycled during gc operation is expected to be
57 * long-lived. New data is of uncertain life expectancy. New data
58 * used to replace older blocks in existing files is expected to be
59 * short-lived.
60 */
61
62
63/* Magic numbers. 64bit for superblock, 32bit for statfs f_type */
64#define LOGFS_MAGIC 0x7a3a8e5cb9d5bf67ull
65#define LOGFS_MAGIC_U32 0xc97e8168u
66
67/*
68 * Various blocksize related macros. Blocksize is currently fixed at 4KiB.
69 * Sooner or later that should become configurable and the macros replaced
70 * by something superblock-dependent. Pointers in indirect blocks are and
71 * will remain 64bit.
72 *
73 * LOGFS_BLOCKSIZE - self-explaining
74 * LOGFS_BLOCK_FACTOR - number of pointers per indirect block
75 * LOGFS_BLOCK_BITS - log2 of LOGFS_BLOCK_FACTOR, used for shifts
76 */
77#define LOGFS_BLOCKSIZE (4096ull)
78#define LOGFS_BLOCK_FACTOR (LOGFS_BLOCKSIZE / sizeof(u64))
79#define LOGFS_BLOCK_BITS (9)
80
81/*
82 * Number of blocks at various levels of indirection. There are 16 direct
83 * block pointers plus a single indirect pointer.
84 */
85#define I0_BLOCKS (16)
86#define I1_BLOCKS LOGFS_BLOCK_FACTOR
87#define I2_BLOCKS (LOGFS_BLOCK_FACTOR * I1_BLOCKS)
88#define I3_BLOCKS (LOGFS_BLOCK_FACTOR * I2_BLOCKS)
89#define I4_BLOCKS (LOGFS_BLOCK_FACTOR * I3_BLOCKS)
90#define I5_BLOCKS (LOGFS_BLOCK_FACTOR * I4_BLOCKS)
91
92#define INDIRECT_INDEX I0_BLOCKS
93#define LOGFS_EMBEDDED_FIELDS (I0_BLOCKS + 1)
94
95/*
 * Sizes at which files require another level of indirection.  Files smaller
 * than LOGFS_EMBEDDED_SIZE can be completely stored in the inode itself,
 * similar to ext2 fast symlinks.
99 *
100 * Data at a position smaller than LOGFS_I0_SIZE is accessed through the
101 * direct pointers, else through the 1x indirect pointer and so forth.
102 */
103#define LOGFS_EMBEDDED_SIZE (LOGFS_EMBEDDED_FIELDS * sizeof(u64))
104#define LOGFS_I0_SIZE (I0_BLOCKS * LOGFS_BLOCKSIZE)
105#define LOGFS_I1_SIZE (I1_BLOCKS * LOGFS_BLOCKSIZE)
106#define LOGFS_I2_SIZE (I2_BLOCKS * LOGFS_BLOCKSIZE)
107#define LOGFS_I3_SIZE (I3_BLOCKS * LOGFS_BLOCKSIZE)
108#define LOGFS_I4_SIZE (I4_BLOCKS * LOGFS_BLOCKSIZE)
109#define LOGFS_I5_SIZE (I5_BLOCKS * LOGFS_BLOCKSIZE)
110
111/*
112 * Each indirect block pointer must have this flag set, if all block pointers
113 * behind it are set, i.e. there is no hole hidden in the shadow of this
114 * indirect block pointer.
115 */
#define LOGFS_FULLY_POPULATED (1ULL << 63)
/*
 * pure_ofs - strip the LOGFS_FULLY_POPULATED flag from a block pointer,
 * leaving only the remaining bits.  The argument is parenthesized so that
 * an expression argument such as "a | b" is masked as a whole instead of
 * having the mask bind to its last operand only.
 */
#define pure_ofs(ofs) ((ofs) & ~LOGFS_FULLY_POPULATED)
118
119/*
 * LogFS needs to separate data into levels.  Each level is defined as the
121 * maximal possible distance from the master inode (inode of the inode file).
122 * Data blocks reside on level 0, 1x indirect block on level 1, etc.
123 * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11.
124 * This effort is necessary to guarantee garbage collection to always make
125 * progress.
126 *
127 * LOGFS_MAX_INDIRECT is the maximal indirection through indirect blocks,
128 * LOGFS_MAX_LEVELS is one more for the actual data level of a file. It is
129 * the maximal number of levels for one file.
130 * LOGFS_NO_AREAS is twice that, as the inode file and regular files are
131 * effectively stacked on top of each other.
132 */
133#define LOGFS_MAX_INDIRECT (5)
134#define LOGFS_MAX_LEVELS (LOGFS_MAX_INDIRECT + 1)
135#define LOGFS_NO_AREAS (2 * LOGFS_MAX_LEVELS)
136
137/* Maximum size of filenames */
138#define LOGFS_MAX_NAMELEN (255)
139
140/* Number of segments in the primary journal. */
141#define LOGFS_JOURNAL_SEGS (16)
142
143/* Maximum number of free/erased/etc. segments in journal entries */
144#define MAX_CACHED_SEGS (64)
145
146
147/*
148 * LOGFS_OBJECT_HEADERSIZE is the size of a single header in the object store,
149 * LOGFS_MAX_OBJECTSIZE the size of the largest possible object, including
150 * its header,
151 * LOGFS_SEGMENT_RESERVE is the amount of space reserved for each segment for
152 * its segment header and the padded space at the end when no further objects
153 * fit.
154 */
155#define LOGFS_OBJECT_HEADERSIZE (0x1c)
156#define LOGFS_SEGMENT_HEADERSIZE (0x18)
157#define LOGFS_MAX_OBJECTSIZE (LOGFS_OBJECT_HEADERSIZE + LOGFS_BLOCKSIZE)
158#define LOGFS_SEGMENT_RESERVE \
159 (LOGFS_SEGMENT_HEADERSIZE + LOGFS_MAX_OBJECTSIZE - 1)
160
/*
 * Segment types, stored in logfs_segment_header.type:
 *	SEG_SUPER	- superblock segment
 *	SEG_JOURNAL	- journal segment
 *	SEG_OSTORE	- object store segment
 *
 * NOTE(review): the descriptions are inferred from the enumerator names;
 * the previous text was a copy of the object type list - confirm.
 */
enum {
	SEG_SUPER = 0x01,
	SEG_JOURNAL = 0x02,
	SEG_OSTORE = 0x03,
};
172
173/**
174 * struct logfs_segment_header - per-segment header in the ostore
175 *
176 * @crc: crc32 of header (there is no data)
177 * @pad: unused, must be 0
178 * @type: segment type, see above
179 * @level: GC level for all objects in this segment
180 * @segno: segment number
181 * @ec: erase count for this segment
182 * @gec: global erase count at time of writing
183 */
/* The members add up to 24 bytes, i.e. LOGFS_SEGMENT_HEADERSIZE (0x18). */
struct logfs_segment_header {
	__be32 crc;
	__be16 pad;
	__u8 type;	/* one of the SEG_* values */
	__u8 level;
	__be32 segno;
	__be32 ec;
	__be64 gec;
};
193
194SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE);
195
196#define LOGFS_FEATURES_INCOMPAT (0ull)
197#define LOGFS_FEATURES_RO_COMPAT (0ull)
198#define LOGFS_FEATURES_COMPAT (0ull)
199
/**
 * struct logfs_disk_super - on-medium superblock
 *
 * @ds_sh:			segment header at the start of the superblock
 * @ds_magic:			magic number, must equal LOGFS_MAGIC
 * @ds_crc:			crc32 of structure starting with the next field
 * @ds_ifile_levels:		maximum number of levels for ifile
 * @ds_iblock_levels:		maximum number of levels for regular files
 * @ds_data_levels:		number of separate levels for data
 * @ds_segment_shift:		log2 of segment size
 * @ds_block_shift:		log2 of block size
 * @ds_write_shift:		log2 of write size
 * @pad0:			reserved, must be 0
 * @ds_filesystem_size:		filesystem size (presumably in bytes - TODO
 *				confirm)
 * @ds_segment_size:		segment size (presumably in bytes - TODO
 *				confirm)
 * @ds_bad_seg_reserve:		number of segments reserved to handle bad blocks
 * @ds_feature_incompat:	incompatible filesystem features
 * @ds_feature_ro_compat:	read-only compatible filesystem features
 * @ds_feature_compat:		compatible filesystem features
 * @ds_feature_flags:		flags
 * @ds_root_reserve:		bytes reserved for the superuser
 * @ds_speed_reserve:		bytes reserved to speed up GC
 * @ds_journal_seg:		segments used by primary journal
 * @ds_super_ofs:		two offsets; presumably the locations of the
 *				superblock copies - TODO confirm
 * @pad3:			reserved, must be 0
 *
 * Contains only read-only fields.  Read-write fields like the amount of used
 * space is tracked in the dynamic superblock, which is stored in the journal.
 *
 * The blank lines below group the members into 16 byte rows (after the
 * 32 bytes of ds_sh plus ds_magic); the total is 256 bytes.
 */
struct logfs_disk_super {
	struct logfs_segment_header ds_sh;
	__be64 ds_magic;

	__be32 ds_crc;
	__u8 ds_ifile_levels;
	__u8 ds_iblock_levels;
	__u8 ds_data_levels;
	__u8 ds_segment_shift;
	__u8 ds_block_shift;
	__u8 ds_write_shift;
	__u8 pad0[6];

	__be64 ds_filesystem_size;
	__be32 ds_segment_size;
	__be32 ds_bad_seg_reserve;

	__be64 ds_feature_incompat;
	__be64 ds_feature_ro_compat;

	__be64 ds_feature_compat;
	__be64 ds_feature_flags;

	__be64 ds_root_reserve;
	__be64 ds_speed_reserve;

	__be32 ds_journal_seg[LOGFS_JOURNAL_SEGS];

	__be64 ds_super_ofs[2];
	__be64 pad3[8];
};
258
259SIZE_CHECK(logfs_disk_super, 256);
260
/*
 * Object types, stored in logfs_object_header.type:
 *	OBJ_BLOCK	- Data or indirect block
 *	OBJ_INODE	- Inode
 *	OBJ_DENTRY	- Dentry
 *
 * The values continue after the SEG_* segment types (0x01-0x03), so the
 * two sets of type values do not overlap.
 */
enum {
	OBJ_BLOCK = 0x04,
	OBJ_INODE = 0x05,
	OBJ_DENTRY = 0x06,
};
272
273/**
274 * struct logfs_object_header - per-object header in the ostore
275 *
276 * @crc: crc32 of header, excluding data_crc
277 * @len: length of data
278 * @type: object type, see above
279 * @compr: compression type
280 * @ino: inode number
281 * @bix: block index
282 * @data_crc: crc32 of payload
283 */
/*
 * The members add up to 28 bytes, i.e. LOGFS_OBJECT_HEADERSIZE (0x1c).
 * The structure is packed; without that the compiler may round the size
 * up to the alignment of the __be64 members.
 */
struct logfs_object_header {
	__be32 crc;
	__be16 len;
	__u8 type;	/* one of the OBJ_* values */
	__u8 compr;	/* presumably one of the COMPR_* values - TODO confirm */
	__be64 ino;
	__be64 bix;
	__be32 data_crc;
} __attribute__((packed));
293
294SIZE_CHECK(logfs_object_header, LOGFS_OBJECT_HEADERSIZE);
295
/*
 * Reserved inode numbers:
 *	LOGFS_INO_MAPPING	- inode number 0; purpose not visible in this
 *				  header (TODO document)
 *	LOGFS_INO_MASTER	- master inode (for inode file)
 *	LOGFS_INO_ROOT		- root directory
 *	LOGFS_INO_SEGFILE	- per-segment used bytes and erase count
 *	LOGFS_RESERVED_INOS	- presumably the number of reserved inode
 *				  numbers, i.e. the first one available for
 *				  regular use - TODO confirm
 */
enum {
	LOGFS_INO_MAPPING = 0x00,
	LOGFS_INO_MASTER = 0x01,
	LOGFS_INO_ROOT = 0x02,
	LOGFS_INO_SEGFILE = 0x03,
	LOGFS_RESERVED_INOS = 0x10,
};
309
310/*
311 * Inode flags. High bits should never be written to the medium. They are
312 * reserved for in-memory usage.
313 * Low bits should either remain in sync with the corresponding FS_*_FL or
314 * reuse slots that obviously don't make sense for logfs.
315 *
316 * LOGFS_IF_DIRTY Inode must be written back
317 * LOGFS_IF_ZOMBIE Inode has been deleted
318 * LOGFS_IF_STILLBORN -ENOSPC happened when creating inode
319 */
320#define LOGFS_IF_COMPRESSED 0x00000004 /* == FS_COMPR_FL */
321#define LOGFS_IF_DIRTY 0x20000000
322#define LOGFS_IF_ZOMBIE 0x40000000
323#define LOGFS_IF_STILLBORN 0x80000000
324
325/* Flags available to chattr */
326#define LOGFS_FL_USER_VISIBLE (LOGFS_IF_COMPRESSED)
327#define LOGFS_FL_USER_MODIFIABLE (LOGFS_IF_COMPRESSED)
328/* Flags inherited from parent directory on file/directory creation */
329#define LOGFS_FL_INHERITED (LOGFS_IF_COMPRESSED)
330
/**
 * struct logfs_disk_inode - on-medium inode
 *
 * @di_mode:		file mode
 * @di_height:		height; presumably of the inode's indirect block
 *			tree - TODO confirm
 * @di_pad:		reserved, must be 0
 * @di_flags:		inode flags, see above
 * @di_uid:		user id
 * @di_gid:		group id
 * @di_ctime:		change time
 * @di_mtime:		modify time
 * @di_atime:		access time
 * @di_refcount:	reference count (aka nlink or link count)
 * @di_generation:	inode generation, for nfs
 * @di_used_bytes:	number of bytes used
 * @di_size:		file size
 * @di_data:		data pointers
 *
 * The blank lines below group the members into 16 byte rows; together with
 * the LOGFS_EMBEDDED_FIELDS entries of @di_data the total is 200 bytes.
 */
struct logfs_disk_inode {
	__be16 di_mode;
	__u8 di_height;
	__u8 di_pad;
	__be32 di_flags;
	__be32 di_uid;
	__be32 di_gid;

	__be64 di_ctime;
	__be64 di_mtime;

	__be64 di_atime;
	__be32 di_refcount;
	__be32 di_generation;

	__be64 di_used_bytes;
	__be64 di_size;

	__be64 di_data[LOGFS_EMBEDDED_FIELDS];
};
367
368SIZE_CHECK(logfs_disk_inode, 200);
369
370#define INODE_POINTER_OFS \
371 (offsetof(struct logfs_disk_inode, di_data) / sizeof(__be64))
372#define INODE_USED_OFS \
373 (offsetof(struct logfs_disk_inode, di_used_bytes) / sizeof(__be64))
374#define INODE_SIZE_OFS \
375 (offsetof(struct logfs_disk_inode, di_size) / sizeof(__be64))
376#define INODE_HEIGHT_OFS (0)
377
378/**
379 * struct logfs_disk_dentry - on-medium dentry structure
380 *
381 * @ino: inode number
382 * @namelen: length of file name
383 * @type: file type, identical to bits 12..15 of mode
384 * @name: file name
385 */
/* FIXME: add 6 bytes of padding to remove the __packed */
/*
 * The members add up to 8 + 2 + 1 + LOGFS_MAX_NAMELEN (255) = 266 bytes,
 * which is why the structure is packed.
 */
struct logfs_disk_dentry {
	__be64 ino;
	__be16 namelen;
	__u8 type;
	__u8 name[LOGFS_MAX_NAMELEN];
} __attribute__((packed));
393
394SIZE_CHECK(logfs_disk_dentry, 266);
395
396#define RESERVED 0xffffffff
397#define BADSEG 0xffffffff
398/**
399 * struct logfs_segment_entry - segment file entry
400 *
401 * @ec_level: erase count and level
402 * @valid: number of valid bytes
403 *
404 * Segment file contains one entry for every segment. ec_level contains the
405 * erasecount in the upper 28 bits and the level in the lower 4 bits. An
406 * ec_level of BADSEG (-1) identifies bad segments. valid contains the number
407 * of valid bytes or RESERVED (-1 again) if the segment is used for either the
408 * superblock or the journal, or when the segment is bad.
409 */
struct logfs_segment_entry {
	__be32 ec_level;	/* erase count in bits 4..31, level in bits 0..3 */
	__be32 valid;		/* valid bytes, or RESERVED */
};
414
415SIZE_CHECK(logfs_segment_entry, 8);
416
417/**
418 * struct logfs_journal_header - header for journal entries (JEs)
419 *
420 * @h_crc: crc32 of journal entry
421 * @h_len: length of compressed journal entry,
422 * not including header
423 * @h_datalen: length of uncompressed data
424 * @h_type: JE type
425 * @h_compr: compression type
426 * @h_pad: reserved
427 */
/* 4 + 2 + 2 + 2 + 1 + 5 = 16 bytes; h_pad fills the header up to that size. */
struct logfs_journal_header {
	__be32 h_crc;
	__be16 h_len;
	__be16 h_datalen;
	__be16 h_type;		/* presumably one of the JE_* values - TODO confirm */
	__u8 h_compr;		/* presumably one of the COMPR_* values - TODO confirm */
	__u8 h_pad[5];
};
436
437SIZE_CHECK(logfs_journal_header, 16);
438
/*
 * Life expectancy of data.
 * VIM_DEFAULT	- default vim
 * VIM_SEGFILE	- for segment file only - very short-living
 *
 * NOTE(review): this comment used to list a VIM_GC ("GC'd data - likely
 * long-living") as well, but no such enumerator is defined here.
 */
enum logfs_vim {
	VIM_DEFAULT = 0,
	VIM_SEGFILE = 1,
};
449
/**
 * struct logfs_je_area - wbuf header
 *
 * @segno:		segment number of area
 * @used_bytes:		number of bytes already used
 * @gc_level:		GC level
 * @vim:		life expectancy of data
 *
 * "Areas" are segments currently being used for writing.  There is at least
 * one area per GC level.  Several may be used to separate long-living from
 * short-living data.  If an area with unknown vim is encountered, it can
 * simply be closed.
 * The write buffer immediately follows this header.
 *
 * The members add up to 10 bytes, hence the packed attribute.
 */
struct logfs_je_area {
	__be32 segno;
	__be32 used_bytes;
	__u8 gc_level;
	__u8 vim;
} __attribute__((packed));
470
471SIZE_CHECK(logfs_je_area, 10);
472
473#define MAX_JOURNAL_HEADER \
474 (sizeof(struct logfs_journal_header) + sizeof(struct logfs_je_area))
475
/**
 * struct logfs_je_dynsb - dynamic superblock
 *
 * @ds_gec:		global erase count
 * @ds_sweeper:		current position of GC "sweeper"
 * @ds_rename_dir:	source directory ino (see dir.c documentation)
 * @ds_rename_pos:	position of source dd (see dir.c documentation)
 * @ds_victim_ino:	victims of incomplete dir operation (see dir.c)
 * @ds_victim_parent:	parent inode of victim (see dir.c)
 * @ds_used_bytes:	number of used bytes
 * @ds_generation:	generation number; its use is not visible in this
 *			header (TODO document)
 * @pad:		padding up to the structure size of 64 bytes
 */
struct logfs_je_dynsb {
	__be64 ds_gec;
	__be64 ds_sweeper;

	__be64 ds_rename_dir;
	__be64 ds_rename_pos;

	__be64 ds_victim_ino;
	__be64 ds_victim_parent; /* XXX */

	__be64 ds_used_bytes;
	__be32 ds_generation;
	__be32 pad;
};
501
502SIZE_CHECK(logfs_je_dynsb, 64);
503
/**
 * struct logfs_je_anchor - anchor of filesystem tree, aka master inode
 *
 * @da_size:		size of inode file
 * @da_last_ino:	last created inode
 * @da_used_bytes:	number of bytes used
 * @da_height:		height; presumably the counterpart of di_height for
 *			the inode file - TODO confirm
 * @pad:		padding so that @da_data starts at a 64bit boundary
 * @da_data:		data pointers
 *
 * 8 + 8 + 8 + 1 + 7 bytes plus LOGFS_EMBEDDED_FIELDS pointers: 168 bytes.
 */
struct logfs_je_anchor {
	__be64 da_size;
	__be64 da_last_ino;

	__be64 da_used_bytes;
	u8 da_height;
	u8 pad[7];

	__be64 da_data[LOGFS_EMBEDDED_FIELDS];
};
522
523SIZE_CHECK(logfs_je_anchor, 168);
524
525/**
526 * struct logfs_je_spillout - spillout entry (from 1st to 2nd journal)
527 *
528 * @so_segment: segments used for 2nd journal
529 *
530 * Length of the array is given by h_len field in the header.
531 */
struct logfs_je_spillout {
	/* zero-length array: the structure itself has size 0 */
	__be64 so_segment[0];
};
535
536SIZE_CHECK(logfs_je_spillout, 0);
537
538/**
539 * struct logfs_je_journal_ec - erase counts for all journal segments
540 *
541 * @ec: erase count
542 *
543 * Length of the array is given by h_len field in the header.
544 */
struct logfs_je_journal_ec {
	/* zero-length array: the structure itself has size 0 */
	__be32 ec[0];
};
548
549SIZE_CHECK(logfs_je_journal_ec, 0);
550
/**
 * struct logfs_je_free_segments - list of free segments with erase count
 *
 * @segno:		segment number
 * @ec:			erase count
 */
struct logfs_je_free_segments {
	__be32 segno;
	__be32 ec;
};
558
559SIZE_CHECK(logfs_je_free_segments, 8);
560
561/**
562 * struct logfs_seg_alias - list of segment aliases
563 */
/* One alias entry: a pair of 32bit segment numbers, 8 bytes in total. */
struct logfs_seg_alias {
	__be32 old_segno;
	__be32 new_segno;
};
568
569SIZE_CHECK(logfs_seg_alias, 8);
570
571/**
572 * struct logfs_obj_alias - list of object aliases
573 */
/* 8 + 8 + 8 + 1 + 5 + 2 = 32 bytes; pad keeps child_no at the very end. */
struct logfs_obj_alias {
	__be64 ino;
	__be64 bix;
	__be64 val;
	u8 level;
	u8 pad[5];
	__be16 child_no;
};
582
583SIZE_CHECK(logfs_obj_alias, 32);
584
/**
 * Compression types.
 *
 * COMPR_NONE	- uncompressed
 * COMPR_ZLIB	- compressed with zlib
 *
 * Presumably the values stored in the compr/h_compr header fields - TODO
 * confirm.
 */
enum {
	COMPR_NONE = 0,
	COMPR_ZLIB = 1,
};
595
/*
 * Journal entries come in groups of 16.  First group contains unique
 * entries, next groups contain one entry per level
 *
 * JE_FIRST	- smallest possible journal entry number
 *
 * JEG_BASE	- base group, containing unique entries
 * JE_COMMIT	- commit entry, validates all previous entries
 * JE_DYNSB	- dynamic superblock, anything that ought to be in the
 *		  superblock but cannot because it is read-write data
 * JE_ANCHOR	- anchor aka master inode aka inode file's inode
 * JE_ERASECOUNT - erasecounts for all journal segments
 * JE_SPILLOUT	- unused
 * JE_OBJ_ALIAS	- object aliases (this comment used to name a JE_SEG_ALIAS,
 *		  which is not defined below)
 * JE_AREA	- area description
 *
 * JE_LAST	- largest possible journal entry number
 */
enum {
	JE_FIRST = 0x01,

	JEG_BASE = 0x00,
	JE_COMMIT = 0x02,
	JE_DYNSB = 0x03,
	JE_ANCHOR = 0x04,
	JE_ERASECOUNT = 0x05,
	JE_SPILLOUT = 0x06,
	JE_OBJ_ALIAS = 0x0d,
	JE_AREA = 0x0e,

	JE_LAST = 0x0e,
};
628
629#endif
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
new file mode 100644
index 000000000000..7a23b3e7c0a7
--- /dev/null
+++ b/fs/logfs/readwrite.c
@@ -0,0 +1,2246 @@
1/*
2 * fs/logfs/readwrite.c
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 *
 * Actually contains eight sets of very similar functions:
10 * read read blocks from a file
11 * seek_hole find next hole
12 * seek_data find next data block
13 * valid check whether a block still belongs to a file
14 * write write blocks to a file
15 * delete delete a block (for directories and ifile)
16 * rewrite move existing blocks of a file to a new location (gc helper)
17 * truncate truncate a file
18 */
19#include "logfs.h"
20#include <linux/sched.h>
21
22static u64 adjust_bix(u64 bix, level_t level)
23{
24 switch (level) {
25 case 0:
26 return bix;
27 case LEVEL(1):
28 return max_t(u64, bix, I0_BLOCKS);
29 case LEVEL(2):
30 return max_t(u64, bix, I1_BLOCKS);
31 case LEVEL(3):
32 return max_t(u64, bix, I2_BLOCKS);
33 case LEVEL(4):
34 return max_t(u64, bix, I3_BLOCKS);
35 case LEVEL(5):
36 return max_t(u64, bix, I4_BLOCKS);
37 default:
38 WARN_ON(1);
39 return bix;
40 }
41}
42
43static inline u64 maxbix(u8 height)
44{
45 return 1ULL << (LOGFS_BLOCK_BITS * height);
46}
47
48/**
49 * The inode address space is cut in two halves. Lower half belongs to data
50 * pages, upper half to indirect blocks. If the high bit (INDIRECT_BIT) is
51 * set, the actual block index (bix) and level can be derived from the page
52 * index.
53 *
54 * The lowest three bits of the block index are set to 0 after packing and
55 * unpacking. Since the lowest n bits (9 for 4KiB blocksize) are ignored
56 * anyway this is harmless.
57 */
58#define ARCH_SHIFT (BITS_PER_LONG - 32)
59#define INDIRECT_BIT (0x80000000UL << ARCH_SHIFT)
60#define LEVEL_SHIFT (28 + ARCH_SHIFT)
61static inline pgoff_t first_indirect_block(void)
62{
63 return INDIRECT_BIT | (1ULL << LEVEL_SHIFT);
64}
65
66pgoff_t logfs_pack_index(u64 bix, level_t level)
67{
68 pgoff_t index;
69
70 BUG_ON(bix >= INDIRECT_BIT);
71 if (level == 0)
72 return bix;
73
74 index = INDIRECT_BIT;
75 index |= (__force long)level << LEVEL_SHIFT;
76 index |= bix >> ((__force u8)level * LOGFS_BLOCK_BITS);
77 return index;
78}
79
80void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level)
81{
82 u8 __level;
83
84 if (!(index & INDIRECT_BIT)) {
85 *bix = index;
86 *level = 0;
87 return;
88 }
89
90 __level = (index & ~INDIRECT_BIT) >> LEVEL_SHIFT;
91 *level = LEVEL(__level);
92 *bix = (index << (__level * LOGFS_BLOCK_BITS)) & ~INDIRECT_BIT;
93 *bix = adjust_bix(*bix, *level);
94 return;
95}
96#undef ARCH_SHIFT
97#undef INDIRECT_BIT
98#undef LEVEL_SHIFT
99
100/*
101 * Time is stored as nanoseconds since the epoch.
102 */
103static struct timespec be64_to_timespec(__be64 betime)
104{
105 return ns_to_timespec(be64_to_cpu(betime));
106}
107
108static __be64 timespec_to_be64(struct timespec tsp)
109{
110 return cpu_to_be64((u64)tsp.tv_sec * NSEC_PER_SEC + tsp.tv_nsec);
111}
112
113static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode*inode)
114{
115 struct logfs_inode *li = logfs_inode(inode);
116 int i;
117
118 inode->i_mode = be16_to_cpu(di->di_mode);
119 li->li_height = di->di_height;
120 li->li_flags = be32_to_cpu(di->di_flags);
121 inode->i_uid = be32_to_cpu(di->di_uid);
122 inode->i_gid = be32_to_cpu(di->di_gid);
123 inode->i_size = be64_to_cpu(di->di_size);
124 logfs_set_blocks(inode, be64_to_cpu(di->di_used_bytes));
125 inode->i_atime = be64_to_timespec(di->di_atime);
126 inode->i_ctime = be64_to_timespec(di->di_ctime);
127 inode->i_mtime = be64_to_timespec(di->di_mtime);
128 inode->i_nlink = be32_to_cpu(di->di_refcount);
129 inode->i_generation = be32_to_cpu(di->di_generation);
130
131 switch (inode->i_mode & S_IFMT) {
132 case S_IFSOCK: /* fall through */
133 case S_IFBLK: /* fall through */
134 case S_IFCHR: /* fall through */
135 case S_IFIFO:
136 inode->i_rdev = be64_to_cpu(di->di_data[0]);
137 break;
138 case S_IFDIR: /* fall through */
139 case S_IFREG: /* fall through */
140 case S_IFLNK:
141 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
142 li->li_data[i] = be64_to_cpu(di->di_data[i]);
143 break;
144 default:
145 BUG();
146 }
147}
148
149static void logfs_inode_to_disk(struct inode *inode, struct logfs_disk_inode*di)
150{
151 struct logfs_inode *li = logfs_inode(inode);
152 int i;
153
154 di->di_mode = cpu_to_be16(inode->i_mode);
155 di->di_height = li->li_height;
156 di->di_pad = 0;
157 di->di_flags = cpu_to_be32(li->li_flags);
158 di->di_uid = cpu_to_be32(inode->i_uid);
159 di->di_gid = cpu_to_be32(inode->i_gid);
160 di->di_size = cpu_to_be64(i_size_read(inode));
161 di->di_used_bytes = cpu_to_be64(li->li_used_bytes);
162 di->di_atime = timespec_to_be64(inode->i_atime);
163 di->di_ctime = timespec_to_be64(inode->i_ctime);
164 di->di_mtime = timespec_to_be64(inode->i_mtime);
165 di->di_refcount = cpu_to_be32(inode->i_nlink);
166 di->di_generation = cpu_to_be32(inode->i_generation);
167
168 switch (inode->i_mode & S_IFMT) {
169 case S_IFSOCK: /* fall through */
170 case S_IFBLK: /* fall through */
171 case S_IFCHR: /* fall through */
172 case S_IFIFO:
173 di->di_data[0] = cpu_to_be64(inode->i_rdev);
174 break;
175 case S_IFDIR: /* fall through */
176 case S_IFREG: /* fall through */
177 case S_IFLNK:
178 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
179 di->di_data[i] = cpu_to_be64(li->li_data[i]);
180 break;
181 default:
182 BUG();
183 }
184}
185
186static void __logfs_set_blocks(struct inode *inode)
187{
188 struct super_block *sb = inode->i_sb;
189 struct logfs_inode *li = logfs_inode(inode);
190
191 inode->i_blocks = ULONG_MAX;
192 if (li->li_used_bytes >> sb->s_blocksize_bits < ULONG_MAX)
193 inode->i_blocks = ALIGN(li->li_used_bytes, 512) >> 9;
194}
195
196void logfs_set_blocks(struct inode *inode, u64 bytes)
197{
198 struct logfs_inode *li = logfs_inode(inode);
199
200 li->li_used_bytes = bytes;
201 __logfs_set_blocks(inode);
202}
203
204static void prelock_page(struct super_block *sb, struct page *page, int lock)
205{
206 struct logfs_super *super = logfs_super(sb);
207
208 BUG_ON(!PageLocked(page));
209 if (lock) {
210 BUG_ON(PagePreLocked(page));
211 SetPagePreLocked(page);
212 } else {
213 /* We are in GC path. */
214 if (PagePreLocked(page))
215 super->s_lock_count++;
216 else
217 SetPagePreLocked(page);
218 }
219}
220
221static void preunlock_page(struct super_block *sb, struct page *page, int lock)
222{
223 struct logfs_super *super = logfs_super(sb);
224
225 BUG_ON(!PageLocked(page));
226 if (lock)
227 ClearPagePreLocked(page);
228 else {
229 /* We are in GC path. */
230 BUG_ON(!PagePreLocked(page));
231 if (super->s_lock_count)
232 super->s_lock_count--;
233 else
234 ClearPagePreLocked(page);
235 }
236}
237
238/*
239 * Logfs is prone to an AB-BA deadlock where one task tries to acquire
240 * s_write_mutex with a locked page and GC tries to get that page while holding
241 * s_write_mutex.
242 * To solve this issue logfs will ignore the page lock iff the page in question
243 * is waiting for s_write_mutex. We annotate this fact by setting PG_pre_locked
244 * in addition to PG_locked.
245 */
246static void logfs_get_wblocks(struct super_block *sb, struct page *page,
247 int lock)
248{
249 struct logfs_super *super = logfs_super(sb);
250
251 if (page)
252 prelock_page(sb, page, lock);
253
254 if (lock) {
255 mutex_lock(&super->s_write_mutex);
256 logfs_gc_pass(sb);
257 /* FIXME: We also have to check for shadowed space
258 * and mempool fill grade */
259 }
260}
261
262static void logfs_put_wblocks(struct super_block *sb, struct page *page,
263 int lock)
264{
265 struct logfs_super *super = logfs_super(sb);
266
267 if (page)
268 preunlock_page(sb, page, lock);
269 /* Order matters - we must clear PG_pre_locked before releasing
270 * s_write_mutex or we could race against another task. */
271 if (lock)
272 mutex_unlock(&super->s_write_mutex);
273}
274
275static struct page *logfs_get_read_page(struct inode *inode, u64 bix,
276 level_t level)
277{
278 return find_or_create_page(inode->i_mapping,
279 logfs_pack_index(bix, level), GFP_NOFS);
280}
281
/* Unlock the page and drop a reference on it. */
static void logfs_put_read_page(struct page *page)
{
	unlock_page(page);
	page_cache_release(page);
}
287
288static void logfs_lock_write_page(struct page *page)
289{
290 int loop = 0;
291
292 while (unlikely(!trylock_page(page))) {
293 if (loop++ > 0x1000) {
294 /* Has been observed once so far... */
295 printk(KERN_ERR "stack at %p\n", &loop);
296 BUG();
297 }
298 if (PagePreLocked(page)) {
299 /* Holder of page lock is waiting for us, it
300 * is safe to use this page. */
301 break;
302 }
303 /* Some other process has this page locked and has
304 * nothing to do with us. Wait for it to finish.
305 */
306 schedule();
307 }
308 BUG_ON(!PageLocked(page));
309}
310
311static struct page *logfs_get_write_page(struct inode *inode, u64 bix,
312 level_t level)
313{
314 struct address_space *mapping = inode->i_mapping;
315 pgoff_t index = logfs_pack_index(bix, level);
316 struct page *page;
317 int err;
318
319repeat:
320 page = find_get_page(mapping, index);
321 if (!page) {
322 page = __page_cache_alloc(GFP_NOFS);
323 if (!page)
324 return NULL;
325 err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS);
326 if (unlikely(err)) {
327 page_cache_release(page);
328 if (err == -EEXIST)
329 goto repeat;
330 return NULL;
331 }
332 } else logfs_lock_write_page(page);
333 BUG_ON(!PageLocked(page));
334 return page;
335}
336
/* Unlock a write page unless it is pre-locked. */
static void logfs_unlock_write_page(struct page *page)
{
	if (PagePreLocked(page))
		return;

	unlock_page(page);
}
342
/* Unlock a write page (unless pre-locked) and drop a reference on it. */
static void logfs_put_write_page(struct page *page)
{
	logfs_unlock_write_page(page);
	page_cache_release(page);
}
348
349static struct page *logfs_get_page(struct inode *inode, u64 bix, level_t level,
350 int rw)
351{
352 if (rw == READ)
353 return logfs_get_read_page(inode, bix, level);
354 else
355 return logfs_get_write_page(inode, bix, level);
356}
357
358static void logfs_put_page(struct page *page, int rw)
359{
360 if (rw == READ)
361 logfs_put_read_page(page);
362 else
363 logfs_put_write_page(page);
364}
365
366static unsigned long __get_bits(u64 val, int skip, int no)
367{
368 u64 ret = val;
369
370 ret >>= skip * no;
371 ret <<= 64 - no;
372 ret >>= 64 - no;
373 return ret;
374}
375
376static unsigned long get_bits(u64 val, level_t skip)
377{
378 return __get_bits(val, (__force int)skip, LOGFS_BLOCK_BITS);
379}
380
381static inline void init_shadow_tree(struct super_block *sb,
382 struct shadow_tree *tree)
383{
384 struct logfs_super *super = logfs_super(sb);
385
386 btree_init_mempool64(&tree->new, super->s_btree_pool);
387 btree_init_mempool64(&tree->old, super->s_btree_pool);
388}
389
390static void indirect_write_block(struct logfs_block *block)
391{
392 struct page *page;
393 struct inode *inode;
394 int ret;
395
396 page = block->page;
397 inode = page->mapping->host;
398 logfs_lock_write_page(page);
399 ret = logfs_write_buf(inode, page, 0);
400 logfs_unlock_write_page(page);
401 /*
402 * This needs some rework. Unless you want your filesystem to run
403 * completely synchronously (you don't), the filesystem will always
404 * report writes as 'successful' before the actual work has been
405 * done. The actual work gets done here and this is where any errors
406 * will show up. And there isn't much we can do about it, really.
407 *
408 * Some attempts to fix the errors (move from bad blocks, retry io,...)
409 * have already been done, so anything left should be either a broken
410 * device or a bug somewhere in logfs itself. Being relatively new,
411 * the odds currently favor a bug, so for now the line below isn't
412 * entirely tasteles.
413 */
414 BUG_ON(ret);
415}
416
417static void inode_write_block(struct logfs_block *block)
418{
419 struct inode *inode;
420 int ret;
421
422 inode = block->inode;
423 if (inode->i_ino == LOGFS_INO_MASTER)
424 logfs_write_anchor(inode->i_sb);
425 else {
426 ret = __logfs_write_inode(inode, 0);
427 /* see indirect_write_block comment */
428 BUG_ON(ret);
429 }
430}
431
432static gc_level_t inode_block_level(struct logfs_block *block)
433{
434 BUG_ON(block->inode->i_ino == LOGFS_INO_MASTER);
435 return GC_LEVEL(LOGFS_MAX_LEVELS);
436}
437
438static gc_level_t indirect_block_level(struct logfs_block *block)
439{
440 struct page *page;
441 struct inode *inode;
442 u64 bix;
443 level_t level;
444
445 page = block->page;
446 inode = page->mapping->host;
447 logfs_unpack_index(page->index, &bix, &level);
448 return expand_level(inode->i_ino, level);
449}
450
451/*
452 * This silences a false, yet annoying gcc warning. I hate it when my editor
453 * jumps into bitops.h each time I recompile this file.
454 * TODO: Complain to gcc folks about this and upgrade compiler.
455 */
static unsigned long fnb(const unsigned long *addr,
		unsigned long size, unsigned long offset)
{
	unsigned long next = find_next_bit(addr, size, offset);

	return next;
}
461
462static __be64 inode_val0(struct inode *inode)
463{
464 struct logfs_inode *li = logfs_inode(inode);
465 u64 val;
466
467 /*
468 * Explicit shifting generates good code, but must match the format
469 * of the structure. Add some paranoia just in case.
470 */
471 BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_mode) != 0);
472 BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_height) != 2);
473 BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_flags) != 4);
474
475 val = (u64)inode->i_mode << 48 |
476 (u64)li->li_height << 40 |
477 (u64)li->li_flags;
478 return cpu_to_be64(val);
479}
480
481static int inode_write_alias(struct super_block *sb,
482 struct logfs_block *block, write_alias_t *write_one_alias)
483{
484 struct inode *inode = block->inode;
485 struct logfs_inode *li = logfs_inode(inode);
486 unsigned long pos;
487 u64 ino , bix;
488 __be64 val;
489 level_t level;
490 int err;
491
492 for (pos = 0; ; pos++) {
493 pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
494 if (pos >= LOGFS_EMBEDDED_FIELDS + INODE_POINTER_OFS)
495 return 0;
496
497 switch (pos) {
498 case INODE_HEIGHT_OFS:
499 val = inode_val0(inode);
500 break;
501 case INODE_USED_OFS:
502 val = cpu_to_be64(li->li_used_bytes);;
503 break;
504 case INODE_SIZE_OFS:
505 val = cpu_to_be64(i_size_read(inode));
506 break;
507 case INODE_POINTER_OFS ... INODE_POINTER_OFS + LOGFS_EMBEDDED_FIELDS - 1:
508 val = cpu_to_be64(li->li_data[pos - INODE_POINTER_OFS]);
509 break;
510 default:
511 BUG();
512 }
513
514 ino = LOGFS_INO_MASTER;
515 bix = inode->i_ino;
516 level = LEVEL(0);
517 err = write_one_alias(sb, ino, bix, level, pos, val);
518 if (err)
519 return err;
520 }
521}
522
523static int indirect_write_alias(struct super_block *sb,
524 struct logfs_block *block, write_alias_t *write_one_alias)
525{
526 unsigned long pos;
527 struct page *page = block->page;
528 u64 ino , bix;
529 __be64 *child, val;
530 level_t level;
531 int err;
532
533 for (pos = 0; ; pos++) {
534 pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
535 if (pos >= LOGFS_BLOCK_FACTOR)
536 return 0;
537
538 ino = page->mapping->host->i_ino;
539 logfs_unpack_index(page->index, &bix, &level);
540 child = kmap_atomic(page, KM_USER0);
541 val = child[pos];
542 kunmap_atomic(child, KM_USER0);
543 err = write_one_alias(sb, ino, bix, level, pos, val);
544 if (err)
545 return err;
546 }
547}
548
549int logfs_write_obj_aliases_pagecache(struct super_block *sb)
550{
551 struct logfs_super *super = logfs_super(sb);
552 struct logfs_block *block;
553 int err;
554
555 list_for_each_entry(block, &super->s_object_alias, alias_list) {
556 err = block->ops->write_alias(sb, block, write_alias_journal);
557 if (err)
558 return err;
559 }
560 return 0;
561}
562
563void __free_block(struct super_block *sb, struct logfs_block *block)
564{
565 BUG_ON(!list_empty(&block->item_list));
566 list_del(&block->alias_list);
567 mempool_free(block, logfs_super(sb)->s_block_pool);
568}
569
570static void inode_free_block(struct super_block *sb, struct logfs_block *block)
571{
572 struct inode *inode = block->inode;
573
574 logfs_inode(inode)->li_block = NULL;
575 __free_block(sb, block);
576}
577
578static void indirect_free_block(struct super_block *sb,
579 struct logfs_block *block)
580{
581 ClearPagePrivate(block->page);
582 block->page->private = 0;
583 __free_block(sb, block);
584}
585
586
/*
 * Callback table for blocks that are attached to an inode through
 * li_block; installed by alloc_inode_block().
 */
587static struct logfs_block_ops inode_block_ops = {
588 .write_block = inode_write_block,
589 .block_level = inode_block_level,
590 .free_block = inode_free_block,
591 .write_alias = inode_write_alias,
592};
593
/*
 * Callback table for blocks that are attached to a page through
 * page->private; installed by alloc_data_block().
 */
594struct logfs_block_ops indirect_block_ops = {
595 .write_block = indirect_write_block,
596 .block_level = indirect_block_level,
597 .free_block = indirect_free_block,
598 .write_alias = indirect_write_alias,
599};
600
601struct logfs_block *__alloc_block(struct super_block *sb,
602 u64 ino, u64 bix, level_t level)
603{
604 struct logfs_super *super = logfs_super(sb);
605 struct logfs_block *block;
606
607 block = mempool_alloc(super->s_block_pool, GFP_NOFS);
608 memset(block, 0, sizeof(*block));
609 INIT_LIST_HEAD(&block->alias_list);
610 INIT_LIST_HEAD(&block->item_list);
611 block->sb = sb;
612 block->ino = ino;
613 block->bix = bix;
614 block->level = level;
615 return block;
616}
617
618static void alloc_inode_block(struct inode *inode)
619{
620 struct logfs_inode *li = logfs_inode(inode);
621 struct logfs_block *block;
622
623 if (li->li_block)
624 return;
625
626 block = __alloc_block(inode->i_sb, LOGFS_INO_MASTER, inode->i_ino, 0);
627 block->inode = inode;
628 li->li_block = block;
629 block->ops = &inode_block_ops;
630}
631
632void initialize_block_counters(struct page *page, struct logfs_block *block,
633 __be64 *array, int page_is_empty)
634{
635 u64 ptr;
636 int i, start;
637
638 block->partial = 0;
639 block->full = 0;
640 start = 0;
641 if (page->index < first_indirect_block()) {
642 /* Counters are pointless on level 0 */
643 return;
644 }
645 if (page->index == first_indirect_block()) {
646 /* Skip unused pointers */
647 start = I0_BLOCKS;
648 block->full = I0_BLOCKS;
649 }
650 if (!page_is_empty) {
651 for (i = start; i < LOGFS_BLOCK_FACTOR; i++) {
652 ptr = be64_to_cpu(array[i]);
653 if (ptr)
654 block->partial++;
655 if (ptr & LOGFS_FULLY_POPULATED)
656 block->full++;
657 }
658 }
659}
660
661static void alloc_data_block(struct inode *inode, struct page *page)
662{
663 struct logfs_block *block;
664 u64 bix;
665 level_t level;
666
667 if (PagePrivate(page))
668 return;
669
670 logfs_unpack_index(page->index, &bix, &level);
671 block = __alloc_block(inode->i_sb, inode->i_ino, bix, level);
672 block->page = page;
673 SetPagePrivate(page);
674 page->private = (unsigned long)block;
675 block->ops = &indirect_block_ops;
676}
677
678static void alloc_indirect_block(struct inode *inode, struct page *page,
679 int page_is_empty)
680{
681 struct logfs_block *block;
682 __be64 *array;
683
684 if (PagePrivate(page))
685 return;
686
687 alloc_data_block(inode, page);
688
689 block = logfs_block(page);
690 array = kmap_atomic(page, KM_USER0);
691 initialize_block_counters(page, block, array, page_is_empty);
692 kunmap_atomic(array, KM_USER0);
693}
694
695static void block_set_pointer(struct page *page, int index, u64 ptr)
696{
697 struct logfs_block *block = logfs_block(page);
698 __be64 *array;
699 u64 oldptr;
700
701 BUG_ON(!block);
702 array = kmap_atomic(page, KM_USER0);
703 oldptr = be64_to_cpu(array[index]);
704 array[index] = cpu_to_be64(ptr);
705 kunmap_atomic(array, KM_USER0);
706 SetPageUptodate(page);
707
708 block->full += !!(ptr & LOGFS_FULLY_POPULATED)
709 - !!(oldptr & LOGFS_FULLY_POPULATED);
710 block->partial += !!ptr - !!oldptr;
711}
712
713static u64 block_get_pointer(struct page *page, int index)
714{
715 __be64 *block;
716 u64 ptr;
717
718 block = kmap_atomic(page, KM_USER0);
719 ptr = be64_to_cpu(block[index]);
720 kunmap_atomic(block, KM_USER0);
721 return ptr;
722}
723
724static int logfs_read_empty(struct page *page)
725{
726 zero_user_segment(page, 0, PAGE_CACHE_SIZE);
727 return 0;
728}
729
730static int logfs_read_direct(struct inode *inode, struct page *page)
731{
732 struct logfs_inode *li = logfs_inode(inode);
733 pgoff_t index = page->index;
734 u64 block;
735
736 block = li->li_data[index];
737 if (!block)
738 return logfs_read_empty(page);
739
740 return logfs_segment_read(inode, page, block, index, 0);
741}
742
743static int logfs_read_loop(struct inode *inode, struct page *page,
744 int rw_context)
745{
746 struct logfs_inode *li = logfs_inode(inode);
747 u64 bix, bofs = li->li_data[INDIRECT_INDEX];
748 level_t level, target_level;
749 int ret;
750 struct page *ipage;
751
752 logfs_unpack_index(page->index, &bix, &target_level);
753 if (!bofs)
754 return logfs_read_empty(page);
755
756 if (bix >= maxbix(li->li_height))
757 return logfs_read_empty(page);
758
759 for (level = LEVEL(li->li_height);
760 (__force u8)level > (__force u8)target_level;
761 level = SUBLEVEL(level)){
762 ipage = logfs_get_page(inode, bix, level, rw_context);
763 if (!ipage)
764 return -ENOMEM;
765
766 ret = logfs_segment_read(inode, ipage, bofs, bix, level);
767 if (ret) {
768 logfs_put_read_page(ipage);
769 return ret;
770 }
771
772 bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level)));
773 logfs_put_page(ipage, rw_context);
774 if (!bofs)
775 return logfs_read_empty(page);
776 }
777
778 return logfs_segment_read(inode, page, bofs, bix, 0);
779}
780
781static int logfs_read_block(struct inode *inode, struct page *page,
782 int rw_context)
783{
784 pgoff_t index = page->index;
785
786 if (index < I0_BLOCKS)
787 return logfs_read_direct(inode, page);
788 return logfs_read_loop(inode, page, rw_context);
789}
790
791static int logfs_exist_loop(struct inode *inode, u64 bix)
792{
793 struct logfs_inode *li = logfs_inode(inode);
794 u64 bofs = li->li_data[INDIRECT_INDEX];
795 level_t level;
796 int ret;
797 struct page *ipage;
798
799 if (!bofs)
800 return 0;
801 if (bix >= maxbix(li->li_height))
802 return 0;
803
804 for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
805 ipage = logfs_get_read_page(inode, bix, level);
806 if (!ipage)
807 return -ENOMEM;
808
809 ret = logfs_segment_read(inode, ipage, bofs, bix, level);
810 if (ret) {
811 logfs_put_read_page(ipage);
812 return ret;
813 }
814
815 bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level)));
816 logfs_put_read_page(ipage);
817 if (!bofs)
818 return 0;
819 }
820
821 return 1;
822}
823
824int logfs_exist_block(struct inode *inode, u64 bix)
825{
826 struct logfs_inode *li = logfs_inode(inode);
827
828 if (bix < I0_BLOCKS)
829 return !!li->li_data[bix];
830 return logfs_exist_loop(inode, bix);
831}
832
833static u64 seek_holedata_direct(struct inode *inode, u64 bix, int data)
834{
835 struct logfs_inode *li = logfs_inode(inode);
836
837 for (; bix < I0_BLOCKS; bix++)
838 if (data ^ (li->li_data[bix] == 0))
839 return bix;
840 return I0_BLOCKS;
841}
842
843static u64 seek_holedata_loop(struct inode *inode, u64 bix, int data)
844{
845 struct logfs_inode *li = logfs_inode(inode);
846 __be64 *rblock;
847 u64 increment, bofs = li->li_data[INDIRECT_INDEX];
848 level_t level;
849 int ret, slot;
850 struct page *page;
851
852 BUG_ON(!bofs);
853
854 for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
855 increment = 1 << (LOGFS_BLOCK_BITS * ((__force u8)level-1));
856 page = logfs_get_read_page(inode, bix, level);
857 if (!page)
858 return bix;
859
860 ret = logfs_segment_read(inode, page, bofs, bix, level);
861 if (ret) {
862 logfs_put_read_page(page);
863 return bix;
864 }
865
866 slot = get_bits(bix, SUBLEVEL(level));
867 rblock = kmap_atomic(page, KM_USER0);
868 while (slot < LOGFS_BLOCK_FACTOR) {
869 if (data && (rblock[slot] != 0))
870 break;
871 if (!data && !(be64_to_cpu(rblock[slot]) & LOGFS_FULLY_POPULATED))
872 break;
873 slot++;
874 bix += increment;
875 bix &= ~(increment - 1);
876 }
877 if (slot >= LOGFS_BLOCK_FACTOR) {
878 kunmap_atomic(rblock, KM_USER0);
879 logfs_put_read_page(page);
880 return bix;
881 }
882 bofs = be64_to_cpu(rblock[slot]);
883 kunmap_atomic(rblock, KM_USER0);
884 logfs_put_read_page(page);
885 if (!bofs) {
886 BUG_ON(data);
887 return bix;
888 }
889 }
890 return bix;
891}
892
893/**
894 * logfs_seek_hole - find next hole starting at a given block index
895 * @inode: inode to search in
896 * @bix: block index to start searching
897 *
898 * Returns next hole. If the file doesn't contain any further holes, the
899 * block address next to eof is returned instead.
900 */
901u64 logfs_seek_hole(struct inode *inode, u64 bix)
902{
903 struct logfs_inode *li = logfs_inode(inode);
904
905 if (bix < I0_BLOCKS) {
906 bix = seek_holedata_direct(inode, bix, 0);
907 if (bix < I0_BLOCKS)
908 return bix;
909 }
910
911 if (!li->li_data[INDIRECT_INDEX])
912 return bix;
913 else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED)
914 bix = maxbix(li->li_height);
915 else {
916 bix = seek_holedata_loop(inode, bix, 0);
917 if (bix < maxbix(li->li_height))
918 return bix;
919 /* Should not happen anymore. But if some port writes semi-
920 * corrupt images (as this one used to) we might run into it.
921 */
922 WARN_ON_ONCE(bix == maxbix(li->li_height));
923 }
924
925 return bix;
926}
927
928static u64 __logfs_seek_data(struct inode *inode, u64 bix)
929{
930 struct logfs_inode *li = logfs_inode(inode);
931
932 if (bix < I0_BLOCKS) {
933 bix = seek_holedata_direct(inode, bix, 1);
934 if (bix < I0_BLOCKS)
935 return bix;
936 }
937
938 if (bix < maxbix(li->li_height)) {
939 if (!li->li_data[INDIRECT_INDEX])
940 bix = maxbix(li->li_height);
941 else
942 return seek_holedata_loop(inode, bix, 1);
943 }
944
945 return bix;
946}
947
948/**
949 * logfs_seek_data - find next data block after a given block index
950 * @inode: inode to search in
951 * @bix: block index to start searching
952 *
953 * Returns next data block. If the file doesn't contain any further data
954 * blocks, the last block in the file is returned instead.
955 */
956u64 logfs_seek_data(struct inode *inode, u64 bix)
957{
958 struct super_block *sb = inode->i_sb;
959 u64 ret, end;
960
961 ret = __logfs_seek_data(inode, bix);
962 end = i_size_read(inode) >> sb->s_blocksize_bits;
963 if (ret >= end)
964 ret = max(bix, end);
965 return ret;
966}
967
968static int logfs_is_valid_direct(struct logfs_inode *li, u64 bix, u64 ofs)
969{
970 return pure_ofs(li->li_data[bix]) == ofs;
971}
972
973static int __logfs_is_valid_loop(struct inode *inode, u64 bix,
974 u64 ofs, u64 bofs)
975{
976 struct logfs_inode *li = logfs_inode(inode);
977 level_t level;
978 int ret;
979 struct page *page;
980
981 for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)){
982 page = logfs_get_write_page(inode, bix, level);
983 BUG_ON(!page);
984
985 ret = logfs_segment_read(inode, page, bofs, bix, level);
986 if (ret) {
987 logfs_put_write_page(page);
988 return 0;
989 }
990
991 bofs = block_get_pointer(page, get_bits(bix, SUBLEVEL(level)));
992 logfs_put_write_page(page);
993 if (!bofs)
994 return 0;
995
996 if (pure_ofs(bofs) == ofs)
997 return 1;
998 }
999 return 0;
1000}
1001
1002static int logfs_is_valid_loop(struct inode *inode, u64 bix, u64 ofs)
1003{
1004 struct logfs_inode *li = logfs_inode(inode);
1005 u64 bofs = li->li_data[INDIRECT_INDEX];
1006
1007 if (!bofs)
1008 return 0;
1009
1010 if (bix >= maxbix(li->li_height))
1011 return 0;
1012
1013 if (pure_ofs(bofs) == ofs)
1014 return 1;
1015
1016 return __logfs_is_valid_loop(inode, bix, ofs, bofs);
1017}
1018
1019static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs)
1020{
1021 struct logfs_inode *li = logfs_inode(inode);
1022
1023 if ((inode->i_nlink == 0) && atomic_read(&inode->i_count) == 1)
1024 return 0;
1025
1026 if (bix < I0_BLOCKS)
1027 return logfs_is_valid_direct(li, bix, ofs);
1028 return logfs_is_valid_loop(inode, bix, ofs);
1029}
1030
1031/**
1032 * logfs_is_valid_block - check whether this block is still valid
1033 *
1034 * @sb - superblock
1035 * @ofs - block physical offset
1036 * @ino - block inode number
1037 * @bix - block index
1038 * @level - block level
1039 *
1040 * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will
1041 * become invalid once the journal is written.
1042 */
1043int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix,
1044 gc_level_t gc_level)
1045{
1046 struct logfs_super *super = logfs_super(sb);
1047 struct inode *inode;
1048 int ret, cookie;
1049
1050 /* Umount closes a segment with free blocks remaining. Those
1051 * blocks are by definition invalid. */
1052 if (ino == -1)
1053 return 0;
1054
1055 LOGFS_BUG_ON((u64)(u_long)ino != ino, sb);
1056
1057 inode = logfs_safe_iget(sb, ino, &cookie);
1058 if (IS_ERR(inode))
1059 goto invalid;
1060
1061 ret = __logfs_is_valid_block(inode, bix, ofs);
1062 logfs_safe_iput(inode, cookie);
1063 if (ret)
1064 return ret;
1065
1066invalid:
1067 /* Block is nominally invalid, but may still sit in the shadow tree,
1068 * waiting for a journal commit.
1069 */
1070 if (btree_lookup64(&super->s_shadow_tree.old, ofs))
1071 return 2;
1072 return 0;
1073}
1074
1075int logfs_readpage_nolock(struct page *page)
1076{
1077 struct inode *inode = page->mapping->host;
1078 int ret = -EIO;
1079
1080 ret = logfs_read_block(inode, page, READ);
1081
1082 if (ret) {
1083 ClearPageUptodate(page);
1084 SetPageError(page);
1085 } else {
1086 SetPageUptodate(page);
1087 ClearPageError(page);
1088 }
1089 flush_dcache_page(page);
1090
1091 return ret;
1092}
1093
1094static int logfs_reserve_bytes(struct inode *inode, int bytes)
1095{
1096 struct logfs_super *super = logfs_super(inode->i_sb);
1097 u64 available = super->s_free_bytes + super->s_dirty_free_bytes
1098 - super->s_dirty_used_bytes - super->s_dirty_pages;
1099
1100 if (!bytes)
1101 return 0;
1102
1103 if (available < bytes)
1104 return -ENOSPC;
1105
1106 if (available < bytes + super->s_root_reserve &&
1107 !capable(CAP_SYS_RESOURCE))
1108 return -ENOSPC;
1109
1110 return 0;
1111}
1112
1113int get_page_reserve(struct inode *inode, struct page *page)
1114{
1115 struct logfs_super *super = logfs_super(inode->i_sb);
1116 int ret;
1117
1118 if (logfs_block(page) && logfs_block(page)->reserved_bytes)
1119 return 0;
1120
1121 logfs_get_wblocks(inode->i_sb, page, WF_LOCK);
1122 ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE);
1123 if (!ret) {
1124 alloc_data_block(inode, page);
1125 logfs_block(page)->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE;
1126 super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE;
1127 }
1128 logfs_put_wblocks(inode->i_sb, page, WF_LOCK);
1129 return ret;
1130}
1131
1132/*
1133 * We are protected by write lock. Push victims up to superblock level
1134 * and release transaction when appropriate.
 *
 * Does nothing when @ta is NULL.  Otherwise the transaction pointer in the
 * inode's li_block is cleared and ta->state selects which of
 * s_victim_ino, s_rename_dir and s_rename_pos are recorded or reset in
 * the superblock.  The "_1" states record values (and insist that the
 * fields were clear before), the later states insist that the recorded
 * values match @ta and reset them.  The last state of each operation
 * (CREATE_2, UNLINK_2, CROSS_RENAME_2, TARGET_RENAME_3) also frees @ta.
 *
 * Only the master inode is supported; any other inode hits BUG().
1135 */
1136/* FIXME: This is currently called from the wrong spots. */
1137static void logfs_handle_transaction(struct inode *inode,
1138 struct logfs_transaction *ta)
1139{
1140 struct logfs_super *super = logfs_super(inode->i_sb);
1141
1142 if (!ta)
1143 return;
/* detach the transaction from the inode block before acting on it */
1144 logfs_inode(inode)->li_block->ta = NULL;
1145
1146 if (inode->i_ino != LOGFS_INO_MASTER) {
1147 BUG(); /* FIXME: Yes, this needs more thought */
1148 /* just remember the transaction until inode is written */
1149 //BUG_ON(logfs_inode(inode)->li_transaction);
1150 //logfs_inode(inode)->li_transaction = ta;
1151 return;
1152 }
1153
1154 switch (ta->state) {
1155 case CREATE_1: /* fall through */
1156 case UNLINK_1:
1157 BUG_ON(super->s_victim_ino);
1158 super->s_victim_ino = ta->ino;
1159 break;
1160 case CREATE_2: /* fall through */
1161 case UNLINK_2:
1162 BUG_ON(super->s_victim_ino != ta->ino);
1163 super->s_victim_ino = 0;
1164 /* transaction ends here - free it */
1165 kfree(ta);
1166 break;
1167 case CROSS_RENAME_1:
1168 BUG_ON(super->s_rename_dir);
1169 BUG_ON(super->s_rename_pos);
1170 super->s_rename_dir = ta->dir;
1171 super->s_rename_pos = ta->pos;
1172 break;
1173 case CROSS_RENAME_2:
1174 BUG_ON(super->s_rename_dir != ta->dir);
1175 BUG_ON(super->s_rename_pos != ta->pos);
1176 super->s_rename_dir = 0;
1177 super->s_rename_pos = 0;
/* transaction ends here - free it */
1178 kfree(ta);
1179 break;
1180 case TARGET_RENAME_1:
1181 BUG_ON(super->s_rename_dir);
1182 BUG_ON(super->s_rename_pos);
1183 BUG_ON(super->s_victim_ino);
1184 super->s_rename_dir = ta->dir;
1185 super->s_rename_pos = ta->pos;
1186 super->s_victim_ino = ta->ino;
1187 break;
1188 case TARGET_RENAME_2:
/* rename part is done; the victim stays recorded until TARGET_RENAME_3 */
1189 BUG_ON(super->s_rename_dir != ta->dir);
1190 BUG_ON(super->s_rename_pos != ta->pos);
1191 BUG_ON(super->s_victim_ino != ta->ino);
1192 super->s_rename_dir = 0;
1193 super->s_rename_pos = 0;
1194 break;
1195 case TARGET_RENAME_3:
1196 BUG_ON(super->s_rename_dir);
1197 BUG_ON(super->s_rename_pos);
1198 BUG_ON(super->s_victim_ino != ta->ino);
1199 super->s_victim_ino = 0;
/* transaction ends here - free it */
1200 kfree(ta);
1201 break;
1202 default:
1203 BUG();
1204 }
1205}
1206
1207/*
1208 * Not strictly a reservation, but rather a check that we still have enough
1209 * space to satisfy the write.
1210 */
1211static int logfs_reserve_blocks(struct inode *inode, int blocks)
1212{
1213 return logfs_reserve_bytes(inode, blocks * LOGFS_MAX_OBJECTSIZE);
1214}
1215
/*
 * State handed down (and back up) the write path for one tree level.
 */
1216struct write_control {
1217 u64 ofs; /* in: current pointer for the block, out: pointer after the write */
1218 long flags; /* WF_* flags, e.g. WF_WRITE and WF_DELETE */
1219};
1220
1221static struct logfs_shadow *alloc_shadow(struct inode *inode, u64 bix,
1222 level_t level, u64 old_ofs)
1223{
1224 struct logfs_super *super = logfs_super(inode->i_sb);
1225 struct logfs_shadow *shadow;
1226
1227 shadow = mempool_alloc(super->s_shadow_pool, GFP_NOFS);
1228 memset(shadow, 0, sizeof(*shadow));
1229 shadow->ino = inode->i_ino;
1230 shadow->bix = bix;
1231 shadow->gc_level = expand_level(inode->i_ino, level);
1232 shadow->old_ofs = old_ofs & ~LOGFS_FULLY_POPULATED;
1233 return shadow;
1234}
1235
1236static void free_shadow(struct inode *inode, struct logfs_shadow *shadow)
1237{
1238 struct logfs_super *super = logfs_super(inode->i_sb);
1239
1240 mempool_free(shadow, super->s_shadow_pool);
1241}
1242
1243/**
1244 * fill_shadow_tree - Propagate shadow tree changes due to a write
1245 * @inode: Inode owning the page
1246 * @page: Struct page that was written
1247 * @shadow: Shadow for the current write
1248 *
1249 * Writes in logfs can result in two semi-valid objects. The old object
1250 * is still valid as long as it can be reached by following pointers on
1251 * the medium. Only when writes propagate all the way up to the journal
1252 * has the new object safely replaced the old one.
1253 *
1254 * To handle this problem, a struct logfs_shadow is used to represent
1255 * every single write. It is attached to the indirect block, which is
1256 * marked dirty. When the indirect block is written, its shadows are
1257 * handed up to the next indirect block (or inode). Ultimately they
1258 * will reach the master inode and be freed upon journal commit.
1259 *
1260 * This function handles a single step in the propagation. It adds the
1261 * shadow for the current write to the tree, along with any shadows in
1262 * the page's tree, in case it was an indirect block. If a page is
1263 * written, the inode parameter is left NULL, if an inode is written,
1264 * the page parameter is left NULL.
 *
 * NOTE(review): the body below dereferences both @inode (inode->i_sb) and
 * @page (logfs_block(), PagePrivate()) unconditionally, so the NULL
 * convention described above does not match the code as written - confirm
 * against the callers.  The results of the btree_insert64() calls are not
 * checked either.
1265 */
1266static void fill_shadow_tree(struct inode *inode, struct page *page,
1267 struct logfs_shadow *shadow)
1268{
1269 struct logfs_super *super = logfs_super(inode->i_sb);
1270 struct logfs_block *block = logfs_block(page);
1271 struct shadow_tree *tree = &super->s_shadow_tree;
1272
/*
 * The page has been written: drop its pending aliases from the global
 * count, handle any attached transaction and free the block descriptor.
 */
1273 if (PagePrivate(page)) {
1274 if (block->alias_map)
1275 super->s_no_object_aliases -= bitmap_weight(
1276 block->alias_map, LOGFS_BLOCK_FACTOR);
1277 logfs_handle_transaction(inode, block->ta);
1278 block->ops->free_block(inode->i_sb, block);
1279 }
1280 if (shadow) {
/* shadows replacing an old object are keyed by old_ofs, fresh ones by new_ofs */
1281 if (shadow->old_ofs)
1282 btree_insert64(&tree->old, shadow->old_ofs, shadow,
1283 GFP_NOFS);
1284 else
1285 btree_insert64(&tree->new, shadow->new_ofs, shadow,
1286 GFP_NOFS);
1287
1288 super->s_dirty_used_bytes += shadow->new_len;
1289 super->s_dirty_free_bytes += shadow->old_len;
1290 }
1291}
1292
1293static void logfs_set_alias(struct super_block *sb, struct logfs_block *block,
1294 long child_no)
1295{
1296 struct logfs_super *super = logfs_super(sb);
1297
1298 if (block->inode && block->inode->i_ino == LOGFS_INO_MASTER) {
1299 /* Aliases in the master inode are pointless. */
1300 return;
1301 }
1302
1303 if (!test_bit(child_no, block->alias_map)) {
1304 set_bit(child_no, block->alias_map);
1305 super->s_no_object_aliases++;
1306 }
1307 list_move_tail(&block->alias_list, &super->s_object_alias);
1308}
1309
1310/*
1311 * Object aliases can and often do change the size and occupied space of a
1312 * file. So not only do we have to change the pointers, we also have to
1313 * change inode->i_size and li->li_used_bytes. Which is done by setting
1314 * another two object aliases for the inode itself.
1315 */
1316static void set_iused(struct inode *inode, struct logfs_shadow *shadow)
1317{
1318 struct logfs_inode *li = logfs_inode(inode);
1319
1320 if (shadow->new_len == shadow->old_len)
1321 return;
1322
1323 alloc_inode_block(inode);
1324 li->li_used_bytes += shadow->new_len - shadow->old_len;
1325 __logfs_set_blocks(inode);
1326 logfs_set_alias(inode->i_sb, li->li_block, INODE_USED_OFS);
1327 logfs_set_alias(inode->i_sb, li->li_block, INODE_SIZE_OFS);
1328}
1329
1330static int logfs_write_i0(struct inode *inode, struct page *page,
1331 struct write_control *wc)
1332{
1333 struct logfs_shadow *shadow;
1334 u64 bix;
1335 level_t level;
1336 int full, err = 0;
1337
1338 logfs_unpack_index(page->index, &bix, &level);
1339 if (wc->ofs == 0)
1340 if (logfs_reserve_blocks(inode, 1))
1341 return -ENOSPC;
1342
1343 shadow = alloc_shadow(inode, bix, level, wc->ofs);
1344 if (wc->flags & WF_WRITE)
1345 err = logfs_segment_write(inode, page, shadow);
1346 if (wc->flags & WF_DELETE)
1347 logfs_segment_delete(inode, shadow);
1348 if (err) {
1349 free_shadow(inode, shadow);
1350 return err;
1351 }
1352
1353 set_iused(inode, shadow);
1354 full = 1;
1355 if (level != 0) {
1356 alloc_indirect_block(inode, page, 0);
1357 full = logfs_block(page)->full == LOGFS_BLOCK_FACTOR;
1358 }
1359 fill_shadow_tree(inode, page, shadow);
1360 wc->ofs = shadow->new_ofs;
1361 if (wc->ofs && full)
1362 wc->ofs |= LOGFS_FULLY_POPULATED;
1363 return 0;
1364}
1365
1366static int logfs_write_direct(struct inode *inode, struct page *page,
1367 long flags)
1368{
1369 struct logfs_inode *li = logfs_inode(inode);
1370 struct write_control wc = {
1371 .ofs = li->li_data[page->index],
1372 .flags = flags,
1373 };
1374 int err;
1375
1376 alloc_inode_block(inode);
1377
1378 err = logfs_write_i0(inode, page, &wc);
1379 if (err)
1380 return err;
1381
1382 li->li_data[page->index] = wc.ofs;
1383 logfs_set_alias(inode->i_sb, li->li_block,
1384 page->index + INODE_POINTER_OFS);
1385 return 0;
1386}
1387
1388static int ptr_change(u64 ofs, struct page *page)
1389{
1390 struct logfs_block *block = logfs_block(page);
1391 int empty0, empty1, full0, full1;
1392
1393 empty0 = ofs == 0;
1394 empty1 = block->partial == 0;
1395 if (empty0 != empty1)
1396 return 1;
1397
1398 /* The !! is necessary to shrink result to int */
1399 full0 = !!(ofs & LOGFS_FULLY_POPULATED);
1400 full1 = block->full == LOGFS_BLOCK_FACTOR;
1401 if (full0 != full1)
1402 return 1;
1403 return 0;
1404}
1405
/*
 * Recursive step of the indirect-block write path.
 *
 * Gets the indirect page for (@bix, @level).  If this_wc->ofs is non-zero
 * the page is read from there; otherwise a page that is not uptodate is
 * zero-filled and remembered as empty.  The pointer for @bix is taken from
 * the page and the write continues one level down: by recursion while
 * level-1 is still above @target_level, else by logfs_write_i0() on @page.
 *
 * On success the child's resulting offset is stored in the indirect page.
 * The indirect page is then either written itself through
 * logfs_write_i0() (updating this_wc->ofs) or only gets an alias set for
 * the changed slot.
 *
 * Returns 0 or a negative error.  The indirect page is released on every
 * path once it has been obtained.
 */
1406static int __logfs_write_rec(struct inode *inode, struct page *page,
1407 struct write_control *this_wc,
1408 pgoff_t bix, level_t target_level, level_t level)
1409{
1410 int ret, page_empty = 0;
1411 int child_no = get_bits(bix, SUBLEVEL(level));
1412 struct page *ipage;
1413 struct write_control child_wc = {
1414 .flags = this_wc->flags,
1415 };
1416
1417 ipage = logfs_get_write_page(inode, bix, level);
1418 if (!ipage)
1419 return -ENOMEM;
1420
1421 if (this_wc->ofs) {
1422 ret = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level);
1423 if (ret)
1424 goto out;
1425 } else if (!PageUptodate(ipage)) {
/* no block on the medium and nothing cached: start from an empty page */
1426 page_empty = 1;
1427 logfs_read_empty(ipage);
1428 }
1429
1430 child_wc.ofs = block_get_pointer(ipage, child_no);
1431
1432 if ((__force u8)level-1 > (__force u8)target_level)
1433 ret = __logfs_write_rec(inode, page, &child_wc, bix,
1434 target_level, SUBLEVEL(level));
1435 else
1436 ret = logfs_write_i0(inode, page, &child_wc);
1437
1438 if (ret)
1439 goto out;
1440
1441 alloc_indirect_block(inode, ipage, page_empty);
1442 block_set_pointer(ipage, child_no, child_wc.ofs);
1443 /* FIXME: first condition seems superfluous */
1444 if (child_wc.ofs || logfs_block(ipage)->partial)
1445 this_wc->flags |= WF_WRITE;
1446 /* the condition on this_wc->ofs ensures that we won't consume extra
1447 * space for indirect blocks in the future, which we cannot reserve */
1448 if (!this_wc->ofs || ptr_change(this_wc->ofs, ipage))
1449 ret = logfs_write_i0(inode, ipage, this_wc);
1450 else
1451 logfs_set_alias(inode->i_sb, logfs_block(ipage), child_no);
1452out:
1453 logfs_put_write_page(ipage);
1454 return ret;
1455}
1456
1457static int logfs_write_rec(struct inode *inode, struct page *page,
1458 pgoff_t bix, level_t target_level, long flags)
1459{
1460 struct logfs_inode *li = logfs_inode(inode);
1461 struct write_control wc = {
1462 .ofs = li->li_data[INDIRECT_INDEX],
1463 .flags = flags,
1464 };
1465 int ret;
1466
1467 alloc_inode_block(inode);
1468
1469 if (li->li_height > (__force u8)target_level)
1470 ret = __logfs_write_rec(inode, page, &wc, bix, target_level,
1471 LEVEL(li->li_height));
1472 else
1473 ret = logfs_write_i0(inode, page, &wc);
1474 if (ret)
1475 return ret;
1476
1477 if (li->li_data[INDIRECT_INDEX] != wc.ofs) {
1478 li->li_data[INDIRECT_INDEX] = wc.ofs;
1479 logfs_set_alias(inode->i_sb, li->li_block,
1480 INDIRECT_INDEX + INODE_POINTER_OFS);
1481 }
1482 return ret;
1483}
1484
1485void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta)
1486{
1487 alloc_inode_block(inode);
1488 logfs_inode(inode)->li_block->ta = ta;
1489}
1490
1491void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta)
1492{
1493 struct logfs_block *block = logfs_inode(inode)->li_block;
1494
1495 if (block && block->ta)
1496 block->ta = NULL;
1497}
1498
/*
 * Raise the height of @inode's indirect tree until it is at least
 * (u8)@level and @bix lies below maxbix(li_height).
 *
 * Each round writes a new indirect block that is empty except for slot 0,
 * which holds the previous root pointer, makes it the new
 * li_data[INDIRECT_INDEX], increments li_height and sets an alias for the
 * inode's height field.
 *
 * Returns 0, -ENOMEM if no page could be obtained, or the error from
 * logfs_write_i0().
 */
1499static int grow_inode(struct inode *inode, u64 bix, level_t level)
1500{
1501 struct logfs_inode *li = logfs_inode(inode);
1502 u8 height = (__force u8)level;
1503 struct page *page;
1504 struct write_control wc = {
1505 .flags = WF_WRITE,
1506 };
1507 int err;
1508
1509 BUG_ON(height > 5 || li->li_height > 5);
1510 while (height > li->li_height || bix >= maxbix(li->li_height)) {
1511 page = logfs_get_write_page(inode, I0_BLOCKS + 1,
1512 LEVEL(li->li_height + 1));
1513 if (!page)
1514 return -ENOMEM;
1515 logfs_read_empty(page);
1516 alloc_indirect_block(inode, page, 1);
/* the old root becomes the first child of the new root */
1517 block_set_pointer(page, 0, li->li_data[INDIRECT_INDEX]);
1518 err = logfs_write_i0(inode, page, &wc);
1519 logfs_put_write_page(page);
1520 if (err)
1521 return err;
1522 li->li_data[INDIRECT_INDEX] = wc.ofs;
/* the next round writes another brand-new block, so start from "no old offset" */
1523 wc.ofs = 0;
1524 li->li_height++;
1525 logfs_set_alias(inode->i_sb, li->li_block, INODE_HEIGHT_OFS);
1526 }
1527 return 0;
1528}
1529
1530static int __logfs_write_buf(struct inode *inode, struct page *page, long flags)
1531{
1532 struct logfs_super *super = logfs_super(inode->i_sb);
1533 pgoff_t index = page->index;
1534 u64 bix;
1535 level_t level;
1536 int err;
1537
1538 flags |= WF_WRITE | WF_DELETE;
1539 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1540
1541 logfs_unpack_index(index, &bix, &level);
1542 if (logfs_block(page) && logfs_block(page)->reserved_bytes)
1543 super->s_dirty_pages -= logfs_block(page)->reserved_bytes;
1544
1545 if (index < I0_BLOCKS)
1546 return logfs_write_direct(inode, page, flags);
1547
1548 bix = adjust_bix(bix, level);
1549 err = grow_inode(inode, bix, level);
1550 if (err)
1551 return err;
1552 return logfs_write_rec(inode, page, bix, level, flags);
1553}
1554
1555int logfs_write_buf(struct inode *inode, struct page *page, long flags)
1556{
1557 struct super_block *sb = inode->i_sb;
1558 int ret;
1559
1560 logfs_get_wblocks(sb, page, flags & WF_LOCK);
1561 ret = __logfs_write_buf(inode, page, flags);
1562 logfs_put_wblocks(sb, page, flags & WF_LOCK);
1563 return ret;
1564}
1565
1566static int __logfs_delete(struct inode *inode, struct page *page)
1567{
1568 long flags = WF_DELETE;
1569
1570 inode->i_ctime = inode->i_mtime = CURRENT_TIME;
1571
1572 if (page->index < I0_BLOCKS)
1573 return logfs_write_direct(inode, page, flags);
1574 return logfs_write_rec(inode, page, page->index, 0, flags);
1575}
1576
1577int logfs_delete(struct inode *inode, pgoff_t index,
1578 struct shadow_tree *shadow_tree)
1579{
1580 struct super_block *sb = inode->i_sb;
1581 struct page *page;
1582 int ret;
1583
1584 page = logfs_get_read_page(inode, index, 0);
1585 if (!page)
1586 return -ENOMEM;
1587
1588 logfs_get_wblocks(sb, page, 1);
1589 ret = __logfs_delete(inode, page);
1590 logfs_put_wblocks(sb, page, 1);
1591
1592 logfs_put_read_page(page);
1593
1594 return ret;
1595}
1596
1597/* Rewrite cannot mark the inode dirty but has to write it immediatly. */
1598int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
1599 gc_level_t gc_level, long flags)
1600{
1601 level_t level = shrink_level(gc_level);
1602 struct page *page;
1603 int err;
1604
1605 page = logfs_get_write_page(inode, bix, level);
1606 if (!page)
1607 return -ENOMEM;
1608
1609 err = logfs_segment_read(inode, page, ofs, bix, level);
1610 if (!err) {
1611 if (level != 0)
1612 alloc_indirect_block(inode, page, 0);
1613 err = logfs_write_buf(inode, page, flags);
1614 }
1615 logfs_put_write_page(page);
1616 return err;
1617}
1618
1619static int truncate_data_block(struct inode *inode, struct page *page,
1620 u64 ofs, struct logfs_shadow *shadow, u64 size)
1621{
1622 loff_t pageofs = page->index << inode->i_sb->s_blocksize_bits;
1623 u64 bix;
1624 level_t level;
1625 int err;
1626
1627 /* Does truncation happen within this page? */
1628 if (size <= pageofs || size - pageofs >= PAGE_SIZE)
1629 return 0;
1630
1631 logfs_unpack_index(page->index, &bix, &level);
1632 BUG_ON(level != 0);
1633
1634 err = logfs_segment_read(inode, page, ofs, bix, level);
1635 if (err)
1636 return err;
1637
1638 zero_user_segment(page, size - pageofs, PAGE_CACHE_SIZE);
1639 return logfs_segment_write(inode, page, shadow);
1640}
1641
1642static int logfs_truncate_i0(struct inode *inode, struct page *page,
1643 struct write_control *wc, u64 size)
1644{
1645 struct logfs_shadow *shadow;
1646 u64 bix;
1647 level_t level;
1648 int err = 0;
1649
1650 logfs_unpack_index(page->index, &bix, &level);
1651 BUG_ON(level != 0);
1652 shadow = alloc_shadow(inode, bix, level, wc->ofs);
1653
1654 err = truncate_data_block(inode, page, wc->ofs, shadow, size);
1655 if (err) {
1656 free_shadow(inode, shadow);
1657 return err;
1658 }
1659
1660 logfs_segment_delete(inode, shadow);
1661 set_iused(inode, shadow);
1662 fill_shadow_tree(inode, page, shadow);
1663 wc->ofs = shadow->new_ofs;
1664 return 0;
1665}
1666
1667static int logfs_truncate_direct(struct inode *inode, u64 size)
1668{
1669 struct logfs_inode *li = logfs_inode(inode);
1670 struct write_control wc;
1671 struct page *page;
1672 int e;
1673 int err;
1674
1675 alloc_inode_block(inode);
1676
1677 for (e = I0_BLOCKS - 1; e >= 0; e--) {
1678 if (size > (e+1) * LOGFS_BLOCKSIZE)
1679 break;
1680
1681 wc.ofs = li->li_data[e];
1682 if (!wc.ofs)
1683 continue;
1684
1685 page = logfs_get_write_page(inode, e, 0);
1686 if (!page)
1687 return -ENOMEM;
1688 err = logfs_segment_read(inode, page, wc.ofs, e, 0);
1689 if (err) {
1690 logfs_put_write_page(page);
1691 return err;
1692 }
1693 err = logfs_truncate_i0(inode, page, &wc, size);
1694 logfs_put_write_page(page);
1695 if (err)
1696 return err;
1697
1698 li->li_data[e] = wc.ofs;
1699 }
1700 return 0;
1701}
1702
1703/* FIXME: these need to become per-sb once we support different blocksizes */
1704static u64 __logfs_step[] = {
1705 1,
1706 I1_BLOCKS,
1707 I2_BLOCKS,
1708 I3_BLOCKS,
1709};
1710
1711static u64 __logfs_start_index[] = {
1712 I0_BLOCKS,
1713 I1_BLOCKS,
1714 I2_BLOCKS,
1715 I3_BLOCKS
1716};
1717
1718static inline u64 logfs_step(level_t level)
1719{
1720 return __logfs_step[(__force u8)level];
1721}
1722
1723static inline u64 logfs_factor(u8 level)
1724{
1725 return __logfs_step[level] * LOGFS_BLOCKSIZE;
1726}
1727
1728static inline u64 logfs_start_index(level_t level)
1729{
1730 return __logfs_start_index[(__force u8)level];
1731}
1732
1733static void logfs_unpack_raw_index(pgoff_t index, u64 *bix, level_t *level)
1734{
1735 logfs_unpack_index(index, bix, level);
1736 if (*bix <= logfs_start_index(SUBLEVEL(*level)))
1737 *bix = 0;
1738}
1739
1740static int __logfs_truncate_rec(struct inode *inode, struct page *ipage,
1741 struct write_control *this_wc, u64 size)
1742{
1743 int truncate_happened = 0;
1744 int e, err = 0;
1745 u64 bix, child_bix, next_bix;
1746 level_t level;
1747 struct page *page;
1748 struct write_control child_wc = { /* FIXME: flags */ };
1749
1750 logfs_unpack_raw_index(ipage->index, &bix, &level);
1751 err = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level);
1752 if (err)
1753 return err;
1754
1755 for (e = LOGFS_BLOCK_FACTOR - 1; e >= 0; e--) {
1756 child_bix = bix + e * logfs_step(SUBLEVEL(level));
1757 next_bix = child_bix + logfs_step(SUBLEVEL(level));
1758 if (size > next_bix * LOGFS_BLOCKSIZE)
1759 break;
1760
1761 child_wc.ofs = pure_ofs(block_get_pointer(ipage, e));
1762 if (!child_wc.ofs)
1763 continue;
1764
1765 page = logfs_get_write_page(inode, child_bix, SUBLEVEL(level));
1766 if (!page)
1767 return -ENOMEM;
1768
1769 if ((__force u8)level > 1)
1770 err = __logfs_truncate_rec(inode, page, &child_wc, size);
1771 else
1772 err = logfs_truncate_i0(inode, page, &child_wc, size);
1773 logfs_put_write_page(page);
1774 if (err)
1775 return err;
1776
1777 truncate_happened = 1;
1778 alloc_indirect_block(inode, ipage, 0);
1779 block_set_pointer(ipage, e, child_wc.ofs);
1780 }
1781
1782 if (!truncate_happened) {
1783 printk("ineffectual truncate (%lx, %lx, %llx)\n", inode->i_ino, ipage->index, size);
1784 return 0;
1785 }
1786
1787 this_wc->flags = WF_DELETE;
1788 if (logfs_block(ipage)->partial)
1789 this_wc->flags |= WF_WRITE;
1790
1791 return logfs_write_i0(inode, ipage, this_wc);
1792}
1793
1794static int logfs_truncate_rec(struct inode *inode, u64 size)
1795{
1796 struct logfs_inode *li = logfs_inode(inode);
1797 struct write_control wc = {
1798 .ofs = li->li_data[INDIRECT_INDEX],
1799 };
1800 struct page *page;
1801 int err;
1802
1803 alloc_inode_block(inode);
1804
1805 if (!wc.ofs)
1806 return 0;
1807
1808 page = logfs_get_write_page(inode, 0, LEVEL(li->li_height));
1809 if (!page)
1810 return -ENOMEM;
1811
1812 err = __logfs_truncate_rec(inode, page, &wc, size);
1813 logfs_put_write_page(page);
1814 if (err)
1815 return err;
1816
1817 if (li->li_data[INDIRECT_INDEX] != wc.ofs)
1818 li->li_data[INDIRECT_INDEX] = wc.ofs;
1819 return 0;
1820}
1821
1822static int __logfs_truncate(struct inode *inode, u64 size)
1823{
1824 int ret;
1825
1826 if (size >= logfs_factor(logfs_inode(inode)->li_height))
1827 return 0;
1828
1829 ret = logfs_truncate_rec(inode, size);
1830 if (ret)
1831 return ret;
1832
1833 return logfs_truncate_direct(inode, size);
1834}
1835
1836int logfs_truncate(struct inode *inode, u64 size)
1837{
1838 struct super_block *sb = inode->i_sb;
1839 int err;
1840
1841 logfs_get_wblocks(sb, NULL, 1);
1842 err = __logfs_truncate(inode, size);
1843 if (!err)
1844 err = __logfs_write_inode(inode, 0);
1845 logfs_put_wblocks(sb, NULL, 1);
1846
1847 if (!err)
1848 err = vmtruncate(inode, size);
1849
1850 /* I don't trust error recovery yet. */
1851 WARN_ON(err);
1852 return err;
1853}
1854
1855static void move_page_to_inode(struct inode *inode, struct page *page)
1856{
1857 struct logfs_inode *li = logfs_inode(inode);
1858 struct logfs_block *block = logfs_block(page);
1859
1860 if (!block)
1861 return;
1862
1863 log_blockmove("move_page_to_inode(%llx, %llx, %x)\n",
1864 block->ino, block->bix, block->level);
1865 BUG_ON(li->li_block);
1866 block->ops = &inode_block_ops;
1867 block->inode = inode;
1868 li->li_block = block;
1869
1870 block->page = NULL;
1871 page->private = 0;
1872 ClearPagePrivate(page);
1873}
1874
1875static void move_inode_to_page(struct page *page, struct inode *inode)
1876{
1877 struct logfs_inode *li = logfs_inode(inode);
1878 struct logfs_block *block = li->li_block;
1879
1880 if (!block)
1881 return;
1882
1883 log_blockmove("move_inode_to_page(%llx, %llx, %x)\n",
1884 block->ino, block->bix, block->level);
1885 BUG_ON(PagePrivate(page));
1886 block->ops = &indirect_block_ops;
1887 block->page = page;
1888 page->private = (unsigned long)block;
1889 SetPagePrivate(page);
1890
1891 block->inode = NULL;
1892 li->li_block = NULL;
1893}
1894
1895int logfs_read_inode(struct inode *inode)
1896{
1897 struct super_block *sb = inode->i_sb;
1898 struct logfs_super *super = logfs_super(sb);
1899 struct inode *master_inode = super->s_master_inode;
1900 struct page *page;
1901 struct logfs_disk_inode *di;
1902 u64 ino = inode->i_ino;
1903
1904 if (ino << sb->s_blocksize_bits > i_size_read(master_inode))
1905 return -ENODATA;
1906 if (!logfs_exist_block(master_inode, ino))
1907 return -ENODATA;
1908
1909 page = read_cache_page(master_inode->i_mapping, ino,
1910 (filler_t *)logfs_readpage, NULL);
1911 if (IS_ERR(page))
1912 return PTR_ERR(page);
1913
1914 di = kmap_atomic(page, KM_USER0);
1915 logfs_disk_to_inode(di, inode);
1916 kunmap_atomic(di, KM_USER0);
1917 move_page_to_inode(inode, page);
1918 page_cache_release(page);
1919 return 0;
1920}
1921
1922/* Caller must logfs_put_write_page(page); */
1923static struct page *inode_to_page(struct inode *inode)
1924{
1925 struct inode *master_inode = logfs_super(inode->i_sb)->s_master_inode;
1926 struct logfs_disk_inode *di;
1927 struct page *page;
1928
1929 BUG_ON(inode->i_ino == LOGFS_INO_MASTER);
1930
1931 page = logfs_get_write_page(master_inode, inode->i_ino, 0);
1932 if (!page)
1933 return NULL;
1934
1935 di = kmap_atomic(page, KM_USER0);
1936 logfs_inode_to_disk(inode, di);
1937 kunmap_atomic(di, KM_USER0);
1938 move_inode_to_page(page, inode);
1939 return page;
1940}
1941
1942/* Cheaper version of write_inode. All changes are concealed in
1943 * aliases, which are moved back. No write to the medium happens.
1944 */
1945void logfs_clear_inode(struct inode *inode)
1946{
1947 struct super_block *sb = inode->i_sb;
1948 struct logfs_inode *li = logfs_inode(inode);
1949 struct logfs_block *block = li->li_block;
1950 struct page *page;
1951
1952 /* Only deleted files may be dirty at this point */
1953 BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink);
1954 if (!block)
1955 return;
1956 if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) {
1957 block->ops->free_block(inode->i_sb, block);
1958 return;
1959 }
1960
1961 BUG_ON(inode->i_ino < LOGFS_RESERVED_INOS);
1962 page = inode_to_page(inode);
1963 BUG_ON(!page); /* FIXME: Use emergency page */
1964 logfs_put_write_page(page);
1965}
1966
1967static int do_write_inode(struct inode *inode)
1968{
1969 struct super_block *sb = inode->i_sb;
1970 struct inode *master_inode = logfs_super(sb)->s_master_inode;
1971 loff_t size = (inode->i_ino + 1) << inode->i_sb->s_blocksize_bits;
1972 struct page *page;
1973 int err;
1974
1975 BUG_ON(inode->i_ino == LOGFS_INO_MASTER);
1976 /* FIXME: lock inode */
1977
1978 if (i_size_read(master_inode) < size)
1979 i_size_write(master_inode, size);
1980
1981 /* TODO: Tell vfs this inode is clean now */
1982
1983 page = inode_to_page(inode);
1984 if (!page)
1985 return -ENOMEM;
1986
1987 /* FIXME: transaction is part of logfs_block now. Is that enough? */
1988 err = logfs_write_buf(master_inode, page, 0);
1989 logfs_put_write_page(page);
1990 return err;
1991}
1992
1993static void logfs_mod_segment_entry(struct super_block *sb, u32 segno,
1994 int write,
1995 void (*change_se)(struct logfs_segment_entry *, long),
1996 long arg)
1997{
1998 struct logfs_super *super = logfs_super(sb);
1999 struct inode *inode;
2000 struct page *page;
2001 struct logfs_segment_entry *se;
2002 pgoff_t page_no;
2003 int child_no;
2004
2005 page_no = segno >> (sb->s_blocksize_bits - 3);
2006 child_no = segno & ((sb->s_blocksize >> 3) - 1);
2007
2008 inode = super->s_segfile_inode;
2009 page = logfs_get_write_page(inode, page_no, 0);
2010 BUG_ON(!page); /* FIXME: We need some reserve page for this case */
2011 if (!PageUptodate(page))
2012 logfs_read_block(inode, page, WRITE);
2013
2014 if (write)
2015 alloc_indirect_block(inode, page, 0);
2016 se = kmap_atomic(page, KM_USER0);
2017 change_se(se + child_no, arg);
2018 if (write) {
2019 logfs_set_alias(sb, logfs_block(page), child_no);
2020 BUG_ON((int)be32_to_cpu(se[child_no].valid) > super->s_segsize);
2021 }
2022 kunmap_atomic(se, KM_USER0);
2023
2024 logfs_put_write_page(page);
2025}
2026
2027static void __get_segment_entry(struct logfs_segment_entry *se, long _target)
2028{
2029 struct logfs_segment_entry *target = (void *)_target;
2030
2031 *target = *se;
2032}
2033
2034void logfs_get_segment_entry(struct super_block *sb, u32 segno,
2035 struct logfs_segment_entry *se)
2036{
2037 logfs_mod_segment_entry(sb, segno, 0, __get_segment_entry, (long)se);
2038}
2039
2040static void __set_segment_used(struct logfs_segment_entry *se, long increment)
2041{
2042 u32 valid;
2043
2044 valid = be32_to_cpu(se->valid);
2045 valid += increment;
2046 se->valid = cpu_to_be32(valid);
2047}
2048
2049void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment)
2050{
2051 struct logfs_super *super = logfs_super(sb);
2052 u32 segno = ofs >> super->s_segshift;
2053
2054 if (!increment)
2055 return;
2056
2057 logfs_mod_segment_entry(sb, segno, 1, __set_segment_used, increment);
2058}
2059
2060static void __set_segment_erased(struct logfs_segment_entry *se, long ec_level)
2061{
2062 se->ec_level = cpu_to_be32(ec_level);
2063}
2064
2065void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec,
2066 gc_level_t gc_level)
2067{
2068 u32 ec_level = ec << 4 | (__force u8)gc_level;
2069
2070 logfs_mod_segment_entry(sb, segno, 1, __set_segment_erased, ec_level);
2071}
2072
2073static void __set_segment_reserved(struct logfs_segment_entry *se, long ignore)
2074{
2075 se->valid = cpu_to_be32(RESERVED);
2076}
2077
2078void logfs_set_segment_reserved(struct super_block *sb, u32 segno)
2079{
2080 logfs_mod_segment_entry(sb, segno, 1, __set_segment_reserved, 0);
2081}
2082
2083static void __set_segment_unreserved(struct logfs_segment_entry *se,
2084 long ec_level)
2085{
2086 se->valid = 0;
2087 se->ec_level = cpu_to_be32(ec_level);
2088}
2089
2090void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec)
2091{
2092 u32 ec_level = ec << 4;
2093
2094 logfs_mod_segment_entry(sb, segno, 1, __set_segment_unreserved,
2095 ec_level);
2096}
2097
2098int __logfs_write_inode(struct inode *inode, long flags)
2099{
2100 struct super_block *sb = inode->i_sb;
2101 int ret;
2102
2103 logfs_get_wblocks(sb, NULL, flags & WF_LOCK);
2104 ret = do_write_inode(inode);
2105 logfs_put_wblocks(sb, NULL, flags & WF_LOCK);
2106 return ret;
2107}
2108
2109static int do_delete_inode(struct inode *inode)
2110{
2111 struct super_block *sb = inode->i_sb;
2112 struct inode *master_inode = logfs_super(sb)->s_master_inode;
2113 struct page *page;
2114 int ret;
2115
2116 page = logfs_get_write_page(master_inode, inode->i_ino, 0);
2117 if (!page)
2118 return -ENOMEM;
2119
2120 move_inode_to_page(page, inode);
2121
2122 logfs_get_wblocks(sb, page, 1);
2123 ret = __logfs_delete(master_inode, page);
2124 logfs_put_wblocks(sb, page, 1);
2125
2126 logfs_put_write_page(page);
2127 return ret;
2128}
2129
2130/*
2131 * ZOMBIE inodes have already been deleted before and should remain dead,
2132 * if it weren't for valid checking. No need to kill them again here.
2133 */
2134void logfs_delete_inode(struct inode *inode)
2135{
2136 struct logfs_inode *li = logfs_inode(inode);
2137
2138 if (!(li->li_flags & LOGFS_IF_ZOMBIE)) {
2139 li->li_flags |= LOGFS_IF_ZOMBIE;
2140 if (i_size_read(inode) > 0)
2141 logfs_truncate(inode, 0);
2142 do_delete_inode(inode);
2143 }
2144 truncate_inode_pages(&inode->i_data, 0);
2145 clear_inode(inode);
2146}
2147
2148void btree_write_block(struct logfs_block *block)
2149{
2150 struct inode *inode;
2151 struct page *page;
2152 int err, cookie;
2153
2154 inode = logfs_safe_iget(block->sb, block->ino, &cookie);
2155 page = logfs_get_write_page(inode, block->bix, block->level);
2156
2157 err = logfs_readpage_nolock(page);
2158 BUG_ON(err);
2159 BUG_ON(!PagePrivate(page));
2160 BUG_ON(logfs_block(page) != block);
2161 err = __logfs_write_buf(inode, page, 0);
2162 BUG_ON(err);
2163 BUG_ON(PagePrivate(page) || page->private);
2164
2165 logfs_put_write_page(page);
2166 logfs_safe_iput(inode, cookie);
2167}
2168
2169/**
2170 * logfs_inode_write - write inode or dentry objects
2171 *
2172 * @inode: parent inode (ifile or directory)
2173 * @buf: object to write (inode or dentry)
2174 * @n: object size
2175 * @_pos: object number (file position in blocks/objects)
2176 * @flags: write flags
2177 * @lock: 0 if write lock is already taken, 1 otherwise
2178 * @shadow_tree: shadow below this inode
2179 *
2180 * FIXME: All caller of this put a 200-300 byte variable on the stack,
2181 * only to call here and do a memcpy from that stack variable. A good
2182 * example of wasted performance and stack space.
2183 */
2184int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
2185 loff_t bix, long flags, struct shadow_tree *shadow_tree)
2186{
2187 loff_t pos = bix << inode->i_sb->s_blocksize_bits;
2188 int err;
2189 struct page *page;
2190 void *pagebuf;
2191
2192 BUG_ON(pos & (LOGFS_BLOCKSIZE-1));
2193 BUG_ON(count > LOGFS_BLOCKSIZE);
2194 page = logfs_get_write_page(inode, bix, 0);
2195 if (!page)
2196 return -ENOMEM;
2197
2198 pagebuf = kmap_atomic(page, KM_USER0);
2199 memcpy(pagebuf, buf, count);
2200 flush_dcache_page(page);
2201 kunmap_atomic(pagebuf, KM_USER0);
2202
2203 if (i_size_read(inode) < pos + LOGFS_BLOCKSIZE)
2204 i_size_write(inode, pos + LOGFS_BLOCKSIZE);
2205
2206 err = logfs_write_buf(inode, page, flags);
2207 logfs_put_write_page(page);
2208 return err;
2209}
2210
2211int logfs_open_segfile(struct super_block *sb)
2212{
2213 struct logfs_super *super = logfs_super(sb);
2214 struct inode *inode;
2215
2216 inode = logfs_read_meta_inode(sb, LOGFS_INO_SEGFILE);
2217 if (IS_ERR(inode))
2218 return PTR_ERR(inode);
2219 super->s_segfile_inode = inode;
2220 return 0;
2221}
2222
2223int logfs_init_rw(struct super_block *sb)
2224{
2225 struct logfs_super *super = logfs_super(sb);
2226 int min_fill = 3 * super->s_no_blocks;
2227
2228 INIT_LIST_HEAD(&super->s_object_alias);
2229 mutex_init(&super->s_write_mutex);
2230 super->s_block_pool = mempool_create_kmalloc_pool(min_fill,
2231 sizeof(struct logfs_block));
2232 super->s_shadow_pool = mempool_create_kmalloc_pool(min_fill,
2233 sizeof(struct logfs_shadow));
2234 return 0;
2235}
2236
2237void logfs_cleanup_rw(struct super_block *sb)
2238{
2239 struct logfs_super *super = logfs_super(sb);
2240
2241 destroy_meta_inode(super->s_segfile_inode);
2242 if (super->s_block_pool)
2243 mempool_destroy(super->s_block_pool);
2244 if (super->s_shadow_pool)
2245 mempool_destroy(super->s_shadow_pool);
2246}
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
new file mode 100644
index 000000000000..1a14f9910d55
--- /dev/null
+++ b/fs/logfs/segment.c
@@ -0,0 +1,927 @@
1/*
2 * fs/logfs/segment.c - Handling the Object Store
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Object store or ostore makes up the complete device with exception of
9 * the superblock and journal areas. Apart from its own metadata it stores
10 * three kinds of objects: inodes, dentries and blocks, both data and indirect.
11 */
12#include "logfs.h"
13
14static int logfs_mark_segment_bad(struct super_block *sb, u32 segno)
15{
16 struct logfs_super *super = logfs_super(sb);
17 struct btree_head32 *head = &super->s_reserved_segments;
18 int err;
19
20 err = btree_insert32(head, segno, (void *)1, GFP_NOFS);
21 if (err)
22 return err;
23 logfs_super(sb)->s_bad_segments++;
24 /* FIXME: write to journal */
25 return 0;
26}
27
28int logfs_erase_segment(struct super_block *sb, u32 segno, int ensure_erase)
29{
30 struct logfs_super *super = logfs_super(sb);
31
32 super->s_gec++;
33
34 return super->s_devops->erase(sb, (u64)segno << super->s_segshift,
35 super->s_segsize, ensure_erase);
36}
37
38static s64 logfs_get_free_bytes(struct logfs_area *area, size_t bytes)
39{
40 s32 ofs;
41
42 logfs_open_area(area, bytes);
43
44 ofs = area->a_used_bytes;
45 area->a_used_bytes += bytes;
46 BUG_ON(area->a_used_bytes >= logfs_super(area->a_sb)->s_segsize);
47
48 return dev_ofs(area->a_sb, area->a_segno, ofs);
49}
50
51static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
52 int use_filler)
53{
54 struct logfs_super *super = logfs_super(sb);
55 struct address_space *mapping = super->s_mapping_inode->i_mapping;
56 filler_t *filler = super->s_devops->readpage;
57 struct page *page;
58
59 BUG_ON(mapping_gfp_mask(mapping) & __GFP_FS);
60 if (use_filler)
61 page = read_cache_page(mapping, index, filler, sb);
62 else {
63 page = find_or_create_page(mapping, index, GFP_NOFS);
64 unlock_page(page);
65 }
66 return page;
67}
68
69void __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
70 int use_filler)
71{
72 pgoff_t index = ofs >> PAGE_SHIFT;
73 struct page *page;
74 long offset = ofs & (PAGE_SIZE-1);
75 long copylen;
76
77 /* Only logfs_wbuf_recover may use len==0 */
78 BUG_ON(!len && !use_filler);
79 do {
80 copylen = min((ulong)len, PAGE_SIZE - offset);
81
82 page = get_mapping_page(area->a_sb, index, use_filler);
83 SetPageUptodate(page);
84 BUG_ON(!page); /* FIXME: reserve a pool */
85 memcpy(page_address(page) + offset, buf, copylen);
86 SetPagePrivate(page);
87 page_cache_release(page);
88
89 buf += copylen;
90 len -= copylen;
91 offset = 0;
92 index++;
93 } while (len);
94}
95
96/*
97 * bdev_writeseg will write full pages. Memset the tail to prevent data leaks.
98 */
99static void pad_wbuf(struct logfs_area *area, int final)
100{
101 struct super_block *sb = area->a_sb;
102 struct logfs_super *super = logfs_super(sb);
103 struct page *page;
104 u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
105 pgoff_t index = ofs >> PAGE_SHIFT;
106 long offset = ofs & (PAGE_SIZE-1);
107 u32 len = PAGE_SIZE - offset;
108
109 if (len == PAGE_SIZE) {
110 /* The math in this function can surely use some love */
111 len = 0;
112 }
113 if (len) {
114 BUG_ON(area->a_used_bytes >= super->s_segsize);
115
116 page = get_mapping_page(area->a_sb, index, 0);
117 BUG_ON(!page); /* FIXME: reserve a pool */
118 memset(page_address(page) + offset, 0xff, len);
119 SetPagePrivate(page);
120 page_cache_release(page);
121 }
122
123 if (!final)
124 return;
125
126 area->a_used_bytes += len;
127 for ( ; area->a_used_bytes < super->s_segsize;
128 area->a_used_bytes += PAGE_SIZE) {
129 /* Memset another page */
130 index++;
131 page = get_mapping_page(area->a_sb, index, 0);
132 BUG_ON(!page); /* FIXME: reserve a pool */
133 memset(page_address(page), 0xff, PAGE_SIZE);
134 SetPagePrivate(page);
135 page_cache_release(page);
136 }
137}
138
139/*
140 * We have to be careful with the alias tree. Since lookup is done by bix,
141 * it needs to be normalized, so 14, 15, 16, etc. all match when dealing with
142 * indirect blocks. So always use it through accessor functions.
143 */
144static void *alias_tree_lookup(struct super_block *sb, u64 ino, u64 bix,
145 level_t level)
146{
147 struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree;
148 pgoff_t index = logfs_pack_index(bix, level);
149
150 return btree_lookup128(head, ino, index);
151}
152
153static int alias_tree_insert(struct super_block *sb, u64 ino, u64 bix,
154 level_t level, void *val)
155{
156 struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree;
157 pgoff_t index = logfs_pack_index(bix, level);
158
159 return btree_insert128(head, ino, index, val, GFP_NOFS);
160}
161
162static int btree_write_alias(struct super_block *sb, struct logfs_block *block,
163 write_alias_t *write_one_alias)
164{
165 struct object_alias_item *item;
166 int err;
167
168 list_for_each_entry(item, &block->item_list, list) {
169 err = write_alias_journal(sb, block->ino, block->bix,
170 block->level, item->child_no, item->val);
171 if (err)
172 return err;
173 }
174 return 0;
175}
176
177static gc_level_t btree_block_level(struct logfs_block *block)
178{
179 return expand_level(block->ino, block->level);
180}
181
182static struct logfs_block_ops btree_block_ops = {
183 .write_block = btree_write_block,
184 .block_level = btree_block_level,
185 .free_block = __free_block,
186 .write_alias = btree_write_alias,
187};
188
189int logfs_load_object_aliases(struct super_block *sb,
190 struct logfs_obj_alias *oa, int count)
191{
192 struct logfs_super *super = logfs_super(sb);
193 struct logfs_block *block;
194 struct object_alias_item *item;
195 u64 ino, bix;
196 level_t level;
197 int i, err;
198
199 super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS;
200 count /= sizeof(*oa);
201 for (i = 0; i < count; i++) {
202 item = mempool_alloc(super->s_alias_pool, GFP_NOFS);
203 if (!item)
204 return -ENOMEM;
205 memset(item, 0, sizeof(*item));
206
207 super->s_no_object_aliases++;
208 item->val = oa[i].val;
209 item->child_no = be16_to_cpu(oa[i].child_no);
210
211 ino = be64_to_cpu(oa[i].ino);
212 bix = be64_to_cpu(oa[i].bix);
213 level = LEVEL(oa[i].level);
214
215 log_aliases("logfs_load_object_aliases(%llx, %llx, %x, %x) %llx\n",
216 ino, bix, level, item->child_no,
217 be64_to_cpu(item->val));
218 block = alias_tree_lookup(sb, ino, bix, level);
219 if (!block) {
220 block = __alloc_block(sb, ino, bix, level);
221 block->ops = &btree_block_ops;
222 err = alias_tree_insert(sb, ino, bix, level, block);
223 BUG_ON(err); /* mempool empty */
224 }
225 if (test_and_set_bit(item->child_no, block->alias_map)) {
226 printk(KERN_ERR"LogFS: Alias collision detected\n");
227 return -EIO;
228 }
229 list_move_tail(&block->alias_list, &super->s_object_alias);
230 list_add(&item->list, &block->item_list);
231 }
232 return 0;
233}
234
235static void kill_alias(void *_block, unsigned long ignore0,
236 u64 ignore1, u64 ignore2, size_t ignore3)
237{
238 struct logfs_block *block = _block;
239 struct super_block *sb = block->sb;
240 struct logfs_super *super = logfs_super(sb);
241 struct object_alias_item *item;
242
243 while (!list_empty(&block->item_list)) {
244 item = list_entry(block->item_list.next, typeof(*item), list);
245 list_del(&item->list);
246 mempool_free(item, super->s_alias_pool);
247 }
248 block->ops->free_block(sb, block);
249}
250
251static int obj_type(struct inode *inode, level_t level)
252{
253 if (level == 0) {
254 if (S_ISDIR(inode->i_mode))
255 return OBJ_DENTRY;
256 if (inode->i_ino == LOGFS_INO_MASTER)
257 return OBJ_INODE;
258 }
259 return OBJ_BLOCK;
260}
261
262static int obj_len(struct super_block *sb, int obj_type)
263{
264 switch (obj_type) {
265 case OBJ_DENTRY:
266 return sizeof(struct logfs_disk_dentry);
267 case OBJ_INODE:
268 return sizeof(struct logfs_disk_inode);
269 case OBJ_BLOCK:
270 return sb->s_blocksize;
271 default:
272 BUG();
273 }
274}
275
276static int __logfs_segment_write(struct inode *inode, void *buf,
277 struct logfs_shadow *shadow, int type, int len, int compr)
278{
279 struct logfs_area *area;
280 struct super_block *sb = inode->i_sb;
281 s64 ofs;
282 struct logfs_object_header h;
283 int acc_len;
284
285 if (shadow->gc_level == 0)
286 acc_len = len;
287 else
288 acc_len = obj_len(sb, type);
289
290 area = get_area(sb, shadow->gc_level);
291 ofs = logfs_get_free_bytes(area, len + LOGFS_OBJECT_HEADERSIZE);
292 LOGFS_BUG_ON(ofs <= 0, sb);
293 /*
294 * Order is important. logfs_get_free_bytes(), by modifying the
295 * segment file, may modify the content of the very page we're about
296 * to write now. Which is fine, as long as the calculated crc and
297 * written data still match. So do the modifications _before_
298 * calculating the crc.
299 */
300
301 h.len = cpu_to_be16(len);
302 h.type = type;
303 h.compr = compr;
304 h.ino = cpu_to_be64(inode->i_ino);
305 h.bix = cpu_to_be64(shadow->bix);
306 h.crc = logfs_crc32(&h, sizeof(h) - 4, 4);
307 h.data_crc = logfs_crc32(buf, len, 0);
308
309 logfs_buf_write(area, ofs, &h, sizeof(h));
310 logfs_buf_write(area, ofs + LOGFS_OBJECT_HEADERSIZE, buf, len);
311
312 shadow->new_ofs = ofs;
313 shadow->new_len = acc_len + LOGFS_OBJECT_HEADERSIZE;
314
315 return 0;
316}
317
318static s64 logfs_segment_write_compress(struct inode *inode, void *buf,
319 struct logfs_shadow *shadow, int type, int len)
320{
321 struct super_block *sb = inode->i_sb;
322 void *compressor_buf = logfs_super(sb)->s_compressed_je;
323 ssize_t compr_len;
324 int ret;
325
326 mutex_lock(&logfs_super(sb)->s_journal_mutex);
327 compr_len = logfs_compress(buf, compressor_buf, len, len);
328
329 if (compr_len >= 0) {
330 ret = __logfs_segment_write(inode, compressor_buf, shadow,
331 type, compr_len, COMPR_ZLIB);
332 } else {
333 ret = __logfs_segment_write(inode, buf, shadow, type, len,
334 COMPR_NONE);
335 }
336 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
337 return ret;
338}
339
340/**
341 * logfs_segment_write - write data block to object store
342 * @inode: inode containing data
343 *
344 * Returns an errno or zero.
345 */
346int logfs_segment_write(struct inode *inode, struct page *page,
347 struct logfs_shadow *shadow)
348{
349 struct super_block *sb = inode->i_sb;
350 struct logfs_super *super = logfs_super(sb);
351 int do_compress, type, len;
352 int ret;
353 void *buf;
354
355 super->s_flags |= LOGFS_SB_FLAG_DIRTY;
356 BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
357 do_compress = logfs_inode(inode)->li_flags & LOGFS_IF_COMPRESSED;
358 if (shadow->gc_level != 0) {
359 /* temporarily disable compression for indirect blocks */
360 do_compress = 0;
361 }
362
363 type = obj_type(inode, shrink_level(shadow->gc_level));
364 len = obj_len(sb, type);
365 buf = kmap(page);
366 if (do_compress)
367 ret = logfs_segment_write_compress(inode, buf, shadow, type,
368 len);
369 else
370 ret = __logfs_segment_write(inode, buf, shadow, type, len,
371 COMPR_NONE);
372 kunmap(page);
373
374 log_segment("logfs_segment_write(%llx, %llx, %x) %llx->%llx %x->%x\n",
375 shadow->ino, shadow->bix, shadow->gc_level,
376 shadow->old_ofs, shadow->new_ofs,
377 shadow->old_len, shadow->new_len);
378 /* this BUG_ON did catch a locking bug. useful */
379 BUG_ON(!(shadow->new_ofs & (super->s_segsize - 1)));
380 return ret;
381}
382
383int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf)
384{
385 pgoff_t index = ofs >> PAGE_SHIFT;
386 struct page *page;
387 long offset = ofs & (PAGE_SIZE-1);
388 long copylen;
389
390 while (len) {
391 copylen = min((ulong)len, PAGE_SIZE - offset);
392
393 page = get_mapping_page(sb, index, 1);
394 if (IS_ERR(page))
395 return PTR_ERR(page);
396 memcpy(buf, page_address(page) + offset, copylen);
397 page_cache_release(page);
398
399 buf += copylen;
400 len -= copylen;
401 offset = 0;
402 index++;
403 }
404 return 0;
405}
406
407/*
408 * The "position" of indirect blocks is ambiguous. It can be the position
409 * of any data block somewhere behind this indirect block. So we need to
410 * normalize the positions through logfs_block_mask() before comparing.
411 */
412static int check_pos(struct super_block *sb, u64 pos1, u64 pos2, level_t level)
413{
414 return (pos1 & logfs_block_mask(sb, level)) !=
415 (pos2 & logfs_block_mask(sb, level));
416}
417
#if 0
/*
 * Currently compiled out.  Read the segment header at @ofs into @sh and
 * verify its crc.  Returns 0, the wbuf_read() error, or -EIO on a crc
 * mismatch.
 */
static int read_seg_header(struct super_block *sb, u64 ofs,
		struct logfs_segment_header *sh)
{
	__be32 crc;
	int err = wbuf_read(sb, ofs, sizeof(*sh), sh);

	if (err)
		return err;

	crc = logfs_crc32(sh, sizeof(*sh), 4);
	if (crc == sh->crc)
		return 0;

	printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, "
			"got %x\n", ofs, be32_to_cpu(sh->crc),
			be32_to_cpu(crc));
	return -EIO;
}
#endif
438
439static int read_obj_header(struct super_block *sb, u64 ofs,
440 struct logfs_object_header *oh)
441{
442 __be32 crc;
443 int err;
444
445 err = wbuf_read(sb, ofs, sizeof(*oh), oh);
446 if (err)
447 return err;
448 crc = logfs_crc32(oh, sizeof(*oh) - 4, 4);
449 if (crc != oh->crc) {
450 printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, "
451 "got %x\n", ofs, be32_to_cpu(oh->crc),
452 be32_to_cpu(crc));
453 return -EIO;
454 }
455 return 0;
456}
457
458static void move_btree_to_page(struct inode *inode, struct page *page,
459 __be64 *data)
460{
461 struct super_block *sb = inode->i_sb;
462 struct logfs_super *super = logfs_super(sb);
463 struct btree_head128 *head = &super->s_object_alias_tree;
464 struct logfs_block *block;
465 struct object_alias_item *item, *next;
466
467 if (!(super->s_flags & LOGFS_SB_FLAG_OBJ_ALIAS))
468 return;
469
470 block = btree_remove128(head, inode->i_ino, page->index);
471 if (!block)
472 return;
473
474 log_blockmove("move_btree_to_page(%llx, %llx, %x)\n",
475 block->ino, block->bix, block->level);
476 list_for_each_entry_safe(item, next, &block->item_list, list) {
477 data[item->child_no] = item->val;
478 list_del(&item->list);
479 mempool_free(item, super->s_alias_pool);
480 }
481 block->page = page;
482 SetPagePrivate(page);
483 page->private = (unsigned long)block;
484 block->ops = &indirect_block_ops;
485 initialize_block_counters(page, block, data, 0);
486}
487
/*
 * Thin wrapper around find_next_bit().  It exists only to silence a false
 * but annoying gcc warning that otherwise points into bitops.h on every
 * recompile of this file.
 * TODO: Complain to gcc folks about this and upgrade compiler.
 */
static unsigned long fnb(const unsigned long *addr,
		unsigned long size, unsigned long offset)
{
	unsigned long bit = find_next_bit(addr, size, offset);

	return bit;
}
498
499void move_page_to_btree(struct page *page)
500{
501 struct logfs_block *block = logfs_block(page);
502 struct super_block *sb = block->sb;
503 struct logfs_super *super = logfs_super(sb);
504 struct object_alias_item *item;
505 unsigned long pos;
506 __be64 *child;
507 int err;
508
509 if (super->s_flags & LOGFS_SB_FLAG_SHUTDOWN) {
510 block->ops->free_block(sb, block);
511 return;
512 }
513 log_blockmove("move_page_to_btree(%llx, %llx, %x)\n",
514 block->ino, block->bix, block->level);
515 super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS;
516
517 for (pos = 0; ; pos++) {
518 pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
519 if (pos >= LOGFS_BLOCK_FACTOR)
520 break;
521
522 item = mempool_alloc(super->s_alias_pool, GFP_NOFS);
523 BUG_ON(!item); /* mempool empty */
524 memset(item, 0, sizeof(*item));
525
526 child = kmap_atomic(page, KM_USER0);
527 item->val = child[pos];
528 kunmap_atomic(child, KM_USER0);
529 item->child_no = pos;
530 list_add(&item->list, &block->item_list);
531 }
532 block->page = NULL;
533 ClearPagePrivate(page);
534 page->private = 0;
535 block->ops = &btree_block_ops;
536 err = alias_tree_insert(block->sb, block->ino, block->bix, block->level,
537 block);
538 BUG_ON(err); /* mempool empty */
539 ClearPageUptodate(page);
540}
541
542static int __logfs_segment_read(struct inode *inode, void *buf,
543 u64 ofs, u64 bix, level_t level)
544{
545 struct super_block *sb = inode->i_sb;
546 void *compressor_buf = logfs_super(sb)->s_compressed_je;
547 struct logfs_object_header oh;
548 __be32 crc;
549 u16 len;
550 int err, block_len;
551
552 block_len = obj_len(sb, obj_type(inode, level));
553 err = read_obj_header(sb, ofs, &oh);
554 if (err)
555 goto out_err;
556
557 err = -EIO;
558 if (be64_to_cpu(oh.ino) != inode->i_ino
559 || check_pos(sb, be64_to_cpu(oh.bix), bix, level)) {
560 printk(KERN_ERR"LOGFS: (ino, bix) don't match at %llx: "
561 "expected (%lx, %llx), got (%llx, %llx)\n",
562 ofs, inode->i_ino, bix,
563 be64_to_cpu(oh.ino), be64_to_cpu(oh.bix));
564 goto out_err;
565 }
566
567 len = be16_to_cpu(oh.len);
568
569 switch (oh.compr) {
570 case COMPR_NONE:
571 err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, buf);
572 if (err)
573 goto out_err;
574 crc = logfs_crc32(buf, len, 0);
575 if (crc != oh.data_crc) {
576 printk(KERN_ERR"LOGFS: uncompressed data crc error at "
577 "%llx: expected %x, got %x\n", ofs,
578 be32_to_cpu(oh.data_crc),
579 be32_to_cpu(crc));
580 goto out_err;
581 }
582 break;
583 case COMPR_ZLIB:
584 mutex_lock(&logfs_super(sb)->s_journal_mutex);
585 err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len,
586 compressor_buf);
587 if (err) {
588 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
589 goto out_err;
590 }
591 crc = logfs_crc32(compressor_buf, len, 0);
592 if (crc != oh.data_crc) {
593 printk(KERN_ERR"LOGFS: compressed data crc error at "
594 "%llx: expected %x, got %x\n", ofs,
595 be32_to_cpu(oh.data_crc),
596 be32_to_cpu(crc));
597 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
598 goto out_err;
599 }
600 err = logfs_uncompress(compressor_buf, buf, len, block_len);
601 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
602 if (err) {
603 printk(KERN_ERR"LOGFS: uncompress error at %llx\n", ofs);
604 goto out_err;
605 }
606 break;
607 default:
608 LOGFS_BUG(sb);
609 err = -EIO;
610 goto out_err;
611 }
612 return 0;
613
614out_err:
615 logfs_set_ro(sb);
616 printk(KERN_ERR"LOGFS: device is read-only now\n");
617 LOGFS_BUG(sb);
618 return err;
619}
620
621/**
622 * logfs_segment_read - read data block from object store
623 * @inode: inode containing data
624 * @buf: data buffer
625 * @ofs: physical data offset
626 * @bix: block index
627 * @level: block level
628 *
629 * Returns 0 on success or a negative errno.
630 */
631int logfs_segment_read(struct inode *inode, struct page *page,
632 u64 ofs, u64 bix, level_t level)
633{
634 int err;
635 void *buf;
636
637 if (PageUptodate(page))
638 return 0;
639
640 ofs &= ~LOGFS_FULLY_POPULATED;
641
642 buf = kmap(page);
643 err = __logfs_segment_read(inode, buf, ofs, bix, level);
644 if (!err) {
645 move_btree_to_page(inode, page, buf);
646 SetPageUptodate(page);
647 }
648 kunmap(page);
649 log_segment("logfs_segment_read(%lx, %llx, %x) %llx (%d)\n",
650 inode->i_ino, bix, level, ofs, err);
651 return err;
652}
653
654int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow)
655{
656 struct super_block *sb = inode->i_sb;
657 struct logfs_super *super = logfs_super(sb);
658 struct logfs_object_header h;
659 u16 len;
660 int err;
661
662 super->s_flags |= LOGFS_SB_FLAG_DIRTY;
663 BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
664 BUG_ON(shadow->old_ofs & LOGFS_FULLY_POPULATED);
665 if (!shadow->old_ofs)
666 return 0;
667
668 log_segment("logfs_segment_delete(%llx, %llx, %x) %llx->%llx %x->%x\n",
669 shadow->ino, shadow->bix, shadow->gc_level,
670 shadow->old_ofs, shadow->new_ofs,
671 shadow->old_len, shadow->new_len);
672 err = read_obj_header(sb, shadow->old_ofs, &h);
673 LOGFS_BUG_ON(err, sb);
674 LOGFS_BUG_ON(be64_to_cpu(h.ino) != inode->i_ino, sb);
675 LOGFS_BUG_ON(check_pos(sb, shadow->bix, be64_to_cpu(h.bix),
676 shrink_level(shadow->gc_level)), sb);
677
678 if (shadow->gc_level == 0)
679 len = be16_to_cpu(h.len);
680 else
681 len = obj_len(sb, h.type);
682 shadow->old_len = len + sizeof(h);
683 return 0;
684}
685
686static void freeseg(struct super_block *sb, u32 segno)
687{
688 struct logfs_super *super = logfs_super(sb);
689 struct address_space *mapping = super->s_mapping_inode->i_mapping;
690 struct page *page;
691 u64 ofs, start, end;
692
693 start = dev_ofs(sb, segno, 0);
694 end = dev_ofs(sb, segno + 1, 0);
695 for (ofs = start; ofs < end; ofs += PAGE_SIZE) {
696 page = find_get_page(mapping, ofs >> PAGE_SHIFT);
697 if (!page)
698 continue;
699 ClearPagePrivate(page);
700 page_cache_release(page);
701 }
702}
703
704int logfs_open_area(struct logfs_area *area, size_t bytes)
705{
706 struct super_block *sb = area->a_sb;
707 struct logfs_super *super = logfs_super(sb);
708 int err, closed = 0;
709
710 if (area->a_is_open && area->a_used_bytes + bytes <= super->s_segsize)
711 return 0;
712
713 if (area->a_is_open) {
714 u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
715 u32 len = super->s_segsize - area->a_written_bytes;
716
717 log_gc("logfs_close_area(%x)\n", area->a_segno);
718 pad_wbuf(area, 1);
719 super->s_devops->writeseg(area->a_sb, ofs, len);
720 freeseg(sb, area->a_segno);
721 closed = 1;
722 }
723
724 area->a_used_bytes = 0;
725 area->a_written_bytes = 0;
726again:
727 area->a_ops->get_free_segment(area);
728 area->a_ops->get_erase_count(area);
729
730 log_gc("logfs_open_area(%x, %x)\n", area->a_segno, area->a_level);
731 err = area->a_ops->erase_segment(area);
732 if (err) {
733 printk(KERN_WARNING "LogFS: Error erasing segment %x\n",
734 area->a_segno);
735 logfs_mark_segment_bad(sb, area->a_segno);
736 goto again;
737 }
738 area->a_is_open = 1;
739 return closed;
740}
741
742void logfs_sync_area(struct logfs_area *area)
743{
744 struct super_block *sb = area->a_sb;
745 struct logfs_super *super = logfs_super(sb);
746 u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
747 u32 len = (area->a_used_bytes - area->a_written_bytes);
748
749 if (super->s_writesize)
750 len &= ~(super->s_writesize - 1);
751 if (len == 0)
752 return;
753 pad_wbuf(area, 0);
754 super->s_devops->writeseg(sb, ofs, len);
755 area->a_written_bytes += len;
756}
757
758void logfs_sync_segments(struct super_block *sb)
759{
760 struct logfs_super *super = logfs_super(sb);
761 int i;
762
763 for_each_area(i)
764 logfs_sync_area(super->s_area[i]);
765}
766
767/*
768 * Pick a free segment to be used for this area. Effectively takes a
769 * candidate from the free list (not really a candidate anymore).
770 */
771static void ostore_get_free_segment(struct logfs_area *area)
772{
773 struct super_block *sb = area->a_sb;
774 struct logfs_super *super = logfs_super(sb);
775
776 if (super->s_free_list.count == 0) {
777 printk(KERN_ERR"LOGFS: ran out of free segments\n");
778 LOGFS_BUG(sb);
779 }
780
781 area->a_segno = get_best_cand(sb, &super->s_free_list, NULL);
782}
783
784static void ostore_get_erase_count(struct logfs_area *area)
785{
786 struct logfs_segment_entry se;
787 u32 ec_level;
788
789 logfs_get_segment_entry(area->a_sb, area->a_segno, &se);
790 BUG_ON(se.ec_level == cpu_to_be32(BADSEG) ||
791 se.valid == cpu_to_be32(RESERVED));
792
793 ec_level = be32_to_cpu(se.ec_level);
794 area->a_erase_count = (ec_level >> 4) + 1;
795}
796
797static int ostore_erase_segment(struct logfs_area *area)
798{
799 struct super_block *sb = area->a_sb;
800 struct logfs_segment_header sh;
801 u64 ofs;
802 int err;
803
804 err = logfs_erase_segment(sb, area->a_segno, 0);
805 if (err)
806 return err;
807
808 sh.pad = 0;
809 sh.type = SEG_OSTORE;
810 sh.level = (__force u8)area->a_level;
811 sh.segno = cpu_to_be32(area->a_segno);
812 sh.ec = cpu_to_be32(area->a_erase_count);
813 sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
814 sh.crc = logfs_crc32(&sh, sizeof(sh), 4);
815
816 logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count,
817 area->a_level);
818
819 ofs = dev_ofs(sb, area->a_segno, 0);
820 area->a_used_bytes = sizeof(sh);
821 logfs_buf_write(area, ofs, &sh, sizeof(sh));
822 return 0;
823}
824
825static const struct logfs_area_ops ostore_area_ops = {
826 .get_free_segment = ostore_get_free_segment,
827 .get_erase_count = ostore_get_erase_count,
828 .erase_segment = ostore_erase_segment,
829};
830
831static void free_area(struct logfs_area *area)
832{
833 if (area)
834 freeseg(area->a_sb, area->a_segno);
835 kfree(area);
836}
837
838static struct logfs_area *alloc_area(struct super_block *sb)
839{
840 struct logfs_area *area;
841
842 area = kzalloc(sizeof(*area), GFP_KERNEL);
843 if (!area)
844 return NULL;
845
846 area->a_sb = sb;
847 return area;
848}
849
/* Pages of the mapping inode must never be invalidated; doing so is a bug. */
static void map_invalidatepage(struct page *page, unsigned long l)
{
	BUG();
}
854
855static int map_releasepage(struct page *page, gfp_t g)
856{
857 /* Don't release these pages */
858 return 0;
859}
860
861static const struct address_space_operations mapping_aops = {
862 .invalidatepage = map_invalidatepage,
863 .releasepage = map_releasepage,
864 .set_page_dirty = __set_page_dirty_nobuffers,
865};
866
867int logfs_init_mapping(struct super_block *sb)
868{
869 struct logfs_super *super = logfs_super(sb);
870 struct address_space *mapping;
871 struct inode *inode;
872
873 inode = logfs_new_meta_inode(sb, LOGFS_INO_MAPPING);
874 if (IS_ERR(inode))
875 return PTR_ERR(inode);
876 super->s_mapping_inode = inode;
877 mapping = inode->i_mapping;
878 mapping->a_ops = &mapping_aops;
879 /* Would it be possible to use __GFP_HIGHMEM as well? */
880 mapping_set_gfp_mask(mapping, GFP_NOFS);
881 return 0;
882}
883
884int logfs_init_areas(struct super_block *sb)
885{
886 struct logfs_super *super = logfs_super(sb);
887 int i = -1;
888
889 super->s_alias_pool = mempool_create_kmalloc_pool(600,
890 sizeof(struct object_alias_item));
891 if (!super->s_alias_pool)
892 return -ENOMEM;
893
894 super->s_journal_area = alloc_area(sb);
895 if (!super->s_journal_area)
896 goto err;
897
898 for_each_area(i) {
899 super->s_area[i] = alloc_area(sb);
900 if (!super->s_area[i])
901 goto err;
902 super->s_area[i]->a_level = GC_LEVEL(i);
903 super->s_area[i]->a_ops = &ostore_area_ops;
904 }
905 btree_init_mempool128(&super->s_object_alias_tree,
906 super->s_btree_pool);
907 return 0;
908
909err:
910 for (i--; i >= 0; i--)
911 free_area(super->s_area[i]);
912 free_area(super->s_journal_area);
913 mempool_destroy(super->s_alias_pool);
914 return -ENOMEM;
915}
916
917void logfs_cleanup_areas(struct super_block *sb)
918{
919 struct logfs_super *super = logfs_super(sb);
920 int i;
921
922 btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias);
923 for_each_area(i)
924 free_area(super->s_area[i]);
925 free_area(super->s_journal_area);
926 destroy_meta_inode(super->s_mapping_inode);
927}
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
new file mode 100644
index 000000000000..c66beab78dee
--- /dev/null
+++ b/fs/logfs/super.c
@@ -0,0 +1,650 @@
1/*
2 * fs/logfs/super.c
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Generally contains mount/umount code and also serves as a dump area for
9 * any functions that don't fit elsewhere and neither justify a file of their
10 * own.
11 */
12#include "logfs.h"
13#include <linux/bio.h>
14#include <linux/mtd/mtd.h>
15#include <linux/statfs.h>
16#include <linux/buffer_head.h>
17
18static DEFINE_MUTEX(emergency_mutex);
19static struct page *emergency_page;
20
21struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index)
22{
23 filler_t *filler = (filler_t *)mapping->a_ops->readpage;
24 struct page *page;
25 int err;
26
27 page = read_cache_page(mapping, index, filler, NULL);
28 if (page)
29 return page;
30
31 /* No more pages available, switch to emergency page */
32 printk(KERN_INFO"Logfs: Using emergency page\n");
33 mutex_lock(&emergency_mutex);
34 err = filler(NULL, emergency_page);
35 if (err) {
36 mutex_unlock(&emergency_mutex);
37 printk(KERN_EMERG"Logfs: Error reading emergency page\n");
38 return ERR_PTR(err);
39 }
40 return emergency_page;
41}
42
43void emergency_read_end(struct page *page)
44{
45 if (page == emergency_page)
46 mutex_unlock(&emergency_mutex);
47 else
48 page_cache_release(page);
49}
50
51static void dump_segfile(struct super_block *sb)
52{
53 struct logfs_super *super = logfs_super(sb);
54 struct logfs_segment_entry se;
55 u32 segno;
56
57 for (segno = 0; segno < super->s_no_segs; segno++) {
58 logfs_get_segment_entry(sb, segno, &se);
59 printk("%3x: %6x %8x", segno, be32_to_cpu(se.ec_level),
60 be32_to_cpu(se.valid));
61 if (++segno < super->s_no_segs) {
62 logfs_get_segment_entry(sb, segno, &se);
63 printk(" %6x %8x", be32_to_cpu(se.ec_level),
64 be32_to_cpu(se.valid));
65 }
66 if (++segno < super->s_no_segs) {
67 logfs_get_segment_entry(sb, segno, &se);
68 printk(" %6x %8x", be32_to_cpu(se.ec_level),
69 be32_to_cpu(se.valid));
70 }
71 if (++segno < super->s_no_segs) {
72 logfs_get_segment_entry(sb, segno, &se);
73 printk(" %6x %8x", be32_to_cpu(se.ec_level),
74 be32_to_cpu(se.valid));
75 }
76 printk("\n");
77 }
78}
79
/*
 * logfs_crash_dump - dump debug information
 *
 * The LogFS superblock only occupies part of a segment, and the intent is
 * to write as much debug information as can be gathered into the spare
 * space.  As it stands, this function only prints the segment file through
 * dump_segfile().
 */
void logfs_crash_dump(struct super_block *sb)
{
	dump_segfile(sb);
}
90
/*
 * TODO: move to lib/string.c
 */
/**
 * memchr_inv - Find the first byte in an area of memory that differs from
 *	a given byte.
 * @s: The memory area
 * @c: The byte every position is compared against (converted to
 *	unsigned char)
 * @n: The size of the area.
 *
 * returns the address of the first character other than @c, or %NULL
 * if the whole buffer contains just @c.
 */
void *memchr_inv(const void *s, int c, size_t n)
{
	const unsigned char *bytes = s;
	const unsigned char want = (unsigned char)c;
	size_t i;

	for (i = 0; i < n; i++) {
		if (bytes[i] != want)
			return (void *)&bytes[i];
	}

	return NULL;
}
112
113/*
114 * FIXME: There should be a reserve for root, similar to ext2.
115 */
116int logfs_statfs(struct dentry *dentry, struct kstatfs *stats)
117{
118 struct super_block *sb = dentry->d_sb;
119 struct logfs_super *super = logfs_super(sb);
120
121 stats->f_type = LOGFS_MAGIC_U32;
122 stats->f_bsize = sb->s_blocksize;
123 stats->f_blocks = super->s_size >> LOGFS_BLOCK_BITS >> 3;
124 stats->f_bfree = super->s_free_bytes >> sb->s_blocksize_bits;
125 stats->f_bavail = super->s_free_bytes >> sb->s_blocksize_bits;
126 stats->f_files = 0;
127 stats->f_ffree = 0;
128 stats->f_namelen = LOGFS_MAX_NAMELEN;
129 return 0;
130}
131
132static int logfs_sb_set(struct super_block *sb, void *_super)
133{
134 struct logfs_super *super = _super;
135
136 sb->s_fs_info = super;
137 sb->s_mtd = super->s_mtd;
138 sb->s_bdev = super->s_bdev;
139 return 0;
140}
141
142static int logfs_sb_test(struct super_block *sb, void *_super)
143{
144 struct logfs_super *super = _super;
145 struct mtd_info *mtd = super->s_mtd;
146
147 if (mtd && sb->s_mtd == mtd)
148 return 1;
149 if (super->s_bdev && sb->s_bdev == super->s_bdev)
150 return 1;
151 return 0;
152}
153
154static void set_segment_header(struct logfs_segment_header *sh, u8 type,
155 u8 level, u32 segno, u32 ec)
156{
157 sh->pad = 0;
158 sh->type = type;
159 sh->level = level;
160 sh->segno = cpu_to_be32(segno);
161 sh->ec = cpu_to_be32(ec);
162 sh->gec = cpu_to_be64(segno);
163 sh->crc = logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4);
164}
165
166static void logfs_write_ds(struct super_block *sb, struct logfs_disk_super *ds,
167 u32 segno, u32 ec)
168{
169 struct logfs_super *super = logfs_super(sb);
170 struct logfs_segment_header *sh = &ds->ds_sh;
171 int i;
172
173 memset(ds, 0, sizeof(*ds));
174 set_segment_header(sh, SEG_SUPER, 0, segno, ec);
175
176 ds->ds_ifile_levels = super->s_ifile_levels;
177 ds->ds_iblock_levels = super->s_iblock_levels;
178 ds->ds_data_levels = super->s_data_levels; /* XXX: Remove */
179 ds->ds_segment_shift = super->s_segshift;
180 ds->ds_block_shift = sb->s_blocksize_bits;
181 ds->ds_write_shift = super->s_writeshift;
182 ds->ds_filesystem_size = cpu_to_be64(super->s_size);
183 ds->ds_segment_size = cpu_to_be32(super->s_segsize);
184 ds->ds_bad_seg_reserve = cpu_to_be32(super->s_bad_seg_reserve);
185 ds->ds_feature_incompat = cpu_to_be64(super->s_feature_incompat);
186 ds->ds_feature_ro_compat= cpu_to_be64(super->s_feature_ro_compat);
187 ds->ds_feature_compat = cpu_to_be64(super->s_feature_compat);
188 ds->ds_feature_flags = cpu_to_be64(super->s_feature_flags);
189 ds->ds_root_reserve = cpu_to_be64(super->s_root_reserve);
190 ds->ds_speed_reserve = cpu_to_be64(super->s_speed_reserve);
191 journal_for_each(i)
192 ds->ds_journal_seg[i] = cpu_to_be32(super->s_journal_seg[i]);
193 ds->ds_magic = cpu_to_be64(LOGFS_MAGIC);
194 ds->ds_crc = logfs_crc32(ds, sizeof(*ds),
195 LOGFS_SEGMENT_HEADERSIZE + 12);
196}
197
198static int write_one_sb(struct super_block *sb,
199 struct page *(*find_sb)(struct super_block *sb, u64 *ofs))
200{
201 struct logfs_super *super = logfs_super(sb);
202 struct logfs_disk_super *ds;
203 struct logfs_segment_entry se;
204 struct page *page;
205 u64 ofs;
206 u32 ec, segno;
207 int err;
208
209 page = find_sb(sb, &ofs);
210 if (!page)
211 return -EIO;
212 ds = page_address(page);
213 segno = seg_no(sb, ofs);
214 logfs_get_segment_entry(sb, segno, &se);
215 ec = be32_to_cpu(se.ec_level) >> 4;
216 ec++;
217 logfs_set_segment_erased(sb, segno, ec, 0);
218 logfs_write_ds(sb, ds, segno, ec);
219 err = super->s_devops->write_sb(sb, page);
220 page_cache_release(page);
221 return err;
222}
223
224int logfs_write_sb(struct super_block *sb)
225{
226 struct logfs_super *super = logfs_super(sb);
227 int err;
228
229 /* First superblock */
230 err = write_one_sb(sb, super->s_devops->find_first_sb);
231 if (err)
232 return err;
233
234 /* Last superblock */
235 err = write_one_sb(sb, super->s_devops->find_last_sb);
236 if (err)
237 return err;
238 return 0;
239}
240
241static int ds_cmp(const void *ds0, const void *ds1)
242{
243 size_t len = sizeof(struct logfs_disk_super);
244
245 /* We know the segment headers differ, so ignore them */
246 len -= LOGFS_SEGMENT_HEADERSIZE;
247 ds0 += LOGFS_SEGMENT_HEADERSIZE;
248 ds1 += LOGFS_SEGMENT_HEADERSIZE;
249 return memcmp(ds0, ds1, len);
250}
251
252static int logfs_recover_sb(struct super_block *sb)
253{
254 struct logfs_super *super = logfs_super(sb);
255 struct logfs_disk_super _ds0, *ds0 = &_ds0;
256 struct logfs_disk_super _ds1, *ds1 = &_ds1;
257 int err, valid0, valid1;
258
259 /* read first superblock */
260 err = wbuf_read(sb, super->s_sb_ofs[0], sizeof(*ds0), ds0);
261 if (err)
262 return err;
263 /* read last superblock */
264 err = wbuf_read(sb, super->s_sb_ofs[1], sizeof(*ds1), ds1);
265 if (err)
266 return err;
267 valid0 = logfs_check_ds(ds0) == 0;
268 valid1 = logfs_check_ds(ds1) == 0;
269
270 if (!valid0 && valid1) {
271 printk(KERN_INFO"First superblock is invalid - fixing.\n");
272 return write_one_sb(sb, super->s_devops->find_first_sb);
273 }
274 if (valid0 && !valid1) {
275 printk(KERN_INFO"Last superblock is invalid - fixing.\n");
276 return write_one_sb(sb, super->s_devops->find_last_sb);
277 }
278 if (valid0 && valid1 && ds_cmp(ds0, ds1)) {
279 printk(KERN_INFO"Superblocks don't match - fixing.\n");
280 return write_one_sb(sb, super->s_devops->find_last_sb);
281 }
282 /* If neither is valid now, something's wrong. Didn't we properly
283 * check them before?!? */
284 BUG_ON(!valid0 && !valid1);
285 return 0;
286}
287
/*
 * Steps needed before the filesystem may be written to: repair superblock
 * copies, check the areas, open the segment file, run one GC pass and
 * replay the journal.  Returns 0 or the first error encountered.
 */
static int logfs_make_writeable(struct super_block *sb)
{
	int err;

	/* Repair any broken superblock copies */
	err = logfs_recover_sb(sb);

	/* Check areas for trailing unaccounted data */
	if (!err)
		err = logfs_check_areas(sb);

	if (!err)
		err = logfs_open_segfile(sb);
	if (err)
		return err;

	/* Do one GC pass before any data gets dirtied */
	logfs_gc_pass(sb);

	/* after all initializations are done, replay the journal
	 * for rw-mounts, if necessary */
	return logfs_replay_journal(sb);
}
317
318static int logfs_get_sb_final(struct super_block *sb, struct vfsmount *mnt)
319{
320 struct logfs_super *super = logfs_super(sb);
321 struct inode *rootdir;
322 int err;
323
324 /* root dir */
325 rootdir = logfs_iget(sb, LOGFS_INO_ROOT);
326 if (IS_ERR(rootdir))
327 goto fail;
328
329 sb->s_root = d_alloc_root(rootdir);
330 if (!sb->s_root)
331 goto fail;
332
333 super->s_erase_page = alloc_pages(GFP_KERNEL, 0);
334 if (!super->s_erase_page)
335 goto fail2;
336 memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE);
337
338 /* FIXME: check for read-only mounts */
339 err = logfs_make_writeable(sb);
340 if (err)
341 goto fail3;
342
343 log_super("LogFS: Finished mounting\n");
344 simple_set_mnt(mnt, sb);
345 return 0;
346
347fail3:
348 __free_page(super->s_erase_page);
349fail2:
350 iput(rootdir);
351fail:
352 iput(logfs_super(sb)->s_master_inode);
353 return -EIO;
354}
355
356int logfs_check_ds(struct logfs_disk_super *ds)
357{
358 struct logfs_segment_header *sh = &ds->ds_sh;
359
360 if (ds->ds_magic != cpu_to_be64(LOGFS_MAGIC))
361 return -EINVAL;
362 if (sh->crc != logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4))
363 return -EINVAL;
364 if (ds->ds_crc != logfs_crc32(ds, sizeof(*ds),
365 LOGFS_SEGMENT_HEADERSIZE + 12))
366 return -EINVAL;
367 return 0;
368}
369
370static struct page *find_super_block(struct super_block *sb)
371{
372 struct logfs_super *super = logfs_super(sb);
373 struct page *first, *last;
374
375 first = super->s_devops->find_first_sb(sb, &super->s_sb_ofs[0]);
376 if (!first || IS_ERR(first))
377 return NULL;
378 last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]);
379 if (!last || IS_ERR(first)) {
380 page_cache_release(first);
381 return NULL;
382 }
383
384 if (!logfs_check_ds(page_address(first))) {
385 page_cache_release(last);
386 return first;
387 }
388
389 /* First one didn't work, try the second superblock */
390 if (!logfs_check_ds(page_address(last))) {
391 page_cache_release(first);
392 return last;
393 }
394
395 /* Neither worked, sorry folks */
396 page_cache_release(first);
397 page_cache_release(last);
398 return NULL;
399}
400
401static int __logfs_read_sb(struct super_block *sb)
402{
403 struct logfs_super *super = logfs_super(sb);
404 struct page *page;
405 struct logfs_disk_super *ds;
406 int i;
407
408 page = find_super_block(sb);
409 if (!page)
410 return -EIO;
411
412 ds = page_address(page);
413 super->s_size = be64_to_cpu(ds->ds_filesystem_size);
414 super->s_root_reserve = be64_to_cpu(ds->ds_root_reserve);
415 super->s_speed_reserve = be64_to_cpu(ds->ds_speed_reserve);
416 super->s_bad_seg_reserve = be32_to_cpu(ds->ds_bad_seg_reserve);
417 super->s_segsize = 1 << ds->ds_segment_shift;
418 super->s_segmask = (1 << ds->ds_segment_shift) - 1;
419 super->s_segshift = ds->ds_segment_shift;
420 sb->s_blocksize = 1 << ds->ds_block_shift;
421 sb->s_blocksize_bits = ds->ds_block_shift;
422 super->s_writesize = 1 << ds->ds_write_shift;
423 super->s_writeshift = ds->ds_write_shift;
424 super->s_no_segs = super->s_size >> super->s_segshift;
425 super->s_no_blocks = super->s_segsize >> sb->s_blocksize_bits;
426 super->s_feature_incompat = be64_to_cpu(ds->ds_feature_incompat);
427 super->s_feature_ro_compat = be64_to_cpu(ds->ds_feature_ro_compat);
428 super->s_feature_compat = be64_to_cpu(ds->ds_feature_compat);
429 super->s_feature_flags = be64_to_cpu(ds->ds_feature_flags);
430
431 journal_for_each(i)
432 super->s_journal_seg[i] = be32_to_cpu(ds->ds_journal_seg[i]);
433
434 super->s_ifile_levels = ds->ds_ifile_levels;
435 super->s_iblock_levels = ds->ds_iblock_levels;
436 super->s_data_levels = ds->ds_data_levels;
437 super->s_total_levels = super->s_ifile_levels + super->s_iblock_levels
438 + super->s_data_levels;
439 page_cache_release(page);
440 return 0;
441}
442
443static int logfs_read_sb(struct super_block *sb, int read_only)
444{
445 struct logfs_super *super = logfs_super(sb);
446 int ret;
447
448 super->s_btree_pool = mempool_create(32, btree_alloc, btree_free, NULL);
449 if (!super->s_btree_pool)
450 return -ENOMEM;
451
452 btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool);
453 btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool);
454
455 ret = logfs_init_mapping(sb);
456 if (ret)
457 return ret;
458
459 ret = __logfs_read_sb(sb);
460 if (ret)
461 return ret;
462
463 if (super->s_feature_incompat & ~LOGFS_FEATURES_INCOMPAT)
464 return -EIO;
465 if ((super->s_feature_ro_compat & ~LOGFS_FEATURES_RO_COMPAT) &&
466 !read_only)
467 return -EIO;
468
469 mutex_init(&super->s_dirop_mutex);
470 mutex_init(&super->s_object_alias_mutex);
471 INIT_LIST_HEAD(&super->s_freeing_list);
472
473 ret = logfs_init_rw(sb);
474 if (ret)
475 return ret;
476
477 ret = logfs_init_areas(sb);
478 if (ret)
479 return ret;
480
481 ret = logfs_init_gc(sb);
482 if (ret)
483 return ret;
484
485 ret = logfs_init_journal(sb);
486 if (ret)
487 return ret;
488
489 return 0;
490}
491
492static void logfs_kill_sb(struct super_block *sb)
493{
494 struct logfs_super *super = logfs_super(sb);
495
496 log_super("LogFS: Start unmounting\n");
497 /* Alias entries slow down mount, so evict as many as possible */
498 sync_filesystem(sb);
499 logfs_write_anchor(sb);
500
501 /*
502 * From this point on alias entries are simply dropped - and any
503 * writes to the object store are considered bugs.
504 */
505 super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN;
506 log_super("LogFS: Now in shutdown\n");
507 generic_shutdown_super(sb);
508
509 BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes);
510
511 logfs_cleanup_gc(sb);
512 logfs_cleanup_journal(sb);
513 logfs_cleanup_areas(sb);
514 logfs_cleanup_rw(sb);
515 if (super->s_erase_page)
516 __free_page(super->s_erase_page);
517 super->s_devops->put_device(sb);
518 mempool_destroy(super->s_btree_pool);
519 mempool_destroy(super->s_alias_pool);
520 kfree(super);
521 log_super("LogFS: Finished unmounting\n");
522}
523
524int logfs_get_sb_device(struct file_system_type *type, int flags,
525 struct mtd_info *mtd, struct block_device *bdev,
526 const struct logfs_device_ops *devops, struct vfsmount *mnt)
527{
528 struct logfs_super *super;
529 struct super_block *sb;
530 int err = -ENOMEM;
531 static int mount_count;
532
533 log_super("LogFS: Start mount %x\n", mount_count++);
534 super = kzalloc(sizeof(*super), GFP_KERNEL);
535 if (!super)
536 goto err0;
537
538 super->s_mtd = mtd;
539 super->s_bdev = bdev;
540 err = -EINVAL;
541 sb = sget(type, logfs_sb_test, logfs_sb_set, super);
542 if (IS_ERR(sb))
543 goto err0;
544
545 if (sb->s_root) {
546 /* Device is already in use */
547 err = 0;
548 simple_set_mnt(mnt, sb);
549 goto err0;
550 }
551
552 super->s_devops = devops;
553
554 /*
555 * sb->s_maxbytes is limited to 8TB. On 32bit systems, the page cache
556 * only covers 16TB and the upper 8TB are used for indirect blocks.
557 * On 64bit system we could bump up the limit, but that would make
558 * the filesystem incompatible with 32bit systems.
559 */
560 sb->s_maxbytes = (1ull << 43) - 1;
561 sb->s_op = &logfs_super_operations;
562 sb->s_flags = flags | MS_NOATIME;
563
564 err = logfs_read_sb(sb, sb->s_flags & MS_RDONLY);
565 if (err)
566 goto err1;
567
568 sb->s_flags |= MS_ACTIVE;
569 err = logfs_get_sb_final(sb, mnt);
570 if (err)
571 goto err1;
572 return 0;
573
574err1:
575 up_write(&sb->s_umount);
576 deactivate_super(sb);
577 return err;
578err0:
579 kfree(super);
580 //devops->put_device(sb);
581 return err;
582}
583
584static int logfs_get_sb(struct file_system_type *type, int flags,
585 const char *devname, void *data, struct vfsmount *mnt)
586{
587 ulong mtdnr;
588
589 if (!devname)
590 return logfs_get_sb_bdev(type, flags, devname, mnt);
591 if (strncmp(devname, "mtd", 3))
592 return logfs_get_sb_bdev(type, flags, devname, mnt);
593
594 {
595 char *garbage;
596 mtdnr = simple_strtoul(devname+3, &garbage, 0);
597 if (*garbage)
598 return -EINVAL;
599 }
600
601 return logfs_get_sb_mtd(type, flags, mtdnr, mnt);
602}
603
604static struct file_system_type logfs_fs_type = {
605 .owner = THIS_MODULE,
606 .name = "logfs",
607 .get_sb = logfs_get_sb,
608 .kill_sb = logfs_kill_sb,
609 .fs_flags = FS_REQUIRES_DEV,
610
611};
612
613static int __init logfs_init(void)
614{
615 int ret;
616
617 emergency_page = alloc_pages(GFP_KERNEL, 0);
618 if (!emergency_page)
619 return -ENOMEM;
620
621 ret = logfs_compr_init();
622 if (ret)
623 goto out1;
624
625 ret = logfs_init_inode_cache();
626 if (ret)
627 goto out2;
628
629 return register_filesystem(&logfs_fs_type);
630out2:
631 logfs_compr_exit();
632out1:
633 __free_pages(emergency_page, 0);
634 return ret;
635}
636
637static void __exit logfs_exit(void)
638{
639 unregister_filesystem(&logfs_fs_type);
640 logfs_destroy_inode_cache();
641 logfs_compr_exit();
642 __free_pages(emergency_page, 0);
643}
644
645module_init(logfs_init);
646module_exit(logfs_exit);
647
648MODULE_LICENSE("GPL v2");
649MODULE_AUTHOR("Joern Engel <joern@logfs.org>");
650MODULE_DESCRIPTION("scalable flash filesystem");
diff --git a/include/linux/btree-128.h b/include/linux/btree-128.h
new file mode 100644
index 000000000000..0b3414c4c928
--- /dev/null
+++ b/include/linux/btree-128.h
@@ -0,0 +1,109 @@
1extern struct btree_geo btree_geo128;
2
3struct btree_head128 { struct btree_head h; };
4
/*
 * Lifecycle wrappers for the 128-bit keyed btree.  Each one simply forwards
 * to the generic function on the embedded struct btree_head.
 */

/* Initialise @head to use the caller-provided @mempool. */
5static inline void btree_init_mempool128(struct btree_head128 *head,
6 mempool_t *mempool)
7{
8 btree_init_mempool(&head->h, mempool);
9}
10
/* Initialise @head; returns whatever btree_init() returns. */
11static inline int btree_init128(struct btree_head128 *head)
12{
13 return btree_init(&head->h);
14}
15
/* Counterpart of btree_init128(). */
16static inline void btree_destroy128(struct btree_head128 *head)
17{
18 btree_destroy(&head->h);
19}
20
21static inline void *btree_lookup128(struct btree_head128 *head, u64 k1, u64 k2)
22{
23 u64 key[2] = {k1, k2};
24 return btree_lookup(&head->h, &btree_geo128, (unsigned long *)&key);
25}
26
27static inline void *btree_get_prev128(struct btree_head128 *head,
28 u64 *k1, u64 *k2)
29{
30 u64 key[2] = {*k1, *k2};
31 void *val;
32
33 val = btree_get_prev(&head->h, &btree_geo128,
34 (unsigned long *)&key);
35 *k1 = key[0];
36 *k2 = key[1];
37 return val;
38}
39
/*
 * Modifying wrappers for the 128-bit keyed btree.  Each packs (k1, k2)
 * into a local u64[2] and passes it, cast to unsigned long *, to the
 * generic function together with btree_geo128.
 */

/* Insert @val under (k1, k2); returns the result of btree_insert(). */
40static inline int btree_insert128(struct btree_head128 *head, u64 k1, u64 k2,
41 void *val, gfp_t gfp)
42{
43 u64 key[2] = {k1, k2};
44 return btree_insert(&head->h, &btree_geo128,
45 (unsigned long *)&key, val, gfp);
46}
47
/* Replace the value stored under (k1, k2); returns the result of btree_update(). */
48static inline int btree_update128(struct btree_head128 *head, u64 k1, u64 k2,
49 void *val)
50{
51 u64 key[2] = {k1, k2};
52 return btree_update(&head->h, &btree_geo128,
53 (unsigned long *)&key, val);
54}
55
/* Remove the entry stored under (k1, k2); returns the result of btree_remove(). */
56static inline void *btree_remove128(struct btree_head128 *head, u64 k1, u64 k2)
57{
58 u64 key[2] = {k1, k2};
59 return btree_remove(&head->h, &btree_geo128, (unsigned long *)&key);
60}
61
62static inline void *btree_last128(struct btree_head128 *head, u64 *k1, u64 *k2)
63{
64 u64 key[2];
65 void *val;
66
67 val = btree_last(&head->h, &btree_geo128, (unsigned long *)&key[0]);
68 if (val) {
69 *k1 = key[0];
70 *k2 = key[1];
71 }
72
73 return val;
74}
75
/*
 * Merge @victim into @target using the 128-bit geometry; forwards to
 * btree_merge() and returns its result.
 */
76static inline int btree_merge128(struct btree_head128 *target,
77 struct btree_head128 *victim,
78 gfp_t gfp)
79{
80 return btree_merge(&target->h, &victim->h, &btree_geo128, gfp);
81}
82
/*
 * Trampoline handed to the generic visitor functions.  Only declared here;
 * presumably it unpacks __key into two u64 and calls __func as a
 * visitor128_t -- confirm against its definition.
 */
83void visitor128(void *elem, unsigned long opaque, unsigned long *__key,
84 size_t index, void *__func);
85
/* User callback type: receives the key as two u64 halves. */
86typedef void (*visitor128_t)(void *elem, unsigned long opaque,
87 u64 key1, u64 key2, size_t index);
88
/*
 * Run btree_visitor() over @head with visitor128 as the generic callback
 * and @func2 passed through as its opaque function argument.
 */
89static inline size_t btree_visitor128(struct btree_head128 *head,
90 unsigned long opaque,
91 visitor128_t func2)
92{
93 return btree_visitor(&head->h, &btree_geo128, opaque,
94 visitor128, func2);
95}
96
/* Same as btree_visitor128(), but built on btree_grim_visitor(). */
97static inline size_t btree_grim_visitor128(struct btree_head128 *head,
98 unsigned long opaque,
99 visitor128_t func2)
100{
101 return btree_grim_visitor(&head->h, &btree_geo128, opaque,
102 visitor128, func2);
103}
104
/*
 * Iterate over all entries, starting at btree_last128() and stepping with
 * btree_get_prev128() until NULL is returned.  @k1, @k2 and @val must be
 * lvalues: the macro takes the address of the keys and assigns to @val.
 */
105#define btree_for_each_safe128(head, k1, k2, val) \
106 for (val = btree_last128(head, &k1, &k2); \
107 val; \
108 val = btree_get_prev128(head, &k1, &k2))
109
diff --git a/include/linux/btree-type.h b/include/linux/btree-type.h
new file mode 100644
index 000000000000..9a1147ef8563
--- /dev/null
+++ b/include/linux/btree-type.h
@@ -0,0 +1,147 @@
/*
 * Template helpers.  This header is included several times; before each
 * inclusion the includer defines BTREE_TYPE_SUFFIX, BTREE_TYPE_BITS,
 * BTREE_TYPE_GEO and BTREE_KEYTYPE.  The macros below paste the suffix
 * onto identifiers (e.g. btree_lookup + 32 -> btree_lookup32).  The extra
 * _BTREE_TP level forces BTREE_TYPE_SUFFIX to be expanded before pasting.
 * All of these macros are #undef'ed again at the end of this header.
 */
1#define __BTREE_TP(pfx, type, sfx) pfx ## type ## sfx
2#define _BTREE_TP(pfx, type, sfx) __BTREE_TP(pfx, type, sfx)
3#define BTREE_TP(pfx) _BTREE_TP(pfx, BTREE_TYPE_SUFFIX,)
4#define BTREE_FN(name) BTREE_TP(btree_ ## name)
5#define BTREE_TYPE_HEAD BTREE_TP(struct btree_head)
6#define VISITOR_FN BTREE_TP(visitor)
7#define VISITOR_FN_T _BTREE_TP(visitor, BTREE_TYPE_SUFFIX, _t)
8
/* Typed head: a plain struct btree_head wrapped in a distinct struct type. */
9BTREE_TYPE_HEAD {
10 struct btree_head h;
11};
12
/* Initialise @head to use the caller-provided @mempool. */
13static inline void BTREE_FN(init_mempool)(BTREE_TYPE_HEAD *head,
14 mempool_t *mempool)
15{
16 btree_init_mempool(&head->h, mempool);
17}
18
/* Initialise @head; returns whatever btree_init() returns. */
19static inline int BTREE_FN(init)(BTREE_TYPE_HEAD *head)
20{
21 return btree_init(&head->h);
22}
23
/* Counterpart of the typed init function. */
24static inline void BTREE_FN(destroy)(BTREE_TYPE_HEAD *head)
25{
26 btree_destroy(&head->h);
27}
28
/* Merge @victim into @target using this type's geometry. */
29static inline int BTREE_FN(merge)(BTREE_TYPE_HEAD *target,
30 BTREE_TYPE_HEAD *victim,
31 gfp_t gfp)
32{
33 return btree_merge(&target->h, &victim->h, BTREE_TYPE_GEO, gfp);
34}
35
/*
 * Key-taking wrappers.  The generic functions take the key as a pointer to
 * unsigned long, so two variants exist:
 *
 *  - key type narrower than a long: the key is copied into a local
 *    unsigned long first, and for last/get_prev copied back only when an
 *    entry was found;
 *  - otherwise: the address of the key is cast and passed through directly.
 */
36#if (BITS_PER_LONG > BTREE_TYPE_BITS)
37static inline void *BTREE_FN(lookup)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key)
38{
39 unsigned long _key = key;
40 return btree_lookup(&head->h, BTREE_TYPE_GEO, &_key);
41}
42
43static inline int BTREE_FN(insert)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key,
44 void *val, gfp_t gfp)
45{
46 unsigned long _key = key;
47 return btree_insert(&head->h, BTREE_TYPE_GEO, &_key, val, gfp);
48}
49
50static inline int BTREE_FN(update)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key,
51 void *val)
52{
53 unsigned long _key = key;
54 return btree_update(&head->h, BTREE_TYPE_GEO, &_key, val);
55}
56
57static inline void *BTREE_FN(remove)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key)
58{
59 unsigned long _key = key;
60 return btree_remove(&head->h, BTREE_TYPE_GEO, &_key);
61}
62
63static inline void *BTREE_FN(last)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE *key)
64{
65 unsigned long _key;
66 void *val = btree_last(&head->h, BTREE_TYPE_GEO, &_key);
 /* _key is only valid when an entry was found */
67 if (val)
68 *key = _key;
69 return val;
70}
71
72static inline void *BTREE_FN(get_prev)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE *key)
73{
74 unsigned long _key = *key;
75 void *val = btree_get_prev(&head->h, BTREE_TYPE_GEO, &_key);
76 if (val)
77 *key = _key;
78 return val;
79}
80#else
/* Key is at least as wide as a long: hand its storage to the generic code. */
81static inline void *BTREE_FN(lookup)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key)
82{
83 return btree_lookup(&head->h, BTREE_TYPE_GEO, (unsigned long *)&key);
84}
85
86static inline int BTREE_FN(insert)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key,
87 void *val, gfp_t gfp)
88{
89 return btree_insert(&head->h, BTREE_TYPE_GEO, (unsigned long *)&key,
90 val, gfp);
91}
92
93static inline int BTREE_FN(update)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key,
94 void *val)
95{
96 return btree_update(&head->h, BTREE_TYPE_GEO, (unsigned long *)&key, val);
97}
98
99static inline void *BTREE_FN(remove)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE key)
100{
101 return btree_remove(&head->h, BTREE_TYPE_GEO, (unsigned long *)&key);
102}
103
104static inline void *BTREE_FN(last)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE *key)
105{
106 return btree_last(&head->h, BTREE_TYPE_GEO, (unsigned long *)key);
107}
108
109static inline void *BTREE_FN(get_prev)(BTREE_TYPE_HEAD *head, BTREE_KEYTYPE *key)
110{
111 return btree_get_prev(&head->h, BTREE_TYPE_GEO, (unsigned long *)key);
112}
113#endif
114
115void VISITOR_FN(void *elem, unsigned long opaque, unsigned long *key,
116 size_t index, void *__func);
117
118typedef void (*VISITOR_FN_T)(void *elem, unsigned long opaque,
119 BTREE_KEYTYPE key, size_t index);
120
121static inline size_t BTREE_FN(visitor)(BTREE_TYPE_HEAD *head,
122 unsigned long opaque,
123 VISITOR_FN_T func2)
124{
125 return btree_visitor(&head->h, BTREE_TYPE_GEO, opaque,
126 visitorl, func2);
127}
128
129static inline size_t BTREE_FN(grim_visitor)(BTREE_TYPE_HEAD *head,
130 unsigned long opaque,
131 VISITOR_FN_T func2)
132{
133 return btree_grim_visitor(&head->h, BTREE_TYPE_GEO, opaque,
134 visitorl, func2);
135}
136
137#undef VISITOR_FN
138#undef VISITOR_FN_T
139#undef __BTREE_TP
140#undef _BTREE_TP
141#undef BTREE_TP
142#undef BTREE_FN
143#undef BTREE_TYPE_HEAD
144#undef BTREE_TYPE_SUFFIX
145#undef BTREE_TYPE_GEO
146#undef BTREE_KEYTYPE
147#undef BTREE_TYPE_BITS
diff --git a/include/linux/btree.h b/include/linux/btree.h
new file mode 100644
index 000000000000..65b5bb058324
--- /dev/null
+++ b/include/linux/btree.h
@@ -0,0 +1,243 @@
1#ifndef BTREE_H
2#define BTREE_H
3
4#include <linux/kernel.h>
5#include <linux/mempool.h>
6
7/**
8 * DOC: B+Tree basics
9 *
10 * A B+Tree is a data structure for looking up arbitrary (currently allowing
11 * unsigned long, u32, u64 and 2 * u64) keys into pointers. The data structure
12 * is described at http://en.wikipedia.org/wiki/B-tree, we currently do not
13 * use binary search to find the key on lookups.
14 *
15 * Each B+Tree consists of a head, that contains bookkeeping information and
16 * a variable number (starting with zero) nodes. Each node contains the keys
17 * and pointers to sub-nodes, or, for leaf nodes, the keys and values for the
18 * tree entries.
19 *
20 * Each node in this implementation has the following layout:
21 * [key1, key2, ..., keyN] [val1, val2, ..., valN]
22 *
23 * Each key here is an array of unsigned longs, geo->no_longs in total. The
24 * number of keys and values (N) is geo->no_pairs.
25 */
26
27/**
28 * struct btree_head - btree head
29 *
30 * @node: the first node in the tree
31 * @mempool: mempool used for node allocations
32 * @height: current height of the tree
33 */
34struct btree_head {
35 unsigned long *node;
36 mempool_t *mempool;
37 int height;
38};
39
40/* btree geometry */
41struct btree_geo;
42
43/**
44 * btree_alloc - allocate function for the mempool
45 * @gfp_mask: gfp mask for the allocation
46 * @pool_data: unused
47 */
48void *btree_alloc(gfp_t gfp_mask, void *pool_data);
49
50/**
51 * btree_free - free function for the mempool
52 * @element: the element to free
53 * @pool_data: unused
54 */
55void btree_free(void *element, void *pool_data);
56
57/**
58 * btree_init_mempool - initialise a btree with given mempool
59 *
60 * @head: the btree head to initialise
61 * @mempool: the mempool to use
62 *
63 * When this function is used, there is no need to destroy
64 * the mempool.
65 */
66void btree_init_mempool(struct btree_head *head, mempool_t *mempool);
67
68/**
69 * btree_init - initialise a btree
70 *
71 * @head: the btree head to initialise
72 *
73 * This function allocates the memory pool that the
74 * btree needs. Returns zero or a negative error code
75 * (-%ENOMEM) when memory allocation fails.
76 *
77 */
78int __must_check btree_init(struct btree_head *head);
79
80/**
81 * btree_destroy - destroy mempool
82 *
83 * @head: the btree head to destroy
84 *
85 * This function destroys the internal memory pool, use only
86 * when using btree_init(), not with btree_init_mempool().
87 */
88void btree_destroy(struct btree_head *head);
89
90/**
91 * btree_lookup - look up a key in the btree
92 *
93 * @head: the btree to look in
94 * @geo: the btree geometry
95 * @key: the key to look up
96 *
97 * This function returns the value for the given key, or %NULL.
98 */
99void *btree_lookup(struct btree_head *head, struct btree_geo *geo,
100 unsigned long *key);
101
102/**
103 * btree_insert - insert an entry into the btree
104 *
105 * @head: the btree to add to
106 * @geo: the btree geometry
107 * @key: the key to add (must not already be present)
108 * @val: the value to add (must not be %NULL)
109 * @gfp: allocation flags for node allocations
110 *
111 * This function returns 0 if the item could be added, or an
112 * error code if it failed (may fail due to memory pressure).
113 */
114int __must_check btree_insert(struct btree_head *head, struct btree_geo *geo,
115 unsigned long *key, void *val, gfp_t gfp);
116/**
117 * btree_update - update an entry in the btree
118 *
119 * @head: the btree to update
120 * @geo: the btree geometry
121 * @key: the key to update
122 * @val: the value to change it to (must not be %NULL)
123 *
124 * This function returns 0 if the update was successful, or
125 * -%ENOENT if the key could not be found.
126 */
127int btree_update(struct btree_head *head, struct btree_geo *geo,
128 unsigned long *key, void *val);
129/**
130 * btree_remove - remove an entry from the btree
131 *
132 * @head: the btree to update
133 * @geo: the btree geometry
134 * @key: the key to remove
135 *
136 * This function returns the removed entry, or %NULL if the key
137 * could not be found.
138 */
139void *btree_remove(struct btree_head *head, struct btree_geo *geo,
140 unsigned long *key);
141
142/**
143 * btree_merge - merge two btrees
144 *
145 * @target: the tree that gets all the entries
146 * @victim: the tree that gets merged into @target
147 * @geo: the btree geometry
148 * @gfp: allocation flags
149 *
150 * The two trees @target and @victim may not contain the same keys,
151 * that is a bug and triggers a BUG(). This function returns zero
152 * if the trees were merged successfully, and may return a failure
153 * when memory allocation fails, in which case both trees might have
154 * been partially merged, i.e. some entries have been moved from
155 * @victim to @target.
156 */
157int btree_merge(struct btree_head *target, struct btree_head *victim,
158 struct btree_geo *geo, gfp_t gfp);
159
160/**
161 * btree_last - get last entry in btree
162 *
163 * @head: btree head
164 * @geo: btree geometry
165 * @key: last key
166 *
167 * Returns the last entry in the btree, and sets @key to the key
168 * of that entry; returns NULL if the tree is empty, in that case
169 * key is not changed.
170 */
171void *btree_last(struct btree_head *head, struct btree_geo *geo,
172 unsigned long *key);
173
174/**
175 * btree_get_prev - get previous entry
176 *
177 * @head: btree head
178 * @geo: btree geometry
179 * @key: pointer to key
180 *
181 * The function returns the next item right before the value pointed to by
182 * @key, and updates @key with its key, or returns %NULL when there is no
183 * entry with a key smaller than the given key.
184 */
185void *btree_get_prev(struct btree_head *head, struct btree_geo *geo,
186 unsigned long *key);
187
188
189/* internal use, use btree_visitor{l,32,64,128} */
190size_t btree_visitor(struct btree_head *head, struct btree_geo *geo,
191 unsigned long opaque,
192 void (*func)(void *elem, unsigned long opaque,
193 unsigned long *key, size_t index,
194 void *func2),
195 void *func2);
196
197/* internal use, use btree_grim_visitor{l,32,64,128} */
198size_t btree_grim_visitor(struct btree_head *head, struct btree_geo *geo,
199 unsigned long opaque,
200 void (*func)(void *elem, unsigned long opaque,
201 unsigned long *key,
202 size_t index, void *func2),
203 void *func2);
204
205
206#include <linux/btree-128.h>
207
208extern struct btree_geo btree_geo32;
209#define BTREE_TYPE_SUFFIX l
210#define BTREE_TYPE_BITS BITS_PER_LONG
211#define BTREE_TYPE_GEO &btree_geo32
212#define BTREE_KEYTYPE unsigned long
213#include <linux/btree-type.h>
214
215#define btree_for_each_safel(head, key, val) \
216 for (val = btree_lastl(head, &key); \
217 val; \
218 val = btree_get_prevl(head, &key))
219
220#define BTREE_TYPE_SUFFIX 32
221#define BTREE_TYPE_BITS 32
222#define BTREE_TYPE_GEO &btree_geo32
223#define BTREE_KEYTYPE u32
224#include <linux/btree-type.h>
225
226#define btree_for_each_safe32(head, key, val) \
227 for (val = btree_last32(head, &key); \
228 val; \
229 val = btree_get_prev32(head, &key))
230
231extern struct btree_geo btree_geo64;
232#define BTREE_TYPE_SUFFIX 64
233#define BTREE_TYPE_BITS 64
234#define BTREE_TYPE_GEO &btree_geo64
235#define BTREE_KEYTYPE u64
236#include <linux/btree-type.h>
237
238#define btree_for_each_safe64(head, key, val) \
239 for (val = btree_last64(head, &key); \
240 val; \
241 val = btree_get_prev64(head, &key))
242
243#endif
diff --git a/lib/Kconfig b/lib/Kconfig
index 8034c46327cb..496d16e1fa2c 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -163,6 +163,9 @@ config TEXTSEARCH_FSM
163config LIST_SORT 163config LIST_SORT
164 boolean 164 boolean
165 165
166config BTREE
167 boolean
168
166config HAS_IOMEM 169config HAS_IOMEM
167 boolean 170 boolean
168 depends on !NO_IOMEM 171 depends on !NO_IOMEM
diff --git a/lib/Makefile b/lib/Makefile
index e39c361b0be3..59e46a014bc6 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -42,6 +42,7 @@ obj-$(CONFIG_GENERIC_FIND_LAST_BIT) += find_last_bit.o
42obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o 42obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
43obj-$(CONFIG_LIST_SORT) += list_sort.o 43obj-$(CONFIG_LIST_SORT) += list_sort.o
44obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o 44obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
45obj-$(CONFIG_BTREE) += btree.o
45obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o 46obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
46obj-$(CONFIG_DEBUG_LIST) += list_debug.o 47obj-$(CONFIG_DEBUG_LIST) += list_debug.o
47obj-$(CONFIG_DEBUG_OBJECTS) += debugobjects.o 48obj-$(CONFIG_DEBUG_OBJECTS) += debugobjects.o
diff --git a/lib/btree.c b/lib/btree.c
new file mode 100644
index 000000000000..41859a820218
--- /dev/null
+++ b/lib/btree.c
@@ -0,0 +1,797 @@
1/*
2 * lib/btree.c - Simple In-memory B+Tree
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2007-2008 Joern Engel <joern@logfs.org>
7 * Bits and pieces stolen from Peter Zijlstra's code, which is
8 * Copyright 2007, Red Hat Inc. Peter Zijlstra <pzijlstr@redhat.com>
9 * GPLv2
10 *
11 * see http://programming.kicks-ass.net/kernel-patches/vma_lookup/btree.patch
12 *
13 * A relatively simple B+Tree implementation. I have written it as a learning
14 * exercise to understand how B+Trees work. Turned out to be useful as well.
15 *
16 * B+Trees can be used similar to Linux radix trees (which don't have anything
17 * in common with textbook radix trees, beware). Prerequisite for them working
18 * well is that access to a random tree node is much faster than a large number
19 * of operations within each node.
20 *
21 * Disks have fulfilled the prerequisite for a long time. More recently DRAM
22 * has gained similar properties, as memory access times, when measured in cpu
23 * cycles, have increased. Cacheline sizes have increased as well, which also
24 * helps B+Trees.
25 *
26 * Compared to radix trees, B+Trees are more efficient when dealing with a
27 * sparsely populated address space. Between 25% and 50% of the memory is
28 * occupied with valid pointers. When densely populated, radix trees contain
29 * ~98% pointers - hard to beat. Very sparse radix trees contain only ~2%
30 * pointers.
31 *
32 * This particular implementation stores pointers identified by a long value.
33 * Storing NULL pointers is illegal, lookup will return NULL when no entry
34 * was found.
35 *
36 * A trick was used that is not commonly found in textbooks. The lowest
37 * values are to the right, not to the left. All used slots within a node
38 * are on the left, all unused slots contain NUL values. Most operations
39 * simply loop once over all slots and terminate on the first NUL.
40 */
41
42#include <linux/btree.h>
43#include <linux/cache.h>
44#include <linux/kernel.h>
45#include <linux/slab.h>
46#include <linux/module.h>
47
48#define MAX(a, b) ((a) > (b) ? (a) : (b))
49#define NODESIZE MAX(L1_CACHE_BYTES, 128)
50
/*
 * Geometry of one btree flavour.  A node is an array of unsigned longs:
 * first all keys, then all values (see bkey() and bval()).
 */
51struct btree_geo {
52 int keylen; /* number of unsigned longs that make up one key */
53 int no_pairs; /* number of key/value slots in a node */
54 int no_longs; /* index of the first value, i.e. longs used by all keys */
55};
56
/*
 * Geometries for the supported key widths.  In each of them no_pairs is the
 * number of (key, value) slots that fit into NODESIZE bytes when a key
 * takes keylen longs and a value one long, and no_longs = keylen * no_pairs.
 */

/* One long per key. */
57struct btree_geo btree_geo32 = {
58 .keylen = 1,
59 .no_pairs = NODESIZE / sizeof(long) / 2,
60 .no_longs = NODESIZE / sizeof(long) / 2,
61};
62EXPORT_SYMBOL_GPL(btree_geo32);
63
/* Number of longs needed to hold one u64. */
64#define LONG_PER_U64 (64 / BITS_PER_LONG)
/* One u64 per key. */
65struct btree_geo btree_geo64 = {
66 .keylen = LONG_PER_U64,
67 .no_pairs = NODESIZE / sizeof(long) / (1 + LONG_PER_U64),
68 .no_longs = LONG_PER_U64 * (NODESIZE / sizeof(long) / (1 + LONG_PER_U64)),
69};
70EXPORT_SYMBOL_GPL(btree_geo64);
71
/* Two u64 per key. */
72struct btree_geo btree_geo128 = {
73 .keylen = 2 * LONG_PER_U64,
74 .no_pairs = NODESIZE / sizeof(long) / (1 + 2 * LONG_PER_U64),
75 .no_longs = 2 * LONG_PER_U64 * (NODESIZE / sizeof(long) / (1 + 2 * LONG_PER_U64)),
76};
77EXPORT_SYMBOL_GPL(btree_geo128);
78
79static struct kmem_cache *btree_cachep;
80
/*
 * mempool allocation callback: takes a node from btree_cachep.
 * @pool_data is ignored.
 */
81void *btree_alloc(gfp_t gfp_mask, void *pool_data)
82{
83 return kmem_cache_alloc(btree_cachep, gfp_mask);
84}
85EXPORT_SYMBOL_GPL(btree_alloc);
86
/*
 * mempool free callback: returns @element to btree_cachep.
 * @pool_data is ignored.
 */
87void btree_free(void *element, void *pool_data)
88{
89 kmem_cache_free(btree_cachep, element);
90}
91EXPORT_SYMBOL_GPL(btree_free);
92
93static unsigned long *btree_node_alloc(struct btree_head *head, gfp_t gfp)
94{
95 unsigned long *node;
96
97 node = mempool_alloc(head->mempool, gfp);
98 memset(node, 0, NODESIZE);
99 return node;
100}
101
/*
 * Compare two arrays of @n unsigned longs element by element, starting at
 * index 0.  Returns -1 or 1 according to the first differing element, or
 * 0 when all @n elements are equal (including n == 0).
 */
static int longcmp(const unsigned long *l1, const unsigned long *l2, size_t n)
{
	const unsigned long *end = l1 + n;

	for (; l1 != end; l1++, l2++) {
		if (*l1 != *l2)
			return (*l1 < *l2) ? -1 : 1;
	}
	return 0;
}
114
/*
 * Copy @n unsigned longs from @src to @dest, lowest index first.
 * Returns @dest.
 */
static unsigned long *longcpy(unsigned long *dest, const unsigned long *src,
		size_t n)
{
	unsigned long *out = dest;

	while (n--)
		*out++ = *src++;
	return dest;
}
124
/*
 * Store the value @c into the first @n unsigned longs of @s.
 * Returns @s.
 */
static unsigned long *longset(unsigned long *s, unsigned long c, size_t n)
{
	unsigned long *out = s;

	while (n--)
		*out++ = c;
	return s;
}
133
134static void dec_key(struct btree_geo *geo, unsigned long *key)
135{
136 unsigned long val;
137 int i;
138
139 for (i = geo->keylen - 1; i >= 0; i--) {
140 val = key[i];
141 key[i] = val - 1;
142 if (val)
143 break;
144 }
145}
146
/*
 * Node slot accessors.  Keys occupy the first geo->no_longs longs of a
 * node (keylen longs each); the values follow, one long per slot.
 */

/* Address of the key in slot @n. */
147static unsigned long *bkey(struct btree_geo *geo, unsigned long *node, int n)
148{
149 return &node[n * geo->keylen];
150}
151
/* Value stored in slot @n, as a pointer. */
152static void *bval(struct btree_geo *geo, unsigned long *node, int n)
153{
154 return (void *)node[geo->no_longs + n];
155}
156
/* Copy @key (keylen longs) into slot @n. */
157static void setkey(struct btree_geo *geo, unsigned long *node, int n,
158 unsigned long *key)
159{
160 longcpy(bkey(geo, node, n), key, geo->keylen);
161}
162
/* Store @val in slot @n. */
163static void setval(struct btree_geo *geo, unsigned long *node, int n,
164 void *val)
165{
166 node[geo->no_longs + n] = (unsigned long) val;
167}
168
/* Zero both the key and the value of slot @n. */
169static void clearpair(struct btree_geo *geo, unsigned long *node, int n)
170{
171 longset(bkey(geo, node, n), 0, geo->keylen);
172 node[geo->no_longs + n] = 0;
173}
174
/* Reset @head to the empty-tree state (no root, height 0); mempool untouched. */
175static inline void __btree_init(struct btree_head *head)
176{
177 head->node = NULL;
178 head->height = 0;
179}
180
/* Initialise @head as an empty tree using the caller-provided @mempool. */
181void btree_init_mempool(struct btree_head *head, mempool_t *mempool)
182{
183 __btree_init(head);
184 head->mempool = mempool;
185}
186EXPORT_SYMBOL_GPL(btree_init_mempool);
187
/*
 * Initialise @head as an empty tree with its own mempool, created with
 * btree_alloc/btree_free as callbacks.  Returns 0, or -ENOMEM when the
 * mempool could not be created.
 */
188int btree_init(struct btree_head *head)
189{
190 __btree_init(head);
191 head->mempool = mempool_create(0, btree_alloc, btree_free, NULL);
192 if (!head->mempool)
193 return -ENOMEM;
194 return 0;
195}
196EXPORT_SYMBOL_GPL(btree_init);
197
/* Destroy the head's mempool and clear the pointer; nodes are not walked here. */
198void btree_destroy(struct btree_head *head)
199{
200 mempool_destroy(head->mempool);
201 head->mempool = NULL;
202}
203EXPORT_SYMBOL_GPL(btree_destroy);
204
205void *btree_last(struct btree_head *head, struct btree_geo *geo,
206 unsigned long *key)
207{
208 int height = head->height;
209 unsigned long *node = head->node;
210
211 if (height == 0)
212 return NULL;
213
214 for ( ; height > 1; height--)
215 node = bval(geo, node, 0);
216
217 longcpy(key, bkey(geo, node, 0), geo->keylen);
218 return bval(geo, node, 0);
219}
220EXPORT_SYMBOL_GPL(btree_last);
221
/*
 * Compare the key in slot @pos of @node with @key.
 * Returns <0, 0 or >0 as the slot key is smaller, equal or larger.
 */
222static int keycmp(struct btree_geo *geo, unsigned long *node, int pos,
223 unsigned long *key)
224{
225 return longcmp(bkey(geo, node, pos), key, geo->keylen);
226}
227
228static int keyzero(struct btree_geo *geo, unsigned long *key)
229{
230 int i;
231
232 for (i = 0; i < geo->keylen; i++)
233 if (key[i])
234 return 0;
235
236 return 1;
237}
238
/*
 * Look up @key.  On each interior level the first slot whose key is <= the
 * search key is followed; on the lowest level the slots are scanned for an
 * exact match.  Returns the stored value, or NULL when the tree is empty,
 * the descent hits a missing child, or no exact match exists.
 */
239void *btree_lookup(struct btree_head *head, struct btree_geo *geo,
240 unsigned long *key)
241{
242 int i, height = head->height;
243 unsigned long *node = head->node;
244
245 if (height == 0)
246 return NULL;
247
248 for ( ; height > 1; height--) {
249 for (i = 0; i < geo->no_pairs; i++)
250 if (keycmp(geo, node, i, key) <= 0)
251 break;
252 if (i == geo->no_pairs)
253 return NULL;
254 node = bval(geo, node, i);
255 if (!node)
256 return NULL;
257 }
258
259 if (!node)
260 return NULL;
261
262 for (i = 0; i < geo->no_pairs; i++)
263 if (keycmp(geo, node, i, key) == 0)
264 return bval(geo, node, i);
265 return NULL;
266}
267EXPORT_SYMBOL_GPL(btree_lookup);
268
/*
 * Replace the value stored under @key with @val.  The descent is the same
 * as in btree_lookup().  Returns 0 when the key was found and updated,
 * -ENOENT otherwise.
 */
269int btree_update(struct btree_head *head, struct btree_geo *geo,
270 unsigned long *key, void *val)
271{
272 int i, height = head->height;
273 unsigned long *node = head->node;
274
275 if (height == 0)
276 return -ENOENT;
277
278 for ( ; height > 1; height--) {
279 for (i = 0; i < geo->no_pairs; i++)
280 if (keycmp(geo, node, i, key) <= 0)
281 break;
282 if (i == geo->no_pairs)
283 return -ENOENT;
284 node = bval(geo, node, i);
285 if (!node)
286 return -ENOENT;
287 }
288
289 if (!node)
290 return -ENOENT;
291
292 for (i = 0; i < geo->no_pairs; i++)
293 if (keycmp(geo, node, i, key) == 0) {
294 setval(geo, node, i, val);
295 return 0;
296 }
297 return -ENOENT;
298}
299EXPORT_SYMBOL_GPL(btree_update);
300
301/*
302 * Usually this function is quite similar to normal lookup. But the key of
303 * a parent node may be smaller than the smallest key of all its siblings.
304 * In such a case we cannot just return NULL, as we have only proven that no
305 * key smaller than __key, but larger than this parent key exists.
306 * So we set __key to the parent key and retry. We have to use the smallest
307 * such parent key, which is the last parent key we encountered.
308 */
309void *btree_get_prev(struct btree_head *head, struct btree_geo *geo,
310 unsigned long *__key)
311{
312 int i, height;
313 unsigned long *node, *oldnode;
314 unsigned long *retry_key = NULL, key[geo->keylen];
315
316 if (keyzero(geo, __key))
317 return NULL;
318
319 if (head->height == 0)
320 return NULL;
321retry:
322 longcpy(key, __key, geo->keylen);
323 dec_key(geo, key);
324
325 node = head->node;
326 for (height = head->height ; height > 1; height--) {
327 for (i = 0; i < geo->no_pairs; i++)
328 if (keycmp(geo, node, i, key) <= 0)
329 break;
330 if (i == geo->no_pairs)
331 goto miss;
332 oldnode = node;
333 node = bval(geo, node, i);
334 if (!node)
335 goto miss;
336 retry_key = bkey(geo, oldnode, i);
337 }
338
339 if (!node)
340 goto miss;
341
342 for (i = 0; i < geo->no_pairs; i++) {
343 if (keycmp(geo, node, i, key) <= 0) {
344 if (bval(geo, node, i)) {
345 longcpy(__key, bkey(geo, node, i), geo->keylen);
346 return bval(geo, node, i);
347 } else
348 goto miss;
349 }
350 }
351miss:
352 if (retry_key) {
353 __key = retry_key;
354 retry_key = NULL;
355 goto retry;
356 }
357 return NULL;
358}
359
360static int getpos(struct btree_geo *geo, unsigned long *node,
361 unsigned long *key)
362{
363 int i;
364
365 for (i = 0; i < geo->no_pairs; i++) {
366 if (keycmp(geo, node, i, key) <= 0)
367 break;
368 }
369 return i;
370}
371
372static int getfill(struct btree_geo *geo, unsigned long *node, int start)
373{
374 int i;
375
376 for (i = start; i < geo->no_pairs; i++)
377 if (!bval(geo, node, i))
378 break;
379 return i;
380}
381
382/*
383 * locate the correct leaf node in the btree
384 */
385static unsigned long *find_level(struct btree_head *head, struct btree_geo *geo,
386 unsigned long *key, int level)
387{
388 unsigned long *node = head->node;
389 int i, height;
390
391 for (height = head->height; height > level; height--) {
392 for (i = 0; i < geo->no_pairs; i++)
393 if (keycmp(geo, node, i, key) <= 0)
394 break;
395
396 if ((i == geo->no_pairs) || !bval(geo, node, i)) {
397 /* right-most key is too large, update it */
398 /* FIXME: If the right-most key on higher levels is
399 * always zero, this wouldn't be necessary. */
400 i--;
401 setkey(geo, node, i, key);
402 }
403 BUG_ON(i < 0);
404 node = bval(geo, node, i);
405 }
406 BUG_ON(!node);
407 return node;
408}
409
/*
 * Add one level on top of the tree.  A fresh node becomes the new root;
 * if there was a root already, it becomes the single child in slot 0 of
 * the new root, keyed by the key of the old root's last used slot.
 * Returns 0, or -ENOMEM when no node could be allocated.
 */
410static int btree_grow(struct btree_head *head, struct btree_geo *geo,
411 gfp_t gfp)
412{
413 unsigned long *node;
414 int fill;
415
416 node = btree_node_alloc(head, gfp);
417 if (!node)
418 return -ENOMEM;
419 if (head->node) {
420 fill = getfill(geo, head->node, 0);
421 setkey(geo, node, 0, bkey(geo, head->node, fill - 1));
422 setval(geo, node, 0, head->node);
423 }
424 head->node = node;
425 head->height++;
426 return 0;
427}
428
/*
 * Remove the top level of the tree: the child in slot 0 of the root
 * becomes the new root and the old root is returned to the mempool.
 * Does nothing for trees of height <= 1.  It is a BUG() if the root has
 * more than one used slot.
 */
429static void btree_shrink(struct btree_head *head, struct btree_geo *geo)
430{
431 unsigned long *node;
432 int fill;
433
434 if (head->height <= 1)
435 return;
436
437 node = head->node;
438 fill = getfill(geo, node, 0);
439 BUG_ON(fill > 1);
440 head->node = bval(geo, node, 0);
441 head->height--;
442 mempool_free(node, head->mempool);
443}
444
/*
 * Insert (@key, @val) into the node at @level on @key's path.
 *
 * The tree is grown by one level first when it is lower than @level.  If
 * the target node is full it is split: a new node is allocated and
 * inserted into the parent level (recursively, keyed by the key in slot
 * fill / 2 - 1), the first half of the entries is moved into the new node
 * and the rest is shifted to the front of the old one; then the whole
 * insertion is retried.  Otherwise the entries from the insert position on
 * are shifted by one slot and the new pair is stored.
 *
 * @val must not be NULL and @key must not be present yet (both BUG()).
 * Returns 0 on success or a negative error code from allocation.
 */
445static int btree_insert_level(struct btree_head *head, struct btree_geo *geo,
446 unsigned long *key, void *val, int level,
447 gfp_t gfp)
448{
449 unsigned long *node;
450 int i, pos, fill, err;
451
452 BUG_ON(!val);
453 if (head->height < level) {
454 err = btree_grow(head, geo, gfp);
455 if (err)
456 return err;
457 }
458
459retry:
460 node = find_level(head, geo, key, level);
461 pos = getpos(geo, node, key);
462 fill = getfill(geo, node, pos);
463 /* two identical keys are not allowed */
464 BUG_ON(pos < fill && keycmp(geo, node, pos, key) == 0);
465
466 if (fill == geo->no_pairs) {
467 /* need to split node */
468 unsigned long *new;
469
470 new = btree_node_alloc(head, gfp);
471 if (!new)
472 return -ENOMEM;
 /* hook the (still empty) new node into the parent level first */
473 err = btree_insert_level(head, geo,
474 bkey(geo, node, fill / 2 - 1),
475 new, level + 1, gfp);
476 if (err) {
477 mempool_free(new, head->mempool);
478 return err;
479 }
 /* move the first half to new, shift the second half to the front */
480 for (i = 0; i < fill / 2; i++) {
481 setkey(geo, new, i, bkey(geo, node, i));
482 setval(geo, new, i, bval(geo, node, i));
483 setkey(geo, node, i, bkey(geo, node, i + fill / 2));
484 setval(geo, node, i, bval(geo, node, i + fill / 2));
485 clearpair(geo, node, i + fill / 2);
486 }
 /* odd fill: one entry is left over at the end, move it down too */
487 if (fill & 1) {
488 setkey(geo, node, i, bkey(geo, node, fill - 1));
489 setval(geo, node, i, bval(geo, node, fill - 1));
490 clearpair(geo, node, fill - 1);
491 }
492 goto retry;
493 }
494 BUG_ON(fill >= geo->no_pairs);
495
496 /* shift and insert */
497 for (i = fill; i > pos; i--) {
498 setkey(geo, node, i, bkey(geo, node, i - 1));
499 setval(geo, node, i, bval(geo, node, i - 1));
500 }
501 setkey(geo, node, pos, key);
502 setval(geo, node, pos, val);
503
504 return 0;
505}
506
/* Public insert: inserts (@key, @val) at level 1, the lowest level. */
507int btree_insert(struct btree_head *head, struct btree_geo *geo,
508 unsigned long *key, void *val, gfp_t gfp)
509{
510 return btree_insert_level(head, geo, key, val, 1, gfp);
511}
512EXPORT_SYMBOL_GPL(btree_insert);
513
/* Forward declaration: merge() and btree_remove_level() call each other. */
514static void *btree_remove_level(struct btree_head *head, struct btree_geo *geo,
515 unsigned long *key, int level);
/*
 * Merge two neighbouring children of @parent.  All @rfill entries of
 * @right are appended to @left after its @lfill entries.  The two child
 * pointers in the parent (slots @lpos and @lpos + 1) are then swapped, so
 * that removing the parent entry keyed by slot @lpos drops the pointer to
 * @right while @left stays reachable.  Finally @right is freed.
 */
516static void merge(struct btree_head *head, struct btree_geo *geo, int level,
517 unsigned long *left, int lfill,
518 unsigned long *right, int rfill,
519 unsigned long *parent, int lpos)
520{
521 int i;
522
523 for (i = 0; i < rfill; i++) {
524 /* Move all keys to the left */
525 setkey(geo, left, lfill + i, bkey(geo, right, i));
526 setval(geo, left, lfill + i, bval(geo, right, i));
527 }
528 /* Exchange left and right child in parent */
529 setval(geo, parent, lpos, right);
530 setval(geo, parent, lpos + 1, left);
531 /* Remove left (formerly right) child from parent */
532 btree_remove_level(head, geo, bkey(geo, parent, lpos), level + 1);
533 mempool_free(right, head->mempool);
534}
535
/*
 * Called after a removal left @child (the node at @level on @key's path)
 * with only @fill entries.  An empty child is removed from its parent and
 * freed.  Otherwise the child is merged with its left or right neighbour
 * in the parent when the combined entries fit into one node; if neither
 * fits, nothing is done.
 */
536static void rebalance(struct btree_head *head, struct btree_geo *geo,
537 unsigned long *key, int level, unsigned long *child, int fill)
538{
539 unsigned long *parent, *left = NULL, *right = NULL;
540 int i, no_left, no_right;
541
542 if (fill == 0) {
543 /* Because we don't steal entries from a neighbour, this case
544 * can happen. Parent node contains a single child, this
545 * node, so merging with a sibling never happens.
546 */
547 btree_remove_level(head, geo, key, level + 1);
548 mempool_free(child, head->mempool);
549 return;
550 }
551
552 parent = find_level(head, geo, key, level + 1);
553 i = getpos(geo, parent, key);
554 BUG_ON(bval(geo, parent, i) != child);
555
 /* try the neighbour in the preceding parent slot first */
556 if (i > 0) {
557 left = bval(geo, parent, i - 1);
558 no_left = getfill(geo, left, 0);
559 if (fill + no_left <= geo->no_pairs) {
560 merge(head, geo, level,
561 left, no_left,
562 child, fill,
563 parent, i - 1);
564 return;
565 }
566 }
 /* then the neighbour in the following parent slot, if one is in use */
567 if (i + 1 < getfill(geo, parent, i)) {
568 right = bval(geo, parent, i + 1);
569 no_right = getfill(geo, right, 0);
570 if (fill + no_right <= geo->no_pairs) {
571 merge(head, geo, level,
572 child, fill,
573 right, no_right,
574 parent, i);
575 return;
576 }
577 }
578 /*
579 * We could also try to steal one entry from the left or right
580 * neighbor. By not doing so we changed the invariant from
581 * "all nodes are at least half full" to "no two neighboring
582 * nodes can be merged". Which means that the average fill of
583 * all nodes is still half or better.
584 */
585}
586
/*
 * Remove the entry for @key from the node at @level on @key's path and
 * return the value that was stored in that slot.
 *
 * When @level is above the current height the recursion has removed the
 * last node, so the tree is reset to empty.  On level 1 the key must match
 * exactly, otherwise NULL is returned and nothing changes.  After the
 * removal, a node that dropped below half of geo->no_pairs is rebalanced
 * (non-root levels), or the tree is shrunk when the root is left with a
 * single entry.
 */
587static void *btree_remove_level(struct btree_head *head, struct btree_geo *geo,
588 unsigned long *key, int level)
589{
590 unsigned long *node;
591 int i, pos, fill;
592 void *ret;
593
594 if (level > head->height) {
595 /* we recursed all the way up */
596 head->height = 0;
597 head->node = NULL;
598 return NULL;
599 }
600
601 node = find_level(head, geo, key, level);
602 pos = getpos(geo, node, key);
603 fill = getfill(geo, node, pos);
604 if ((level == 1) && (keycmp(geo, node, pos, key) != 0))
605 return NULL;
606 ret = bval(geo, node, pos);
607
608 /* remove and shift */
609 for (i = pos; i < fill - 1; i++) {
610 setkey(geo, node, i, bkey(geo, node, i + 1));
611 setval(geo, node, i, bval(geo, node, i + 1));
612 }
613 clearpair(geo, node, fill - 1);
614
 /* fill - 1 is the number of entries left in this node */
615 if (fill - 1 < geo->no_pairs / 2) {
616 if (level < head->height)
617 rebalance(head, geo, key, level, node, fill - 1);
618 else if (fill - 1 == 1)
619 btree_shrink(head, geo);
620 }
621
622 return ret;
623}
624
625void *btree_remove(struct btree_head *head, struct btree_geo *geo,
626 unsigned long *key)
627{
628 if (head->height == 0)
629 return NULL;
630
631 return btree_remove_level(head, geo, key, 1);
632}
633EXPORT_SYMBOL_GPL(btree_remove);
634
635int btree_merge(struct btree_head *target, struct btree_head *victim,
636 struct btree_geo *geo, gfp_t gfp)
637{
638 unsigned long key[geo->keylen];
639 unsigned long dup[geo->keylen];
640 void *val;
641 int err;
642
643 BUG_ON(target == victim);
644
645 if (!(target->node)) {
646 /* target is empty, just copy fields over */
647 target->node = victim->node;
648 target->height = victim->height;
649 __btree_init(victim);
650 return 0;
651 }
652
653 /* TODO: This needs some optimizations. Currently we do three tree
654 * walks to remove a single object from the victim.
655 */
656 for (;;) {
657 if (!btree_last(victim, geo, key))
658 break;
659 val = btree_lookup(victim, geo, key);
660 err = btree_insert(target, geo, key, val, gfp);
661 if (err)
662 return err;
663 /* We must make a copy of the key, as the original will get
664 * mangled inside btree_remove. */
665 longcpy(dup, key, geo->keylen);
666 btree_remove(victim, geo, dup);
667 }
668 return 0;
669}
670EXPORT_SYMBOL_GPL(btree_merge);
671
672static size_t __btree_for_each(struct btree_head *head, struct btree_geo *geo,
673 unsigned long *node, unsigned long opaque,
674 void (*func)(void *elem, unsigned long opaque,
675 unsigned long *key, size_t index,
676 void *func2),
677 void *func2, int reap, int height, size_t count)
678{
679 int i;
680 unsigned long *child;
681
682 for (i = 0; i < geo->no_pairs; i++) {
683 child = bval(geo, node, i);
684 if (!child)
685 break;
686 if (height > 1)
687 count = __btree_for_each(head, geo, child, opaque,
688 func, func2, reap, height - 1, count);
689 else
690 func(child, opaque, bkey(geo, node, i), count++,
691 func2);
692 }
693 if (reap)
694 mempool_free(node, head->mempool);
695 return count;
696}
697
/*
 * empty - callback that ignores every element
 *
 * btree_visitor() and btree_grim_visitor() substitute this for the
 * caller's callback when no func2 was supplied.
 */
static void empty(void *elem, unsigned long opaque, unsigned long *key,
		size_t index, void *func2)
{
	/* Intentionally does nothing. */
}
702
703void visitorl(void *elem, unsigned long opaque, unsigned long *key,
704 size_t index, void *__func)
705{
706 visitorl_t func = __func;
707
708 func(elem, opaque, *key, index);
709}
710EXPORT_SYMBOL_GPL(visitorl);
711
712void visitor32(void *elem, unsigned long opaque, unsigned long *__key,
713 size_t index, void *__func)
714{
715 visitor32_t func = __func;
716 u32 *key = (void *)__key;
717
718 func(elem, opaque, *key, index);
719}
720EXPORT_SYMBOL_GPL(visitor32);
721
722void visitor64(void *elem, unsigned long opaque, unsigned long *__key,
723 size_t index, void *__func)
724{
725 visitor64_t func = __func;
726 u64 *key = (void *)__key;
727
728 func(elem, opaque, *key, index);
729}
730EXPORT_SYMBOL_GPL(visitor64);
731
732void visitor128(void *elem, unsigned long opaque, unsigned long *__key,
733 size_t index, void *__func)
734{
735 visitor128_t func = __func;
736 u64 *key = (void *)__key;
737
738 func(elem, opaque, key[0], key[1], index);
739}
740EXPORT_SYMBOL_GPL(visitor128);
741
742size_t btree_visitor(struct btree_head *head, struct btree_geo *geo,
743 unsigned long opaque,
744 void (*func)(void *elem, unsigned long opaque,
745 unsigned long *key,
746 size_t index, void *func2),
747 void *func2)
748{
749 size_t count = 0;
750
751 if (!func2)
752 func = empty;
753 if (head->node)
754 count = __btree_for_each(head, geo, head->node, opaque, func,
755 func2, 0, head->height, 0);
756 return count;
757}
758EXPORT_SYMBOL_GPL(btree_visitor);
759
760size_t btree_grim_visitor(struct btree_head *head, struct btree_geo *geo,
761 unsigned long opaque,
762 void (*func)(void *elem, unsigned long opaque,
763 unsigned long *key,
764 size_t index, void *func2),
765 void *func2)
766{
767 size_t count = 0;
768
769 if (!func2)
770 func = empty;
771 if (head->node)
772 count = __btree_for_each(head, geo, head->node, opaque, func,
773 func2, 1, head->height, 0);
774 __btree_init(head);
775 return count;
776}
777EXPORT_SYMBOL_GPL(btree_grim_visitor);
778
779static int __init btree_module_init(void)
780{
781 btree_cachep = kmem_cache_create("btree_node", NODESIZE, 0,
782 SLAB_HWCACHE_ALIGN, NULL);
783 return 0;
784}
785
786static void __exit btree_module_exit(void)
787{
788 kmem_cache_destroy(btree_cachep);
789}
790
/*
 * Register the init/exit handlers that create and destroy the node
 * cache.  If core code starts using btree, initialization should
 * happen even earlier than module_init() allows.
 */
module_init(btree_module_init);
module_exit(btree_module_exit);

/* Module metadata. */
MODULE_AUTHOR("Joern Engel <joern@logfs.org>");
MODULE_AUTHOR("Johannes Berg <johannes@sipsolutions.net>");
MODULE_LICENSE("GPL");