summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorChristoph Hellwig <hch@lst.de>2016-09-11 10:04:46 -0400
committerAl Viro <viro@zeniv.linux.org.uk>2016-12-14 23:48:11 -0500
commit1d0fd57a50aa372dd2e84b16711023cbcd826cb8 (patch)
treed80b0315115e733f41fb412dbbb3cba035bbfcff
parent64d2ab32efe39354c29e1ecefea3769586026979 (diff)
logfs: remove from tree
Logfs was introduced to the kernel in 2009, and hasn't seen any non drive-by changes since 2012, while having lots of unsolved issues including the complete lack of error handling, with more and more issues popping up without any fixes. The logfs.org domain has been bouncing from a mail, and the maintainer on the non-logfs.org domain hasn't repsonded to past queries either. Signed-off-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
-rw-r--r--Documentation/filesystems/00-INDEX2
-rw-r--r--Documentation/filesystems/logfs.txt241
-rw-r--r--MAINTAINERS8
-rw-r--r--fs/Kconfig1
-rw-r--r--fs/Makefile1
-rw-r--r--fs/logfs/Kconfig17
-rw-r--r--fs/logfs/Makefile13
-rw-r--r--fs/logfs/compr.c95
-rw-r--r--fs/logfs/dev_bdev.c322
-rw-r--r--fs/logfs/dev_mtd.c274
-rw-r--r--fs/logfs/dir.c801
-rw-r--r--fs/logfs/file.c285
-rw-r--r--fs/logfs/gc.c732
-rw-r--r--fs/logfs/inode.c428
-rw-r--r--fs/logfs/journal.c894
-rw-r--r--fs/logfs/logfs.h735
-rw-r--r--fs/logfs/logfs_abi.h629
-rw-r--r--fs/logfs/readwrite.c2298
-rw-r--r--fs/logfs/segment.c961
-rw-r--r--fs/logfs/super.c653
20 files changed, 0 insertions, 9390 deletions
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index f66e748fc5e4..b7bd6c9009cc 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -87,8 +87,6 @@ jfs.txt
87 - info and mount options for the JFS filesystem. 87 - info and mount options for the JFS filesystem.
88locks.txt 88locks.txt
89 - info on file locking implementations, flock() vs. fcntl(), etc. 89 - info on file locking implementations, flock() vs. fcntl(), etc.
90logfs.txt
91 - info on the LogFS flash filesystem.
92mandatory-locking.txt 90mandatory-locking.txt
93 - info on the Linux implementation of Sys V mandatory file locking. 91 - info on the Linux implementation of Sys V mandatory file locking.
94ncpfs.txt 92ncpfs.txt
diff --git a/Documentation/filesystems/logfs.txt b/Documentation/filesystems/logfs.txt
deleted file mode 100644
index bca42c22a143..000000000000
--- a/Documentation/filesystems/logfs.txt
+++ /dev/null
@@ -1,241 +0,0 @@
1
2The LogFS Flash Filesystem
3==========================
4
5Specification
6=============
7
8Superblocks
9-----------
10
11Two superblocks exist at the beginning and end of the filesystem.
12Each superblock is 256 Bytes large, with another 3840 Bytes reserved
13for future purposes, making a total of 4096 Bytes.
14
15Superblock locations may differ for MTD and block devices. On MTD the
16first non-bad block contains a superblock in the first 4096 Bytes and
17the last non-bad block contains a superblock in the last 4096 Bytes.
18On block devices, the first 4096 Bytes of the device contain the first
19superblock and the last aligned 4096 Byte-block contains the second
20superblock.
21
22For the most part, the superblocks can be considered read-only. They
23are written only to correct errors detected within the superblocks,
24move the journal and change the filesystem parameters through tunefs.
25As a result, the superblock does not contain any fields that require
26constant updates, like the amount of free space, etc.
27
28Segments
29--------
30
31The space in the device is split up into equal-sized segments.
32Segments are the primary write unit of LogFS. Within each segments,
33writes happen from front (low addresses) to back (high addresses. If
34only a partial segment has been written, the segment number, the
35current position within and optionally a write buffer are stored in
36the journal.
37
38Segments are erased as a whole. Therefore Garbage Collection may be
39required to completely free a segment before doing so.
40
41Journal
42--------
43
44The journal contains all global information about the filesystem that
45is subject to frequent change. At mount time, it has to be scanned
46for the most recent commit entry, which contains a list of pointers to
47all currently valid entries.
48
49Object Store
50------------
51
52All space except for the superblocks and journal is part of the object
53store. Each segment contains a segment header and a number of
54objects, each consisting of the object header and the payload.
55Objects are either inodes, directory entries (dentries), file data
56blocks or indirect blocks.
57
58Levels
59------
60
61Garbage collection (GC) may fail if all data is written
62indiscriminately. One requirement of GC is that data is separated
63roughly according to the distance between the tree root and the data.
64Effectively that means all file data is on level 0, indirect blocks
65are on levels 1, 2, 3 4 or 5 for 1x, 2x, 3x, 4x or 5x indirect blocks,
66respectively. Inode file data is on level 6 for the inodes and 7-11
67for indirect blocks.
68
69Each segment contains objects of a single level only. As a result,
70each level requires its own separate segment to be open for writing.
71
72Inode File
73----------
74
75All inodes are stored in a special file, the inode file. Single
76exception is the inode file's inode (master inode) which for obvious
77reasons is stored in the journal instead. Instead of data blocks, the
78leaf nodes of the inode files are inodes.
79
80Aliases
81-------
82
83Writes in LogFS are done by means of a wandering tree. A naïve
84implementation would require that for each write or a block, all
85parent blocks are written as well, since the block pointers have
86changed. Such an implementation would not be very efficient.
87
88In LogFS, the block pointer changes are cached in the journal by means
89of alias entries. Each alias consists of its logical address - inode
90number, block index, level and child number (index into block) - and
91the changed data. Any 8-byte word can be changes in this manner.
92
93Currently aliases are used for block pointers, file size, file used
94bytes and the height of an inodes indirect tree.
95
96Segment Aliases
97---------------
98
99Related to regular aliases, these are used to handle bad blocks.
100Initially, bad blocks are handled by moving the affected segment
101content to a spare segment and noting this move in the journal with a
102segment alias, a simple (to, from) tupel. GC will later empty this
103segment and the alias can be removed again. This is used on MTD only.
104
105Vim
106---
107
108By cleverly predicting the life time of data, it is possible to
109separate long-living data from short-living data and thereby reduce
110the GC overhead later. Each type of distinc life expectency (vim) can
111have a separate segment open for writing. Each (level, vim) tupel can
112be open just once. If an open segment with unknown vim is encountered
113at mount time, it is closed and ignored henceforth.
114
115Indirect Tree
116-------------
117
118Inodes in LogFS are similar to FFS-style filesystems with direct and
119indirect block pointers. One difference is that LogFS uses a single
120indirect pointer that can be either a 1x, 2x, etc. indirect pointer.
121A height field in the inode defines the height of the indirect tree
122and thereby the indirection of the pointer.
123
124Another difference is the addressing of indirect blocks. In LogFS,
125the first 16 pointers in the first indirect block are left empty,
126corresponding to the 16 direct pointers in the inode. In ext2 (maybe
127others as well) the first pointer in the first indirect block
128corresponds to logical block 12, skipping the 12 direct pointers.
129So where ext2 is using arithmetic to better utilize space, LogFS keeps
130arithmetic simple and uses compression to save space.
131
132Compression
133-----------
134
135Both file data and metadata can be compressed. Compression for file
136data can be enabled with chattr +c and disabled with chattr -c. Doing
137so has no effect on existing data, but new data will be stored
138accordingly. New inodes will inherit the compression flag of the
139parent directory.
140
141Metadata is always compressed. However, the space accounting ignores
142this and charges for the uncompressed size. Failing to do so could
143result in GC failures when, after moving some data, indirect blocks
144compress worse than previously. Even on a 100% full medium, GC may
145not consume any extra space, so the compression gains are lost space
146to the user.
147
148However, they are not lost space to the filesystem internals. By
149cheating the user for those bytes, the filesystem gained some slack
150space and GC will run less often and faster.
151
152Garbage Collection and Wear Leveling
153------------------------------------
154
155Garbage collection is invoked whenever the number of free segments
156falls below a threshold. The best (known) candidate is picked based
157on the least amount of valid data contained in the segment. All
158remaining valid data is copied elsewhere, thereby invalidating it.
159
160The GC code also checks for aliases and writes then back if their
161number gets too large.
162
163Wear leveling is done by occasionally picking a suboptimal segment for
164garbage collection. If a stale segments erase count is significantly
165lower than the active segments' erase counts, it will be picked. Wear
166leveling is rate limited, so it will never monopolize the device for
167more than one segment worth at a time.
168
169Values for "occasionally", "significantly lower" are compile time
170constants.
171
172Hashed directories
173------------------
174
175To satisfy efficient lookup(), directory entries are hashed and
176located based on the hash. In order to both support large directories
177and not be overly inefficient for small directories, several hash
178tables of increasing size are used. For each table, the hash value
179modulo the table size gives the table index.
180
181Tables sizes are chosen to limit the number of indirect blocks with a
182fully populated table to 0, 1, 2 or 3 respectively. So the first
183table contains 16 entries, the second 512-16, etc.
184
185The last table is special in several ways. First its size depends on
186the effective 32bit limit on telldir/seekdir cookies. Since logfs
187uses the upper half of the address space for indirect blocks, the size
188is limited to 2^31. Secondly the table contains hash buckets with 16
189entries each.
190
191Using single-entry buckets would result in birthday "attacks". At
192just 2^16 used entries, hash collisions would be likely (P >= 0.5).
193My math skills are insufficient to do the combinatorics for the 17x
194collisions necessary to overflow a bucket, but testing showed that in
19510,000 runs the lowest directory fill before a bucket overflow was
196188,057,130 entries with an average of 315,149,915 entries. So for
197directory sizes of up to a million, bucket overflows should be
198virtually impossible under normal circumstances.
199
200With carefully chosen filenames, it is obviously possible to cause an
201overflow with just 21 entries (4 higher tables + 16 entries + 1). So
202there may be a security concern if a malicious user has write access
203to a directory.
204
205Open For Discussion
206===================
207
208Device Address Space
209--------------------
210
211A device address space is used for caching. Both block devices and
212MTD provide functions to either read a single page or write a segment.
213Partial segments may be written for data integrity, but where possible
214complete segments are written for performance on simple block device
215flash media.
216
217Meta Inodes
218-----------
219
220Inodes are stored in the inode file, which is just a regular file for
221most purposes. At umount time, however, the inode file needs to
222remain open until all dirty inodes are written. So
223generic_shutdown_super() may not close this inode, but shouldn't
224complain about remaining inodes due to the inode file either. Same
225goes for mapping inode of the device address space.
226
227Currently logfs uses a hack that essentially copies part of fs/inode.c
228code over. A general solution would be preferred.
229
230Indirect block mapping
231----------------------
232
233With compression, the block device (or mapping inode) cannot be used
234to cache indirect blocks. Some other place is required. Currently
235logfs uses the top half of each inode's address space. The low 8TB
236(on 32bit) are filled with file data, the high 8TB are used for
237indirect blocks.
238
239One problem is that 16TB files created on 64bit systems actually have
240data in the top 8TB. But files >16TB would cause problems anyway, so
241only the limit has changed.
diff --git a/MAINTAINERS b/MAINTAINERS
index 8d4148406923..a76d34a28ce9 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7432,14 +7432,6 @@ S: Maintained
7432F: Documentation/ldm.txt 7432F: Documentation/ldm.txt
7433F: block/partitions/ldm.* 7433F: block/partitions/ldm.*
7434 7434
7435LogFS
7436M: Joern Engel <joern@logfs.org>
7437M: Prasad Joshi <prasadjoshi.linux@gmail.com>
7438L: logfs@logfs.org
7439W: logfs.org
7440S: Maintained
7441F: fs/logfs/
7442
7443LSILOGIC MPT FUSION DRIVERS (FC/SAS/SPI) 7435LSILOGIC MPT FUSION DRIVERS (FC/SAS/SPI)
7444M: Sathya Prakash <sathya.prakash@broadcom.com> 7436M: Sathya Prakash <sathya.prakash@broadcom.com>
7445M: Chaitra P B <chaitra.basappa@broadcom.com> 7437M: Chaitra P B <chaitra.basappa@broadcom.com>
diff --git a/fs/Kconfig b/fs/Kconfig
index 4bd03a2b0518..884653fc6a8b 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -235,7 +235,6 @@ source "fs/efs/Kconfig"
235source "fs/jffs2/Kconfig" 235source "fs/jffs2/Kconfig"
236# UBIFS File system configuration 236# UBIFS File system configuration
237source "fs/ubifs/Kconfig" 237source "fs/ubifs/Kconfig"
238source "fs/logfs/Kconfig"
239source "fs/cramfs/Kconfig" 238source "fs/cramfs/Kconfig"
240source "fs/squashfs/Kconfig" 239source "fs/squashfs/Kconfig"
241source "fs/freevxfs/Kconfig" 240source "fs/freevxfs/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index ed2b63257ba9..7bbaca9c67b1 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -97,7 +97,6 @@ obj-$(CONFIG_NTFS_FS) += ntfs/
97obj-$(CONFIG_UFS_FS) += ufs/ 97obj-$(CONFIG_UFS_FS) += ufs/
98obj-$(CONFIG_EFS_FS) += efs/ 98obj-$(CONFIG_EFS_FS) += efs/
99obj-$(CONFIG_JFFS2_FS) += jffs2/ 99obj-$(CONFIG_JFFS2_FS) += jffs2/
100obj-$(CONFIG_LOGFS) += logfs/
101obj-$(CONFIG_UBIFS_FS) += ubifs/ 100obj-$(CONFIG_UBIFS_FS) += ubifs/
102obj-$(CONFIG_AFFS_FS) += affs/ 101obj-$(CONFIG_AFFS_FS) += affs/
103obj-$(CONFIG_ROMFS_FS) += romfs/ 102obj-$(CONFIG_ROMFS_FS) += romfs/
diff --git a/fs/logfs/Kconfig b/fs/logfs/Kconfig
deleted file mode 100644
index 2b4503163930..000000000000
--- a/fs/logfs/Kconfig
+++ /dev/null
@@ -1,17 +0,0 @@
1config LOGFS
2 tristate "LogFS file system"
3 depends on MTD || (!MTD && BLOCK)
4 select ZLIB_INFLATE
5 select ZLIB_DEFLATE
6 select CRC32
7 select BTREE
8 help
9 Flash filesystem aimed to scale efficiently to large devices.
10 In comparison to JFFS2 it offers significantly faster mount
11 times and potentially less RAM usage, although the latter has
12 not been measured yet.
13
14 In its current state it is still very experimental and should
15 not be used for other than testing purposes.
16
17 If unsure, say N.
diff --git a/fs/logfs/Makefile b/fs/logfs/Makefile
deleted file mode 100644
index 4820027787ee..000000000000
--- a/fs/logfs/Makefile
+++ /dev/null
@@ -1,13 +0,0 @@
1obj-$(CONFIG_LOGFS) += logfs.o
2
3logfs-y += compr.o
4logfs-y += dir.o
5logfs-y += file.o
6logfs-y += gc.o
7logfs-y += inode.o
8logfs-y += journal.o
9logfs-y += readwrite.o
10logfs-y += segment.o
11logfs-y += super.o
12logfs-$(CONFIG_BLOCK) += dev_bdev.o
13logfs-$(CONFIG_MTD) += dev_mtd.o
diff --git a/fs/logfs/compr.c b/fs/logfs/compr.c
deleted file mode 100644
index 961f02b86d97..000000000000
--- a/fs/logfs/compr.c
+++ /dev/null
@@ -1,95 +0,0 @@
1/*
2 * fs/logfs/compr.c - compression routines
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/vmalloc.h>
10#include <linux/zlib.h>
11
12#define COMPR_LEVEL 3
13
14static DEFINE_MUTEX(compr_mutex);
15static struct z_stream_s stream;
16
17int logfs_compress(void *in, void *out, size_t inlen, size_t outlen)
18{
19 int err, ret;
20
21 ret = -EIO;
22 mutex_lock(&compr_mutex);
23 err = zlib_deflateInit(&stream, COMPR_LEVEL);
24 if (err != Z_OK)
25 goto error;
26
27 stream.next_in = in;
28 stream.avail_in = inlen;
29 stream.total_in = 0;
30 stream.next_out = out;
31 stream.avail_out = outlen;
32 stream.total_out = 0;
33
34 err = zlib_deflate(&stream, Z_FINISH);
35 if (err != Z_STREAM_END)
36 goto error;
37
38 err = zlib_deflateEnd(&stream);
39 if (err != Z_OK)
40 goto error;
41
42 if (stream.total_out >= stream.total_in)
43 goto error;
44
45 ret = stream.total_out;
46error:
47 mutex_unlock(&compr_mutex);
48 return ret;
49}
50
51int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen)
52{
53 int err, ret;
54
55 ret = -EIO;
56 mutex_lock(&compr_mutex);
57 err = zlib_inflateInit(&stream);
58 if (err != Z_OK)
59 goto error;
60
61 stream.next_in = in;
62 stream.avail_in = inlen;
63 stream.total_in = 0;
64 stream.next_out = out;
65 stream.avail_out = outlen;
66 stream.total_out = 0;
67
68 err = zlib_inflate(&stream, Z_FINISH);
69 if (err != Z_STREAM_END)
70 goto error;
71
72 err = zlib_inflateEnd(&stream);
73 if (err != Z_OK)
74 goto error;
75
76 ret = 0;
77error:
78 mutex_unlock(&compr_mutex);
79 return ret;
80}
81
82int __init logfs_compr_init(void)
83{
84 size_t size = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL),
85 zlib_inflate_workspacesize());
86 stream.workspace = vmalloc(size);
87 if (!stream.workspace)
88 return -ENOMEM;
89 return 0;
90}
91
92void logfs_compr_exit(void)
93{
94 vfree(stream.workspace);
95}
diff --git a/fs/logfs/dev_bdev.c b/fs/logfs/dev_bdev.c
deleted file mode 100644
index a8329cc47dec..000000000000
--- a/fs/logfs/dev_bdev.c
+++ /dev/null
@@ -1,322 +0,0 @@
1/*
2 * fs/logfs/dev_bdev.c - Device access methods for block devices
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/bio.h>
10#include <linux/blkdev.h>
11#include <linux/buffer_head.h>
12#include <linux/gfp.h>
13#include <linux/prefetch.h>
14
15#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
16
17static int sync_request(struct page *page, struct block_device *bdev, int op)
18{
19 struct bio bio;
20 struct bio_vec bio_vec;
21
22 bio_init(&bio);
23 bio.bi_max_vecs = 1;
24 bio.bi_io_vec = &bio_vec;
25 bio_vec.bv_page = page;
26 bio_vec.bv_len = PAGE_SIZE;
27 bio_vec.bv_offset = 0;
28 bio.bi_vcnt = 1;
29 bio.bi_bdev = bdev;
30 bio.bi_iter.bi_sector = page->index * (PAGE_SIZE >> 9);
31 bio.bi_iter.bi_size = PAGE_SIZE;
32 bio_set_op_attrs(&bio, op, 0);
33
34 return submit_bio_wait(&bio);
35}
36
37static int bdev_readpage(void *_sb, struct page *page)
38{
39 struct super_block *sb = _sb;
40 struct block_device *bdev = logfs_super(sb)->s_bdev;
41 int err;
42
43 err = sync_request(page, bdev, READ);
44 if (err) {
45 ClearPageUptodate(page);
46 SetPageError(page);
47 } else {
48 SetPageUptodate(page);
49 ClearPageError(page);
50 }
51 unlock_page(page);
52 return err;
53}
54
55static DECLARE_WAIT_QUEUE_HEAD(wq);
56
57static void writeseg_end_io(struct bio *bio)
58{
59 struct bio_vec *bvec;
60 int i;
61 struct super_block *sb = bio->bi_private;
62 struct logfs_super *super = logfs_super(sb);
63
64 BUG_ON(bio->bi_error); /* FIXME: Retry io or write elsewhere */
65
66 bio_for_each_segment_all(bvec, bio, i) {
67 end_page_writeback(bvec->bv_page);
68 put_page(bvec->bv_page);
69 }
70 bio_put(bio);
71 if (atomic_dec_and_test(&super->s_pending_writes))
72 wake_up(&wq);
73}
74
75static int __bdev_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
76 size_t nr_pages)
77{
78 struct logfs_super *super = logfs_super(sb);
79 struct address_space *mapping = super->s_mapping_inode->i_mapping;
80 struct bio *bio;
81 struct page *page;
82 unsigned int max_pages;
83 int i;
84
85 max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES);
86
87 bio = bio_alloc(GFP_NOFS, max_pages);
88 BUG_ON(!bio);
89
90 for (i = 0; i < nr_pages; i++) {
91 if (i >= max_pages) {
92 /* Block layer cannot split bios :( */
93 bio->bi_vcnt = i;
94 bio->bi_iter.bi_size = i * PAGE_SIZE;
95 bio->bi_bdev = super->s_bdev;
96 bio->bi_iter.bi_sector = ofs >> 9;
97 bio->bi_private = sb;
98 bio->bi_end_io = writeseg_end_io;
99 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
100 atomic_inc(&super->s_pending_writes);
101 submit_bio(bio);
102
103 ofs += i * PAGE_SIZE;
104 index += i;
105 nr_pages -= i;
106 i = 0;
107
108 bio = bio_alloc(GFP_NOFS, max_pages);
109 BUG_ON(!bio);
110 }
111 page = find_lock_page(mapping, index + i);
112 BUG_ON(!page);
113 bio->bi_io_vec[i].bv_page = page;
114 bio->bi_io_vec[i].bv_len = PAGE_SIZE;
115 bio->bi_io_vec[i].bv_offset = 0;
116
117 BUG_ON(PageWriteback(page));
118 set_page_writeback(page);
119 unlock_page(page);
120 }
121 bio->bi_vcnt = nr_pages;
122 bio->bi_iter.bi_size = nr_pages * PAGE_SIZE;
123 bio->bi_bdev = super->s_bdev;
124 bio->bi_iter.bi_sector = ofs >> 9;
125 bio->bi_private = sb;
126 bio->bi_end_io = writeseg_end_io;
127 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
128 atomic_inc(&super->s_pending_writes);
129 submit_bio(bio);
130 return 0;
131}
132
133static void bdev_writeseg(struct super_block *sb, u64 ofs, size_t len)
134{
135 struct logfs_super *super = logfs_super(sb);
136 int head;
137
138 BUG_ON(super->s_flags & LOGFS_SB_FLAG_RO);
139
140 if (len == 0) {
141 /* This can happen when the object fit perfectly into a
142 * segment, the segment gets written per sync and subsequently
143 * closed.
144 */
145 return;
146 }
147 head = ofs & (PAGE_SIZE - 1);
148 if (head) {
149 ofs -= head;
150 len += head;
151 }
152 len = PAGE_ALIGN(len);
153 __bdev_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
154}
155
156
157static void erase_end_io(struct bio *bio)
158{
159 struct super_block *sb = bio->bi_private;
160 struct logfs_super *super = logfs_super(sb);
161
162 BUG_ON(bio->bi_error); /* FIXME: Retry io or write elsewhere */
163 BUG_ON(bio->bi_vcnt == 0);
164 bio_put(bio);
165 if (atomic_dec_and_test(&super->s_pending_writes))
166 wake_up(&wq);
167}
168
169static int do_erase(struct super_block *sb, u64 ofs, pgoff_t index,
170 size_t nr_pages)
171{
172 struct logfs_super *super = logfs_super(sb);
173 struct bio *bio;
174 unsigned int max_pages;
175 int i;
176
177 max_pages = min_t(size_t, nr_pages, BIO_MAX_PAGES);
178
179 bio = bio_alloc(GFP_NOFS, max_pages);
180 BUG_ON(!bio);
181
182 for (i = 0; i < nr_pages; i++) {
183 if (i >= max_pages) {
184 /* Block layer cannot split bios :( */
185 bio->bi_vcnt = i;
186 bio->bi_iter.bi_size = i * PAGE_SIZE;
187 bio->bi_bdev = super->s_bdev;
188 bio->bi_iter.bi_sector = ofs >> 9;
189 bio->bi_private = sb;
190 bio->bi_end_io = erase_end_io;
191 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
192 atomic_inc(&super->s_pending_writes);
193 submit_bio(bio);
194
195 ofs += i * PAGE_SIZE;
196 index += i;
197 nr_pages -= i;
198 i = 0;
199
200 bio = bio_alloc(GFP_NOFS, max_pages);
201 BUG_ON(!bio);
202 }
203 bio->bi_io_vec[i].bv_page = super->s_erase_page;
204 bio->bi_io_vec[i].bv_len = PAGE_SIZE;
205 bio->bi_io_vec[i].bv_offset = 0;
206 }
207 bio->bi_vcnt = nr_pages;
208 bio->bi_iter.bi_size = nr_pages * PAGE_SIZE;
209 bio->bi_bdev = super->s_bdev;
210 bio->bi_iter.bi_sector = ofs >> 9;
211 bio->bi_private = sb;
212 bio->bi_end_io = erase_end_io;
213 bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
214 atomic_inc(&super->s_pending_writes);
215 submit_bio(bio);
216 return 0;
217}
218
219static int bdev_erase(struct super_block *sb, loff_t to, size_t len,
220 int ensure_write)
221{
222 struct logfs_super *super = logfs_super(sb);
223
224 BUG_ON(to & (PAGE_SIZE - 1));
225 BUG_ON(len & (PAGE_SIZE - 1));
226
227 if (super->s_flags & LOGFS_SB_FLAG_RO)
228 return -EROFS;
229
230 if (ensure_write) {
231 /*
232 * Object store doesn't care whether erases happen or not.
233 * But for the journal they are required. Otherwise a scan
234 * can find an old commit entry and assume it is the current
235 * one, travelling back in time.
236 */
237 do_erase(sb, to, to >> PAGE_SHIFT, len >> PAGE_SHIFT);
238 }
239
240 return 0;
241}
242
243static void bdev_sync(struct super_block *sb)
244{
245 struct logfs_super *super = logfs_super(sb);
246
247 wait_event(wq, atomic_read(&super->s_pending_writes) == 0);
248}
249
250static struct page *bdev_find_first_sb(struct super_block *sb, u64 *ofs)
251{
252 struct logfs_super *super = logfs_super(sb);
253 struct address_space *mapping = super->s_mapping_inode->i_mapping;
254 filler_t *filler = bdev_readpage;
255
256 *ofs = 0;
257 return read_cache_page(mapping, 0, filler, sb);
258}
259
260static struct page *bdev_find_last_sb(struct super_block *sb, u64 *ofs)
261{
262 struct logfs_super *super = logfs_super(sb);
263 struct address_space *mapping = super->s_mapping_inode->i_mapping;
264 filler_t *filler = bdev_readpage;
265 u64 pos = (super->s_bdev->bd_inode->i_size & ~0xfffULL) - 0x1000;
266 pgoff_t index = pos >> PAGE_SHIFT;
267
268 *ofs = pos;
269 return read_cache_page(mapping, index, filler, sb);
270}
271
272static int bdev_write_sb(struct super_block *sb, struct page *page)
273{
274 struct block_device *bdev = logfs_super(sb)->s_bdev;
275
276 /* Nothing special to do for block devices. */
277 return sync_request(page, bdev, WRITE);
278}
279
280static void bdev_put_device(struct logfs_super *s)
281{
282 blkdev_put(s->s_bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
283}
284
285static int bdev_can_write_buf(struct super_block *sb, u64 ofs)
286{
287 return 0;
288}
289
290static const struct logfs_device_ops bd_devops = {
291 .find_first_sb = bdev_find_first_sb,
292 .find_last_sb = bdev_find_last_sb,
293 .write_sb = bdev_write_sb,
294 .readpage = bdev_readpage,
295 .writeseg = bdev_writeseg,
296 .erase = bdev_erase,
297 .can_write_buf = bdev_can_write_buf,
298 .sync = bdev_sync,
299 .put_device = bdev_put_device,
300};
301
302int logfs_get_sb_bdev(struct logfs_super *p, struct file_system_type *type,
303 const char *devname)
304{
305 struct block_device *bdev;
306
307 bdev = blkdev_get_by_path(devname, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
308 type);
309 if (IS_ERR(bdev))
310 return PTR_ERR(bdev);
311
312 if (MAJOR(bdev->bd_dev) == MTD_BLOCK_MAJOR) {
313 int mtdnr = MINOR(bdev->bd_dev);
314 blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
315 return logfs_get_sb_mtd(p, mtdnr);
316 }
317
318 p->s_bdev = bdev;
319 p->s_mtd = NULL;
320 p->s_devops = &bd_devops;
321 return 0;
322}
diff --git a/fs/logfs/dev_mtd.c b/fs/logfs/dev_mtd.c
deleted file mode 100644
index b76a62b1978f..000000000000
--- a/fs/logfs/dev_mtd.c
+++ /dev/null
@@ -1,274 +0,0 @@
1/*
2 * fs/logfs/dev_mtd.c - Device access methods for MTD
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/completion.h>
10#include <linux/mount.h>
11#include <linux/sched.h>
12#include <linux/slab.h>
13
14#define PAGE_OFS(ofs) ((ofs) & (PAGE_SIZE-1))
15
16static int logfs_mtd_read(struct super_block *sb, loff_t ofs, size_t len,
17 void *buf)
18{
19 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
20 size_t retlen;
21 int ret;
22
23 ret = mtd_read(mtd, ofs, len, &retlen, buf);
24 BUG_ON(ret == -EINVAL);
25 if (ret)
26 return ret;
27
28 /* Not sure if we should loop instead. */
29 if (retlen != len)
30 return -EIO;
31
32 return 0;
33}
34
35static int loffs_mtd_write(struct super_block *sb, loff_t ofs, size_t len,
36 void *buf)
37{
38 struct logfs_super *super = logfs_super(sb);
39 struct mtd_info *mtd = super->s_mtd;
40 size_t retlen;
41 loff_t page_start, page_end;
42 int ret;
43
44 if (super->s_flags & LOGFS_SB_FLAG_RO)
45 return -EROFS;
46
47 BUG_ON((ofs >= mtd->size) || (len > mtd->size - ofs));
48 BUG_ON(ofs != (ofs >> super->s_writeshift) << super->s_writeshift);
49 BUG_ON(len > PAGE_SIZE);
50 page_start = ofs & PAGE_MASK;
51 page_end = PAGE_ALIGN(ofs + len) - 1;
52 ret = mtd_write(mtd, ofs, len, &retlen, buf);
53 if (ret || (retlen != len))
54 return -EIO;
55
56 return 0;
57}
58
59/*
60 * For as long as I can remember (since about 2001) mtd->erase has been an
61 * asynchronous interface lacking the first driver to actually use the
62 * asynchronous properties. So just to prevent the first implementor of such
63 * a thing from breaking logfs in 2350, we do the usual pointless dance to
64 * declare a completion variable and wait for completion before returning
65 * from logfs_mtd_erase(). What an exercise in futility!
66 */
67static void logfs_erase_callback(struct erase_info *ei)
68{
69 complete((struct completion *)ei->priv);
70}
71
72static int logfs_mtd_erase_mapping(struct super_block *sb, loff_t ofs,
73 size_t len)
74{
75 struct logfs_super *super = logfs_super(sb);
76 struct address_space *mapping = super->s_mapping_inode->i_mapping;
77 struct page *page;
78 pgoff_t index = ofs >> PAGE_SHIFT;
79
80 for (index = ofs >> PAGE_SHIFT; index < (ofs + len) >> PAGE_SHIFT; index++) {
81 page = find_get_page(mapping, index);
82 if (!page)
83 continue;
84 memset(page_address(page), 0xFF, PAGE_SIZE);
85 put_page(page);
86 }
87 return 0;
88}
89
90static int logfs_mtd_erase(struct super_block *sb, loff_t ofs, size_t len,
91 int ensure_write)
92{
93 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
94 struct erase_info ei;
95 DECLARE_COMPLETION_ONSTACK(complete);
96 int ret;
97
98 BUG_ON(len % mtd->erasesize);
99 if (logfs_super(sb)->s_flags & LOGFS_SB_FLAG_RO)
100 return -EROFS;
101
102 memset(&ei, 0, sizeof(ei));
103 ei.mtd = mtd;
104 ei.addr = ofs;
105 ei.len = len;
106 ei.callback = logfs_erase_callback;
107 ei.priv = (long)&complete;
108 ret = mtd_erase(mtd, &ei);
109 if (ret)
110 return -EIO;
111
112 wait_for_completion(&complete);
113 if (ei.state != MTD_ERASE_DONE)
114 return -EIO;
115 return logfs_mtd_erase_mapping(sb, ofs, len);
116}
117
118static void logfs_mtd_sync(struct super_block *sb)
119{
120 struct mtd_info *mtd = logfs_super(sb)->s_mtd;
121
122 mtd_sync(mtd);
123}
124
125static int logfs_mtd_readpage(void *_sb, struct page *page)
126{
127 struct super_block *sb = _sb;
128 int err;
129
130 err = logfs_mtd_read(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
131 page_address(page));
132 if (err == -EUCLEAN || err == -EBADMSG) {
133 /* -EBADMSG happens regularly on power failures */
134 err = 0;
135 /* FIXME: force GC this segment */
136 }
137 if (err) {
138 ClearPageUptodate(page);
139 SetPageError(page);
140 } else {
141 SetPageUptodate(page);
142 ClearPageError(page);
143 }
144 unlock_page(page);
145 return err;
146}
147
148static struct page *logfs_mtd_find_first_sb(struct super_block *sb, u64 *ofs)
149{
150 struct logfs_super *super = logfs_super(sb);
151 struct address_space *mapping = super->s_mapping_inode->i_mapping;
152 filler_t *filler = logfs_mtd_readpage;
153 struct mtd_info *mtd = super->s_mtd;
154
155 *ofs = 0;
156 while (mtd_block_isbad(mtd, *ofs)) {
157 *ofs += mtd->erasesize;
158 if (*ofs >= mtd->size)
159 return NULL;
160 }
161 BUG_ON(*ofs & ~PAGE_MASK);
162 return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
163}
164
165static struct page *logfs_mtd_find_last_sb(struct super_block *sb, u64 *ofs)
166{
167 struct logfs_super *super = logfs_super(sb);
168 struct address_space *mapping = super->s_mapping_inode->i_mapping;
169 filler_t *filler = logfs_mtd_readpage;
170 struct mtd_info *mtd = super->s_mtd;
171
172 *ofs = mtd->size - mtd->erasesize;
173 while (mtd_block_isbad(mtd, *ofs)) {
174 *ofs -= mtd->erasesize;
175 if (*ofs <= 0)
176 return NULL;
177 }
178 *ofs = *ofs + mtd->erasesize - 0x1000;
179 BUG_ON(*ofs & ~PAGE_MASK);
180 return read_cache_page(mapping, *ofs >> PAGE_SHIFT, filler, sb);
181}
182
183static int __logfs_mtd_writeseg(struct super_block *sb, u64 ofs, pgoff_t index,
184 size_t nr_pages)
185{
186 struct logfs_super *super = logfs_super(sb);
187 struct address_space *mapping = super->s_mapping_inode->i_mapping;
188 struct page *page;
189 int i, err;
190
191 for (i = 0; i < nr_pages; i++) {
192 page = find_lock_page(mapping, index + i);
193 BUG_ON(!page);
194
195 err = loffs_mtd_write(sb, page->index << PAGE_SHIFT, PAGE_SIZE,
196 page_address(page));
197 unlock_page(page);
198 put_page(page);
199 if (err)
200 return err;
201 }
202 return 0;
203}
204
205static void logfs_mtd_writeseg(struct super_block *sb, u64 ofs, size_t len)
206{
207 struct logfs_super *super = logfs_super(sb);
208 int head;
209
210 if (super->s_flags & LOGFS_SB_FLAG_RO)
211 return;
212
213 if (len == 0) {
214 /* This can happen when the object fit perfectly into a
215 * segment, the segment gets written per sync and subsequently
216 * closed.
217 */
218 return;
219 }
220 head = ofs & (PAGE_SIZE - 1);
221 if (head) {
222 ofs -= head;
223 len += head;
224 }
225 len = PAGE_ALIGN(len);
226 __logfs_mtd_writeseg(sb, ofs, ofs >> PAGE_SHIFT, len >> PAGE_SHIFT);
227}
228
229static void logfs_mtd_put_device(struct logfs_super *s)
230{
231 put_mtd_device(s->s_mtd);
232}
233
234static int logfs_mtd_can_write_buf(struct super_block *sb, u64 ofs)
235{
236 struct logfs_super *super = logfs_super(sb);
237 void *buf;
238 int err;
239
240 buf = kmalloc(super->s_writesize, GFP_KERNEL);
241 if (!buf)
242 return -ENOMEM;
243 err = logfs_mtd_read(sb, ofs, super->s_writesize, buf);
244 if (err)
245 goto out;
246 if (memchr_inv(buf, 0xff, super->s_writesize))
247 err = -EIO;
248 kfree(buf);
249out:
250 return err;
251}
252
253static const struct logfs_device_ops mtd_devops = {
254 .find_first_sb = logfs_mtd_find_first_sb,
255 .find_last_sb = logfs_mtd_find_last_sb,
256 .readpage = logfs_mtd_readpage,
257 .writeseg = logfs_mtd_writeseg,
258 .erase = logfs_mtd_erase,
259 .can_write_buf = logfs_mtd_can_write_buf,
260 .sync = logfs_mtd_sync,
261 .put_device = logfs_mtd_put_device,
262};
263
264int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
265{
266 struct mtd_info *mtd = get_mtd_device(NULL, mtdnr);
267 if (IS_ERR(mtd))
268 return PTR_ERR(mtd);
269
270 s->s_bdev = NULL;
271 s->s_mtd = mtd;
272 s->s_devops = &mtd_devops;
273 return 0;
274}
diff --git a/fs/logfs/dir.c b/fs/logfs/dir.c
deleted file mode 100644
index c87ea52de3d9..000000000000
--- a/fs/logfs/dir.c
+++ /dev/null
@@ -1,801 +0,0 @@
1/*
2 * fs/logfs/dir.c - directory-related code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/slab.h>
10
11/*
12 * Atomic dir operations
13 *
14 * Directory operations are by default not atomic. Dentries and Inodes are
15 * created/removed/altered in separate operations. Therefore we need to do
16 * a small amount of journaling.
17 *
18 * Create, link, mkdir, mknod and symlink all share the same function to do
19 * the work: __logfs_create. This function works in two atomic steps:
20 * 1. allocate inode (remember in journal)
21 * 2. allocate dentry (clear journal)
22 *
23 * As we can only get interrupted between the two, when the inode we just
24 * created is simply stored in the anchor. On next mount, if we were
25 * interrupted, we delete the inode. From a users point of view the
26 * operation never happened.
27 *
28 * Unlink and rmdir also share the same function: unlink. Again, this
29 * function works in two atomic steps
30 * 1. remove dentry (remember inode in journal)
31 * 2. unlink inode (clear journal)
32 *
33 * And again, on the next mount, if we were interrupted, we delete the inode.
34 * From a users point of view the operation succeeded.
35 *
36 * Rename is the real pain to deal with, harder than all the other methods
37 * combined. Depending on the circumstances we can run into three cases.
38 * A "target rename" where the target dentry already existed, a "local
39 * rename" where both parent directories are identical or a "cross-directory
40 * rename" in the remaining case.
41 *
42 * Local rename is atomic, as the old dentry is simply rewritten with a new
43 * name.
44 *
45 * Cross-directory rename works in two steps, similar to __logfs_create and
46 * logfs_unlink:
47 * 1. Write new dentry (remember old dentry in journal)
48 * 2. Remove old dentry (clear journal)
49 *
50 * Here we remember a dentry instead of an inode. On next mount, if we were
51 * interrupted, we delete the dentry. From a users point of view, the
52 * operation succeeded.
53 *
54 * Target rename works in three atomic steps:
55 * 1. Attach old inode to new dentry (remember old dentry and new inode)
56 * 2. Remove old dentry (still remember the new inode)
57 * 3. Remove victim inode
58 *
59 * Here we remember both an inode an a dentry. If we get interrupted
60 * between steps 1 and 2, we delete both the dentry and the inode. If
61 * we get interrupted between steps 2 and 3, we delete just the inode.
62 * In either case, the remaining objects are deleted on next mount. From
63 * a users point of view, the operation succeeded.
64 */
65
66static int write_dir(struct inode *dir, struct logfs_disk_dentry *dd,
67 loff_t pos)
68{
69 return logfs_inode_write(dir, dd, sizeof(*dd), pos, WF_LOCK, NULL);
70}
71
72static int write_inode(struct inode *inode)
73{
74 return __logfs_write_inode(inode, NULL, WF_LOCK);
75}
76
77static s64 dir_seek_data(struct inode *inode, s64 pos)
78{
79 s64 new_pos = logfs_seek_data(inode, pos);
80
81 return max(pos, new_pos - 1);
82}
83
84static int beyond_eof(struct inode *inode, loff_t bix)
85{
86 loff_t pos = bix << inode->i_sb->s_blocksize_bits;
87 return pos >= i_size_read(inode);
88}
89
90/*
91 * Prime value was chosen to be roughly 256 + 26. r5 hash uses 11,
92 * so short names (len <= 9) don't even occupy the complete 32bit name
93 * space. A prime >256 ensures short names quickly spread the 32bit
94 * name space. Add about 26 for the estimated amount of information
95 * of each character and pick a prime nearby, preferably a bit-sparse
96 * one.
97 */
98static u32 logfs_hash_32(const char *s, int len, u32 seed)
99{
100 u32 hash = seed;
101 int i;
102
103 for (i = 0; i < len; i++)
104 hash = hash * 293 + s[i];
105 return hash;
106}
107
108/*
109 * We have to satisfy several conflicting requirements here. Small
110 * directories should stay fairly compact and not require too many
111 * indirect blocks. The number of possible locations for a given hash
112 * should be small to make lookup() fast. And we should try hard not
113 * to overflow the 32bit name space or nfs and 32bit host systems will
114 * be unhappy.
115 *
116 * So we use the following scheme. First we reduce the hash to 0..15
117 * and try a direct block. If that is occupied we reduce the hash to
118 * 16..255 and try an indirect block. Same for 2x and 3x indirect
119 * blocks. Lastly we reduce the hash to 0x800_0000 .. 0xffff_ffff,
120 * but use buckets containing eight entries instead of a single one.
121 *
122 * Using 16 entries should allow for a reasonable amount of hash
123 * collisions, so the 32bit name space can be packed fairly tight
124 * before overflowing. Oh and currently we don't overflow but return
125 * and error.
126 *
127 * How likely are collisions? Doing the appropriate math is beyond me
128 * and the Bronstein textbook. But running a test program to brute
129 * force collisions for a couple of days showed that on average the
130 * first collision occurs after 598M entries, with 290M being the
131 * smallest result. Obviously 21 entries could already cause a
132 * collision if all entries are carefully chosen.
133 */
134static pgoff_t hash_index(u32 hash, int round)
135{
136 u32 i0_blocks = I0_BLOCKS;
137 u32 i1_blocks = I1_BLOCKS;
138 u32 i2_blocks = I2_BLOCKS;
139 u32 i3_blocks = I3_BLOCKS;
140
141 switch (round) {
142 case 0:
143 return hash % i0_blocks;
144 case 1:
145 return i0_blocks + hash % (i1_blocks - i0_blocks);
146 case 2:
147 return i1_blocks + hash % (i2_blocks - i1_blocks);
148 case 3:
149 return i2_blocks + hash % (i3_blocks - i2_blocks);
150 case 4 ... 19:
151 return i3_blocks + 16 * (hash % (((1<<31) - i3_blocks) / 16))
152 + round - 4;
153 }
154 BUG();
155}
156
157static struct page *logfs_get_dd_page(struct inode *dir, struct dentry *dentry)
158{
159 const struct qstr *name = &dentry->d_name;
160 struct page *page;
161 struct logfs_disk_dentry *dd;
162 u32 hash = logfs_hash_32(name->name, name->len, 0);
163 pgoff_t index;
164 int round;
165
166 if (name->len > LOGFS_MAX_NAMELEN)
167 return ERR_PTR(-ENAMETOOLONG);
168
169 for (round = 0; round < 20; round++) {
170 index = hash_index(hash, round);
171
172 if (beyond_eof(dir, index))
173 return NULL;
174 if (!logfs_exist_block(dir, index))
175 continue;
176 page = read_cache_page(dir->i_mapping, index,
177 (filler_t *)logfs_readpage, NULL);
178 if (IS_ERR(page))
179 return page;
180 dd = kmap_atomic(page);
181 BUG_ON(dd->namelen == 0);
182
183 if (name->len != be16_to_cpu(dd->namelen) ||
184 memcmp(name->name, dd->name, name->len)) {
185 kunmap_atomic(dd);
186 put_page(page);
187 continue;
188 }
189
190 kunmap_atomic(dd);
191 return page;
192 }
193 return NULL;
194}
195
196static int logfs_remove_inode(struct inode *inode)
197{
198 int ret;
199
200 drop_nlink(inode);
201 ret = write_inode(inode);
202 LOGFS_BUG_ON(ret, inode->i_sb);
203 return ret;
204}
205
206static void abort_transaction(struct inode *inode, struct logfs_transaction *ta)
207{
208 if (logfs_inode(inode)->li_block)
209 logfs_inode(inode)->li_block->ta = NULL;
210 kfree(ta);
211}
212
213static int logfs_unlink(struct inode *dir, struct dentry *dentry)
214{
215 struct logfs_super *super = logfs_super(dir->i_sb);
216 struct inode *inode = d_inode(dentry);
217 struct logfs_transaction *ta;
218 struct page *page;
219 pgoff_t index;
220 int ret;
221
222 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
223 if (!ta)
224 return -ENOMEM;
225
226 ta->state = UNLINK_1;
227 ta->ino = inode->i_ino;
228
229 inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
230
231 page = logfs_get_dd_page(dir, dentry);
232 if (!page) {
233 kfree(ta);
234 return -ENOENT;
235 }
236 if (IS_ERR(page)) {
237 kfree(ta);
238 return PTR_ERR(page);
239 }
240 index = page->index;
241 put_page(page);
242
243 mutex_lock(&super->s_dirop_mutex);
244 logfs_add_transaction(dir, ta);
245
246 ret = logfs_delete(dir, index, NULL);
247 if (!ret)
248 ret = write_inode(dir);
249
250 if (ret) {
251 abort_transaction(dir, ta);
252 printk(KERN_ERR"LOGFS: unable to delete inode\n");
253 goto out;
254 }
255
256 ta->state = UNLINK_2;
257 logfs_add_transaction(inode, ta);
258 ret = logfs_remove_inode(inode);
259out:
260 mutex_unlock(&super->s_dirop_mutex);
261 return ret;
262}
263
264static inline int logfs_empty_dir(struct inode *dir)
265{
266 u64 data;
267
268 data = logfs_seek_data(dir, 0) << dir->i_sb->s_blocksize_bits;
269 return data >= i_size_read(dir);
270}
271
272static int logfs_rmdir(struct inode *dir, struct dentry *dentry)
273{
274 struct inode *inode = d_inode(dentry);
275
276 if (!logfs_empty_dir(inode))
277 return -ENOTEMPTY;
278
279 return logfs_unlink(dir, dentry);
280}
281
282/* FIXME: readdir currently has it's own dir_walk code. I don't see a good
283 * way to combine the two copies */
284static int logfs_readdir(struct file *file, struct dir_context *ctx)
285{
286 struct inode *dir = file_inode(file);
287 loff_t pos;
288 struct page *page;
289 struct logfs_disk_dentry *dd;
290
291 if (ctx->pos < 0)
292 return -EINVAL;
293
294 if (!dir_emit_dots(file, ctx))
295 return 0;
296
297 pos = ctx->pos - 2;
298 BUG_ON(pos < 0);
299 for (;; pos++, ctx->pos++) {
300 bool full;
301 if (beyond_eof(dir, pos))
302 break;
303 if (!logfs_exist_block(dir, pos)) {
304 /* deleted dentry */
305 pos = dir_seek_data(dir, pos);
306 continue;
307 }
308 page = read_cache_page(dir->i_mapping, pos,
309 (filler_t *)logfs_readpage, NULL);
310 if (IS_ERR(page))
311 return PTR_ERR(page);
312 dd = kmap(page);
313 BUG_ON(dd->namelen == 0);
314
315 full = !dir_emit(ctx, (char *)dd->name,
316 be16_to_cpu(dd->namelen),
317 be64_to_cpu(dd->ino), dd->type);
318 kunmap(page);
319 put_page(page);
320 if (full)
321 break;
322 }
323 return 0;
324}
325
326static void logfs_set_name(struct logfs_disk_dentry *dd, const struct qstr *name)
327{
328 dd->namelen = cpu_to_be16(name->len);
329 memcpy(dd->name, name->name, name->len);
330}
331
332static struct dentry *logfs_lookup(struct inode *dir, struct dentry *dentry,
333 unsigned int flags)
334{
335 struct page *page;
336 struct logfs_disk_dentry *dd;
337 pgoff_t index;
338 u64 ino = 0;
339 struct inode *inode;
340
341 page = logfs_get_dd_page(dir, dentry);
342 if (IS_ERR(page))
343 return ERR_CAST(page);
344 if (!page) {
345 d_add(dentry, NULL);
346 return NULL;
347 }
348 index = page->index;
349 dd = kmap_atomic(page);
350 ino = be64_to_cpu(dd->ino);
351 kunmap_atomic(dd);
352 put_page(page);
353
354 inode = logfs_iget(dir->i_sb, ino);
355 if (IS_ERR(inode))
356 printk(KERN_ERR"LogFS: Cannot read inode #%llx for dentry (%lx, %lx)n",
357 ino, dir->i_ino, index);
358 return d_splice_alias(inode, dentry);
359}
360
361static void grow_dir(struct inode *dir, loff_t index)
362{
363 index = (index + 1) << dir->i_sb->s_blocksize_bits;
364 if (i_size_read(dir) < index)
365 i_size_write(dir, index);
366}
367
368static int logfs_write_dir(struct inode *dir, struct dentry *dentry,
369 struct inode *inode)
370{
371 struct page *page;
372 struct logfs_disk_dentry *dd;
373 u32 hash = logfs_hash_32(dentry->d_name.name, dentry->d_name.len, 0);
374 pgoff_t index;
375 int round, err;
376
377 for (round = 0; round < 20; round++) {
378 index = hash_index(hash, round);
379
380 if (logfs_exist_block(dir, index))
381 continue;
382 page = find_or_create_page(dir->i_mapping, index, GFP_KERNEL);
383 if (!page)
384 return -ENOMEM;
385
386 dd = kmap_atomic(page);
387 memset(dd, 0, sizeof(*dd));
388 dd->ino = cpu_to_be64(inode->i_ino);
389 dd->type = logfs_type(inode);
390 logfs_set_name(dd, &dentry->d_name);
391 kunmap_atomic(dd);
392
393 err = logfs_write_buf(dir, page, WF_LOCK);
394 unlock_page(page);
395 put_page(page);
396 if (!err)
397 grow_dir(dir, index);
398 return err;
399 }
400 /* FIXME: Is there a better return value? In most cases neither
401 * the filesystem nor the directory are full. But we have had
402 * too many collisions for this particular hash and no fallback.
403 */
404 return -ENOSPC;
405}
406
407static int __logfs_create(struct inode *dir, struct dentry *dentry,
408 struct inode *inode, const char *dest, long destlen)
409{
410 struct logfs_super *super = logfs_super(dir->i_sb);
411 struct logfs_inode *li = logfs_inode(inode);
412 struct logfs_transaction *ta;
413 int ret;
414
415 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
416 if (!ta) {
417 drop_nlink(inode);
418 iput(inode);
419 return -ENOMEM;
420 }
421
422 ta->state = CREATE_1;
423 ta->ino = inode->i_ino;
424 mutex_lock(&super->s_dirop_mutex);
425 logfs_add_transaction(inode, ta);
426
427 if (dest) {
428 /* symlink */
429 ret = logfs_inode_write(inode, dest, destlen, 0, WF_LOCK, NULL);
430 if (!ret)
431 ret = write_inode(inode);
432 } else {
433 /* creat/mkdir/mknod */
434 ret = write_inode(inode);
435 }
436 if (ret) {
437 abort_transaction(inode, ta);
438 li->li_flags |= LOGFS_IF_STILLBORN;
439 /* FIXME: truncate symlink */
440 drop_nlink(inode);
441 iput(inode);
442 goto out;
443 }
444
445 ta->state = CREATE_2;
446 logfs_add_transaction(dir, ta);
447 ret = logfs_write_dir(dir, dentry, inode);
448 /* sync directory */
449 if (!ret)
450 ret = write_inode(dir);
451
452 if (ret) {
453 logfs_del_transaction(dir, ta);
454 ta->state = CREATE_2;
455 logfs_add_transaction(inode, ta);
456 logfs_remove_inode(inode);
457 iput(inode);
458 goto out;
459 }
460 d_instantiate(dentry, inode);
461out:
462 mutex_unlock(&super->s_dirop_mutex);
463 return ret;
464}
465
466static int logfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode)
467{
468 struct inode *inode;
469
470 /*
471 * FIXME: why do we have to fill in S_IFDIR, while the mode is
472 * correct for mknod, creat, etc.? Smells like the vfs *should*
473 * do it for us but for some reason fails to do so.
474 */
475 inode = logfs_new_inode(dir, S_IFDIR | mode);
476 if (IS_ERR(inode))
477 return PTR_ERR(inode);
478
479 inode->i_op = &logfs_dir_iops;
480 inode->i_fop = &logfs_dir_fops;
481
482 return __logfs_create(dir, dentry, inode, NULL, 0);
483}
484
485static int logfs_create(struct inode *dir, struct dentry *dentry, umode_t mode,
486 bool excl)
487{
488 struct inode *inode;
489
490 inode = logfs_new_inode(dir, mode);
491 if (IS_ERR(inode))
492 return PTR_ERR(inode);
493
494 inode->i_op = &logfs_reg_iops;
495 inode->i_fop = &logfs_reg_fops;
496 inode->i_mapping->a_ops = &logfs_reg_aops;
497
498 return __logfs_create(dir, dentry, inode, NULL, 0);
499}
500
501static int logfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode,
502 dev_t rdev)
503{
504 struct inode *inode;
505
506 if (dentry->d_name.len > LOGFS_MAX_NAMELEN)
507 return -ENAMETOOLONG;
508
509 inode = logfs_new_inode(dir, mode);
510 if (IS_ERR(inode))
511 return PTR_ERR(inode);
512
513 init_special_inode(inode, mode, rdev);
514
515 return __logfs_create(dir, dentry, inode, NULL, 0);
516}
517
518static int logfs_symlink(struct inode *dir, struct dentry *dentry,
519 const char *target)
520{
521 struct inode *inode;
522 size_t destlen = strlen(target) + 1;
523
524 if (destlen > dir->i_sb->s_blocksize)
525 return -ENAMETOOLONG;
526
527 inode = logfs_new_inode(dir, S_IFLNK | 0777);
528 if (IS_ERR(inode))
529 return PTR_ERR(inode);
530
531 inode->i_op = &page_symlink_inode_operations;
532 inode_nohighmem(inode);
533 inode->i_mapping->a_ops = &logfs_reg_aops;
534
535 return __logfs_create(dir, dentry, inode, target, destlen);
536}
537
538static int logfs_link(struct dentry *old_dentry, struct inode *dir,
539 struct dentry *dentry)
540{
541 struct inode *inode = d_inode(old_dentry);
542
543 inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode);
544 ihold(inode);
545 inc_nlink(inode);
546 mark_inode_dirty_sync(inode);
547
548 return __logfs_create(dir, dentry, inode, NULL, 0);
549}
550
551static int logfs_get_dd(struct inode *dir, struct dentry *dentry,
552 struct logfs_disk_dentry *dd, loff_t *pos)
553{
554 struct page *page;
555 void *map;
556
557 page = logfs_get_dd_page(dir, dentry);
558 if (IS_ERR(page))
559 return PTR_ERR(page);
560 *pos = page->index;
561 map = kmap_atomic(page);
562 memcpy(dd, map, sizeof(*dd));
563 kunmap_atomic(map);
564 put_page(page);
565 return 0;
566}
567
568static int logfs_delete_dd(struct inode *dir, loff_t pos)
569{
570 /*
571 * Getting called with pos somewhere beyond eof is either a goofup
572 * within this file or means someone maliciously edited the
573 * (crc-protected) journal.
574 */
575 BUG_ON(beyond_eof(dir, pos));
576 dir->i_ctime = dir->i_mtime = current_time(dir);
577 log_dir(" Delete dentry (%lx, %llx)\n", dir->i_ino, pos);
578 return logfs_delete(dir, pos, NULL);
579}
580
581/*
582 * Cross-directory rename, target does not exist. Just a little nasty.
583 * Create a new dentry in the target dir, then remove the old dentry,
584 * all the while taking care to remember our operation in the journal.
585 */
586static int logfs_rename_cross(struct inode *old_dir, struct dentry *old_dentry,
587 struct inode *new_dir, struct dentry *new_dentry)
588{
589 struct logfs_super *super = logfs_super(old_dir->i_sb);
590 struct logfs_disk_dentry dd;
591 struct logfs_transaction *ta;
592 loff_t pos;
593 int err;
594
595 /* 1. locate source dd */
596 err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
597 if (err)
598 return err;
599
600 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
601 if (!ta)
602 return -ENOMEM;
603
604 ta->state = CROSS_RENAME_1;
605 ta->dir = old_dir->i_ino;
606 ta->pos = pos;
607
608 /* 2. write target dd */
609 mutex_lock(&super->s_dirop_mutex);
610 logfs_add_transaction(new_dir, ta);
611 err = logfs_write_dir(new_dir, new_dentry, d_inode(old_dentry));
612 if (!err)
613 err = write_inode(new_dir);
614
615 if (err) {
616 super->s_rename_dir = 0;
617 super->s_rename_pos = 0;
618 abort_transaction(new_dir, ta);
619 goto out;
620 }
621
622 /* 3. remove source dd */
623 ta->state = CROSS_RENAME_2;
624 logfs_add_transaction(old_dir, ta);
625 err = logfs_delete_dd(old_dir, pos);
626 if (!err)
627 err = write_inode(old_dir);
628 LOGFS_BUG_ON(err, old_dir->i_sb);
629out:
630 mutex_unlock(&super->s_dirop_mutex);
631 return err;
632}
633
634static int logfs_replace_inode(struct inode *dir, struct dentry *dentry,
635 struct logfs_disk_dentry *dd, struct inode *inode)
636{
637 loff_t pos;
638 int err;
639
640 err = logfs_get_dd(dir, dentry, dd, &pos);
641 if (err)
642 return err;
643 dd->ino = cpu_to_be64(inode->i_ino);
644 dd->type = logfs_type(inode);
645
646 err = write_dir(dir, dd, pos);
647 if (err)
648 return err;
649 log_dir("Replace dentry (%lx, %llx) %s -> %llx\n", dir->i_ino, pos,
650 dd->name, be64_to_cpu(dd->ino));
651 return write_inode(dir);
652}
653
654/* Target dentry exists - the worst case. We need to attach the source
655 * inode to the target dentry, then remove the orphaned target inode and
656 * source dentry.
657 */
658static int logfs_rename_target(struct inode *old_dir, struct dentry *old_dentry,
659 struct inode *new_dir, struct dentry *new_dentry)
660{
661 struct logfs_super *super = logfs_super(old_dir->i_sb);
662 struct inode *old_inode = d_inode(old_dentry);
663 struct inode *new_inode = d_inode(new_dentry);
664 int isdir = S_ISDIR(old_inode->i_mode);
665 struct logfs_disk_dentry dd;
666 struct logfs_transaction *ta;
667 loff_t pos;
668 int err;
669
670 BUG_ON(isdir != S_ISDIR(new_inode->i_mode));
671 if (isdir) {
672 if (!logfs_empty_dir(new_inode))
673 return -ENOTEMPTY;
674 }
675
676 /* 1. locate source dd */
677 err = logfs_get_dd(old_dir, old_dentry, &dd, &pos);
678 if (err)
679 return err;
680
681 ta = kzalloc(sizeof(*ta), GFP_KERNEL);
682 if (!ta)
683 return -ENOMEM;
684
685 ta->state = TARGET_RENAME_1;
686 ta->dir = old_dir->i_ino;
687 ta->pos = pos;
688 ta->ino = new_inode->i_ino;
689
690 /* 2. attach source inode to target dd */
691 mutex_lock(&super->s_dirop_mutex);
692 logfs_add_transaction(new_dir, ta);
693 err = logfs_replace_inode(new_dir, new_dentry, &dd, old_inode);
694 if (err) {
695 super->s_rename_dir = 0;
696 super->s_rename_pos = 0;
697 super->s_victim_ino = 0;
698 abort_transaction(new_dir, ta);
699 goto out;
700 }
701
702 /* 3. remove source dd */
703 ta->state = TARGET_RENAME_2;
704 logfs_add_transaction(old_dir, ta);
705 err = logfs_delete_dd(old_dir, pos);
706 if (!err)
707 err = write_inode(old_dir);
708 LOGFS_BUG_ON(err, old_dir->i_sb);
709
710 /* 4. remove target inode */
711 ta->state = TARGET_RENAME_3;
712 logfs_add_transaction(new_inode, ta);
713 err = logfs_remove_inode(new_inode);
714
715out:
716 mutex_unlock(&super->s_dirop_mutex);
717 return err;
718}
719
720static int logfs_rename(struct inode *old_dir, struct dentry *old_dentry,
721 struct inode *new_dir, struct dentry *new_dentry,
722 unsigned int flags)
723{
724 if (flags & ~RENAME_NOREPLACE)
725 return -EINVAL;
726
727 if (d_really_is_positive(new_dentry))
728 return logfs_rename_target(old_dir, old_dentry,
729 new_dir, new_dentry);
730 return logfs_rename_cross(old_dir, old_dentry, new_dir, new_dentry);
731}
732
733/* No locking done here, as this is called before .get_sb() returns. */
734int logfs_replay_journal(struct super_block *sb)
735{
736 struct logfs_super *super = logfs_super(sb);
737 struct inode *inode;
738 u64 ino, pos;
739 int err;
740
741 if (super->s_victim_ino) {
742 /* delete victim inode */
743 ino = super->s_victim_ino;
744 printk(KERN_INFO"LogFS: delete unmapped inode #%llx\n", ino);
745 inode = logfs_iget(sb, ino);
746 if (IS_ERR(inode))
747 goto fail;
748
749 LOGFS_BUG_ON(i_size_read(inode) > 0, sb);
750 super->s_victim_ino = 0;
751 err = logfs_remove_inode(inode);
752 iput(inode);
753 if (err) {
754 super->s_victim_ino = ino;
755 goto fail;
756 }
757 }
758 if (super->s_rename_dir) {
759 /* delete old dd from rename */
760 ino = super->s_rename_dir;
761 pos = super->s_rename_pos;
762 printk(KERN_INFO"LogFS: delete unbacked dentry (%llx, %llx)\n",
763 ino, pos);
764 inode = logfs_iget(sb, ino);
765 if (IS_ERR(inode))
766 goto fail;
767
768 super->s_rename_dir = 0;
769 super->s_rename_pos = 0;
770 err = logfs_delete_dd(inode, pos);
771 iput(inode);
772 if (err) {
773 super->s_rename_dir = ino;
774 super->s_rename_pos = pos;
775 goto fail;
776 }
777 }
778 return 0;
779fail:
780 LOGFS_BUG(sb);
781 return -EIO;
782}
783
784const struct inode_operations logfs_dir_iops = {
785 .create = logfs_create,
786 .link = logfs_link,
787 .lookup = logfs_lookup,
788 .mkdir = logfs_mkdir,
789 .mknod = logfs_mknod,
790 .rename = logfs_rename,
791 .rmdir = logfs_rmdir,
792 .symlink = logfs_symlink,
793 .unlink = logfs_unlink,
794};
795const struct file_operations logfs_dir_fops = {
796 .fsync = logfs_fsync,
797 .unlocked_ioctl = logfs_ioctl,
798 .iterate_shared = logfs_readdir,
799 .read = generic_read_dir,
800 .llseek = generic_file_llseek,
801};
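
The rename-target path above is effectively a three-state transaction: TARGET_RENAME_1 through TARGET_RENAME_3 persist just enough state in the journal (victim inode, rename directory, dentry position) for logfs_replay_journal() to finish an interrupted rename after a crash. A minimal userspace sketch of that replay logic (invented names, not logfs code):

#include <stdint.h>
#include <stdio.h>

struct replay_state {
	uint64_t victim_ino;  /* inode left unmapped mid-rename */
	uint64_t rename_dir;  /* directory still holding the unbacked dentry */
	uint64_t rename_pos;  /* position of that dentry in the directory */
};

/* Stand-ins for logfs_remove_inode() and logfs_delete_dd(). */
static int remove_inode(uint64_t ino)
{
	printf("remove inode %#llx\n", (unsigned long long)ino);
	return 0;
}

static int delete_dentry(uint64_t dir, uint64_t pos)
{
	printf("delete dentry (%#llx, %#llx)\n",
	       (unsigned long long)dir, (unsigned long long)pos);
	return 0;
}

/* Mirrors the two recovery steps of logfs_replay_journal(): first drop
 * the unmapped target inode, then the unbacked source dentry. A field
 * is cleared only once its step succeeds, so a second crash simply
 * replays whatever work is left. */
static int replay(struct replay_state *s)
{
	if (s->victim_ino) {
		if (remove_inode(s->victim_ino))
			return -1;
		s->victim_ino = 0;
	}
	if (s->rename_dir) {
		if (delete_dentry(s->rename_dir, s->rename_pos))
			return -1;
		s->rename_dir = s->rename_pos = 0;
	}
	return 0;
}

int main(void)
{
	struct replay_state s = {
		.victim_ino = 0x42, .rename_dir = 0x10, .rename_pos = 3,
	};
	return replay(&s);
}
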
diff --git a/fs/logfs/file.c b/fs/logfs/file.c
deleted file mode 100644
index 1db04930ad57..000000000000
--- a/fs/logfs/file.c
+++ /dev/null
@@ -1,285 +0,0 @@
1/*
2 * fs/logfs/file.c - prepare_write, commit_write and friends
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/sched.h>
10#include <linux/writeback.h>
11
12static int logfs_write_begin(struct file *file, struct address_space *mapping,
13 loff_t pos, unsigned len, unsigned flags,
14 struct page **pagep, void **fsdata)
15{
16 struct inode *inode = mapping->host;
17 struct page *page;
18 pgoff_t index = pos >> PAGE_SHIFT;
19
20 page = grab_cache_page_write_begin(mapping, index, flags);
21 if (!page)
22 return -ENOMEM;
23 *pagep = page;
24
25 if ((len == PAGE_SIZE) || PageUptodate(page))
26 return 0;
27 if ((pos & PAGE_MASK) >= i_size_read(inode)) {
28 unsigned start = pos & (PAGE_SIZE - 1);
29 unsigned end = start + len;
30
31 /* Reading beyond i_size is simple: memset to zero */
32 zero_user_segments(page, 0, start, end, PAGE_SIZE);
33 return 0;
34 }
35 return logfs_readpage_nolock(page);
36}
37
38static int logfs_write_end(struct file *file, struct address_space *mapping,
39 loff_t pos, unsigned len, unsigned copied, struct page *page,
40 void *fsdata)
41{
42 struct inode *inode = mapping->host;
43 pgoff_t index = page->index;
44 unsigned start = pos & (PAGE_SIZE - 1);
45 unsigned end = start + copied;
46 int ret = 0;
47
48 BUG_ON(PAGE_SIZE != inode->i_sb->s_blocksize);
49 BUG_ON(page->index > I3_BLOCKS);
50
51 if (copied < len) {
52 /*
53 * Short write of a non-initialized page. Just tell userspace
54 * to retry the entire page.
55 */
56 if (!PageUptodate(page)) {
57 copied = 0;
58 goto out;
59 }
60 }
61 if (copied == 0)
62 goto out; /* FIXME: do we need to update inode? */
63
64 if (i_size_read(inode) < (index << PAGE_SHIFT) + end) {
65 i_size_write(inode, (index << PAGE_SHIFT) + end);
66 mark_inode_dirty_sync(inode);
67 }
68
69 SetPageUptodate(page);
70 if (!PageDirty(page)) {
71 if (!get_page_reserve(inode, page))
72 __set_page_dirty_nobuffers(page);
73 else
74 ret = logfs_write_buf(inode, page, WF_LOCK);
75 }
76out:
77 unlock_page(page);
78 put_page(page);
79 return ret ? ret : copied;
80}
81
82int logfs_readpage(struct file *file, struct page *page)
83{
84 int ret;
85
86 ret = logfs_readpage_nolock(page);
87 unlock_page(page);
88 return ret;
89}
90
91/* Clear the page's dirty flag in the radix tree. */
92/* TODO: mucking with PageWriteback is silly. Add a generic function to clear
93 * the dirty bit from the radix tree for filesystems that don't have to wait
94 * for page writeback to finish (i.e. any compressing filesystem).
95 */
96static void clear_radix_tree_dirty(struct page *page)
97{
98 BUG_ON(PagePrivate(page) || page->private);
99 set_page_writeback(page);
100 end_page_writeback(page);
101}
102
103static int __logfs_writepage(struct page *page)
104{
105 struct inode *inode = page->mapping->host;
106 int err;
107
108 err = logfs_write_buf(inode, page, WF_LOCK);
109 if (err)
110 set_page_dirty(page);
111 else
112 clear_radix_tree_dirty(page);
113 unlock_page(page);
114 return err;
115}
116
117static int logfs_writepage(struct page *page, struct writeback_control *wbc)
118{
119 struct inode *inode = page->mapping->host;
120 loff_t i_size = i_size_read(inode);
121 pgoff_t end_index = i_size >> PAGE_SHIFT;
122 unsigned offset;
123 u64 bix;
124 level_t level;
125
126 log_file("logfs_writepage(%lx, %lx, %p)\n", inode->i_ino, page->index,
127 page);
128
129 logfs_unpack_index(page->index, &bix, &level);
130
131 /* Indirect blocks are never truncated */
132 if (level != 0)
133 return __logfs_writepage(page);
134
135 /*
136 * TODO: everything below is a near-verbatim copy of nobh_writepage().
137 * The relevant bits should be factored out after logfs is merged.
138 */
139
140 /* Is the page fully inside i_size? */
141 if (bix < end_index)
142 return __logfs_writepage(page);
143
144 /* Is the page fully outside i_size? (truncate in progress) */
145 offset = i_size & (PAGE_SIZE-1);
146 if (bix > end_index || offset == 0) {
147 unlock_page(page);
148 return 0; /* don't care */
149 }
150
151 /*
152 * The page straddles i_size. It must be zeroed out on each and every
153 * writepage invocation because it may be mmapped. "A file is mapped
154 * in multiples of the page size. For a file that is not a multiple of
155 * the page size, the remaining memory is zeroed when mapped, and
156 * writes to that region are not written out to the file."
157 */
158 zero_user_segment(page, offset, PAGE_SIZE);
159 return __logfs_writepage(page);
160}
161
162static void logfs_invalidatepage(struct page *page, unsigned int offset,
163 unsigned int length)
164{
165 struct logfs_block *block = logfs_block(page);
166
167 if (block->reserved_bytes) {
168 struct super_block *sb = page->mapping->host->i_sb;
169 struct logfs_super *super = logfs_super(sb);
170
171 super->s_dirty_pages -= block->reserved_bytes;
172 block->ops->free_block(sb, block);
173 BUG_ON(bitmap_weight(block->alias_map, LOGFS_BLOCK_FACTOR));
174 } else
175 move_page_to_btree(page);
176 BUG_ON(PagePrivate(page) || page->private);
177}
178
179static int logfs_releasepage(struct page *page, gfp_t only_xfs_uses_this)
180{
181 return 0; /* None of these are easy to release */
182}
183
184
185long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
186{
187 struct inode *inode = file_inode(file);
188 struct logfs_inode *li = logfs_inode(inode);
189 unsigned int oldflags, flags;
190 int err;
191
192 switch (cmd) {
193 case FS_IOC_GETFLAGS:
194 flags = li->li_flags & LOGFS_FL_USER_VISIBLE;
195 return put_user(flags, (int __user *)arg);
196 case FS_IOC_SETFLAGS:
197 if (IS_RDONLY(inode))
198 return -EROFS;
199
200 if (!inode_owner_or_capable(inode))
201 return -EACCES;
202
203 err = get_user(flags, (int __user *)arg);
204 if (err)
205 return err;
206
207 inode_lock(inode);
208 oldflags = li->li_flags;
209 flags &= LOGFS_FL_USER_MODIFIABLE;
210 flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE;
211 li->li_flags = flags;
212 inode_unlock(inode);
213
214 inode->i_ctime = current_time(inode);
215 mark_inode_dirty_sync(inode);
216 return 0;
217
218 default:
219 return -ENOTTY;
220 }
221}
222
223int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
224{
225 struct super_block *sb = file->f_mapping->host->i_sb;
226 struct inode *inode = file->f_mapping->host;
227 int ret;
228
229 ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
230 if (ret)
231 return ret;
232
233 inode_lock(inode);
234 logfs_get_wblocks(sb, NULL, WF_LOCK);
235 logfs_write_anchor(sb);
236 logfs_put_wblocks(sb, NULL, WF_LOCK);
237 inode_unlock(inode);
238
239 return 0;
240}
241
242static int logfs_setattr(struct dentry *dentry, struct iattr *attr)
243{
244 struct inode *inode = d_inode(dentry);
245 int err = 0;
246
247 err = setattr_prepare(dentry, attr);
248 if (err)
249 return err;
250
251 if (attr->ia_valid & ATTR_SIZE) {
252 err = logfs_truncate(inode, attr->ia_size);
253 if (err)
254 return err;
255 }
256
257 setattr_copy(inode, attr);
258 mark_inode_dirty(inode);
259 return 0;
260}
261
262const struct inode_operations logfs_reg_iops = {
263 .setattr = logfs_setattr,
264};
265
266const struct file_operations logfs_reg_fops = {
267 .read_iter = generic_file_read_iter,
268 .write_iter = generic_file_write_iter,
269 .fsync = logfs_fsync,
270 .unlocked_ioctl = logfs_ioctl,
271 .llseek = generic_file_llseek,
272 .mmap = generic_file_readonly_mmap,
273 .open = generic_file_open,
274};
275
276const struct address_space_operations logfs_reg_aops = {
277 .invalidatepage = logfs_invalidatepage,
278 .readpage = logfs_readpage,
279 .releasepage = logfs_releasepage,
280 .set_page_dirty = __set_page_dirty_nobuffers,
281 .writepage = logfs_writepage,
282 .writepages = generic_writepages,
283 .write_begin = logfs_write_begin,
284 .write_end = logfs_write_end,
285};
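
logfs_write_begin() above distinguishes three cases for the page it hands back: a full-page overwrite (or an already up-to-date page) needs no preparation, a page whose aligned offset lies at or beyond i_size only needs its unwritten edges zeroed, and a partial write inside the file forces a read first. A compact userspace model of that decision (illustrative sketch; the helper and constants are invented):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096u

enum prep { PREP_NOTHING, PREP_ZERO_EDGES, PREP_READ };

/* Decide how to prepare a cache page before copying 'len' bytes at byte
 * offset 'pos' into a file of size 'i_size'. */
static enum prep prepare_page(uint64_t pos, unsigned int len, bool uptodate,
			      uint64_t i_size)
{
	if (len == PAGE_SIZE || uptodate)
		return PREP_NOTHING;    /* fully overwritten or already read */
	if ((pos & ~(uint64_t)(PAGE_SIZE - 1)) >= i_size)
		return PREP_ZERO_EDGES; /* page is beyond EOF: memset to zero */
	return PREP_READ;               /* partial write inside the file */
}

int main(void)
{
	printf("%d %d %d\n",
	       prepare_page(8192, 100, false, 4096),      /* PREP_ZERO_EDGES */
	       prepare_page(0, 100, false, 4096),         /* PREP_READ */
	       prepare_page(0, PAGE_SIZE, false, 4096));  /* PREP_NOTHING */
	return 0;
}
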
diff --git a/fs/logfs/gc.c b/fs/logfs/gc.c
deleted file mode 100644
index d4efb061bdc5..000000000000
--- a/fs/logfs/gc.c
+++ /dev/null
@@ -1,732 +0,0 @@
1/*
2 * fs/logfs/gc.c - garbage collection code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/sched.h>
10#include <linux/slab.h>
11
12/*
13 * Wear leveling needs to kick in when the difference between low erase
14 * counts and high erase counts gets too big. A good value for "too big"
15 * may be somewhat below 10% of maximum erase count for the device.
16 * Why not 397, to pick a nice round number with no specific meaning? :)
17 *
18 * WL_RATELIMIT is the minimum time between two wear level events. A huge
19 * number of segments may fulfil the requirements for wear leveling at the
20 * same time. If that happens we don't want to cause a latency from hell,
21 * but just gently pick one segment every so often and minimize overhead.
22 */
23#define WL_DELTA 397
24#define WL_RATELIMIT 100
25#define MAX_OBJ_ALIASES 2600
26#define SCAN_RATIO 512 /* number of scanned segments per gc'd segment */
27#define LIST_SIZE 64 /* base size of candidate lists */
28#define SCAN_ROUNDS 128 /* maximum number of complete medium scans */
29#define SCAN_ROUNDS_HIGH 4 /* maximum number of higher-level scans */
30
31static int no_free_segments(struct super_block *sb)
32{
33 struct logfs_super *super = logfs_super(sb);
34
35 return super->s_free_list.count;
36}
37
38/* journal has distance -1, top-most ifile layer distance 0 */
39static u8 root_distance(struct super_block *sb, gc_level_t __gc_level)
40{
41 struct logfs_super *super = logfs_super(sb);
42 u8 gc_level = (__force u8)__gc_level;
43
44 switch (gc_level) {
45 case 0: /* fall through */
46 case 1: /* fall through */
47 case 2: /* fall through */
48 case 3:
49 /* file data or indirect blocks */
50 return super->s_ifile_levels + super->s_iblock_levels - gc_level;
51 case 6: /* fall through */
52 case 7: /* fall through */
53 case 8: /* fall through */
54 case 9:
55 /* inode file data or indirect blocks */
56 return super->s_ifile_levels - (gc_level - 6);
57 default:
58 printk(KERN_ERR"LOGFS: segment of unknown level %x found\n",
59 gc_level);
60 WARN_ON(1);
61 return super->s_ifile_levels + super->s_iblock_levels;
62 }
63}
64
65static int segment_is_reserved(struct super_block *sb, u32 segno)
66{
67 struct logfs_super *super = logfs_super(sb);
68 struct logfs_area *area;
69 void *reserved;
70 int i;
71
72 /* Some segments are reserved. Just pretend they were all valid */
73 reserved = btree_lookup32(&super->s_reserved_segments, segno);
74 if (reserved)
75 return 1;
76
77 /* Currently open segments */
78 for_each_area(i) {
79 area = super->s_area[i];
80 if (area->a_is_open && area->a_segno == segno)
81 return 1;
82 }
83
84 return 0;
85}
86
87static void logfs_mark_segment_bad(struct super_block *sb, u32 segno)
88{
89 BUG();
90}
91
92/*
93 * Returns the bytes consumed by valid objects in this segment. Object headers
94 * are counted, the segment header is not.
95 */
96static u32 logfs_valid_bytes(struct super_block *sb, u32 segno, u32 *ec,
97 gc_level_t *gc_level)
98{
99 struct logfs_segment_entry se;
100 u32 ec_level;
101
102 logfs_get_segment_entry(sb, segno, &se);
103 if (se.ec_level == cpu_to_be32(BADSEG) ||
104 se.valid == cpu_to_be32(RESERVED))
105 return RESERVED;
106
107 ec_level = be32_to_cpu(se.ec_level);
108 *ec = ec_level >> 4;
109 *gc_level = GC_LEVEL(ec_level & 0xf);
110 return be32_to_cpu(se.valid);
111}
112
113static void logfs_cleanse_block(struct super_block *sb, u64 ofs, u64 ino,
114 u64 bix, gc_level_t gc_level)
115{
116 struct inode *inode;
117 int err, cookie;
118
119 inode = logfs_safe_iget(sb, ino, &cookie);
120 err = logfs_rewrite_block(inode, bix, ofs, gc_level, 0);
121 BUG_ON(err);
122 logfs_safe_iput(inode, cookie);
123}
124
125static u32 logfs_gc_segment(struct super_block *sb, u32 segno)
126{
127 struct logfs_super *super = logfs_super(sb);
128 struct logfs_segment_header sh;
129 struct logfs_object_header oh;
130 u64 ofs, ino, bix;
131 u32 seg_ofs, logical_segno, cleaned = 0;
132 int err, len, valid;
133 gc_level_t gc_level;
134
135 LOGFS_BUG_ON(segment_is_reserved(sb, segno), sb);
136
137 btree_insert32(&super->s_reserved_segments, segno, (void *)1, GFP_NOFS);
138 err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
139 BUG_ON(err);
140 gc_level = GC_LEVEL(sh.level);
141 logical_segno = be32_to_cpu(sh.segno);
142 if (sh.crc != logfs_crc32(&sh, sizeof(sh), 4)) {
143 logfs_mark_segment_bad(sb, segno);
144 cleaned = -1;
145 goto out;
146 }
147
148 for (seg_ofs = LOGFS_SEGMENT_HEADERSIZE;
149 seg_ofs + sizeof(oh) < super->s_segsize; ) {
150 ofs = dev_ofs(sb, logical_segno, seg_ofs);
151 err = wbuf_read(sb, dev_ofs(sb, segno, seg_ofs), sizeof(oh),
152 &oh);
153 BUG_ON(err);
154
155 if (!memchr_inv(&oh, 0xff, sizeof(oh)))
156 break;
157
158 if (oh.crc != logfs_crc32(&oh, sizeof(oh) - 4, 4)) {
159 logfs_mark_segment_bad(sb, segno);
160 cleaned = super->s_segsize - 1;
161 goto out;
162 }
163
164 ino = be64_to_cpu(oh.ino);
165 bix = be64_to_cpu(oh.bix);
166 len = sizeof(oh) + be16_to_cpu(oh.len);
167 valid = logfs_is_valid_block(sb, ofs, ino, bix, gc_level);
168 if (valid == 1) {
169 logfs_cleanse_block(sb, ofs, ino, bix, gc_level);
170 cleaned += len;
171 } else if (valid == 2) {
172 /* Will be invalid upon journal commit */
173 cleaned += len;
174 }
175 seg_ofs += len;
176 }
177out:
178 btree_remove32(&super->s_reserved_segments, segno);
179 return cleaned;
180}
181
182static struct gc_candidate *add_list(struct gc_candidate *cand,
183 struct candidate_list *list)
184{
185 struct rb_node **p = &list->rb_tree.rb_node;
186 struct rb_node *parent = NULL;
187 struct gc_candidate *cur;
188 int comp;
189
190 cand->list = list;
191 while (*p) {
192 parent = *p;
193 cur = rb_entry(parent, struct gc_candidate, rb_node);
194
195 if (list->sort_by_ec)
196 comp = cand->erase_count < cur->erase_count;
197 else
198 comp = cand->valid < cur->valid;
199
200 if (comp)
201 p = &parent->rb_left;
202 else
203 p = &parent->rb_right;
204 }
205 rb_link_node(&cand->rb_node, parent, p);
206 rb_insert_color(&cand->rb_node, &list->rb_tree);
207
208 if (list->count <= list->maxcount) {
209 list->count++;
210 return NULL;
211 }
212 cand = rb_entry(rb_last(&list->rb_tree), struct gc_candidate, rb_node);
213 rb_erase(&cand->rb_node, &list->rb_tree);
214 cand->list = NULL;
215 return cand;
216}
217
218static void remove_from_list(struct gc_candidate *cand)
219{
220 struct candidate_list *list = cand->list;
221
222 rb_erase(&cand->rb_node, &list->rb_tree);
223 list->count--;
224}
225
226static void free_candidate(struct super_block *sb, struct gc_candidate *cand)
227{
228 struct logfs_super *super = logfs_super(sb);
229
230 btree_remove32(&super->s_cand_tree, cand->segno);
231 kfree(cand);
232}
233
234u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec)
235{
236 struct gc_candidate *cand;
237 u32 segno;
238
239 BUG_ON(list->count == 0);
240
241 cand = rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
242 remove_from_list(cand);
243 segno = cand->segno;
244 if (ec)
245 *ec = cand->erase_count;
246 free_candidate(sb, cand);
247 return segno;
248}
249
250/*
251 * We have several lists to manage segments with. The reserve_list is used to
252 * deal with bad blocks. We try to keep the best (lowest ec) segments on this
253 * list.
254 * The free_list contains free segments for normal usage. It usually gets the
255 * second pick after the reserve_list. But when the free_list is running short
256 * it is more important to keep the free_list full than to keep a reserve.
257 *
258 * Segments that are not free are put onto a per-level low_list. If we have
259 * to run garbage collection, we pick a candidate from there. All segments on
260 * those lists should have at least some free space so GC will make progress.
261 *
262 * And last we have the ec_list, which is used to pick segments for wear
263 * leveling.
264 *
265 * If all appropriate lists are full, we simply free the candidate and forget
266 * about that segment for a while. We have better candidates for each purpose.
267 */
268static void __add_candidate(struct super_block *sb, struct gc_candidate *cand)
269{
270 struct logfs_super *super = logfs_super(sb);
271 u32 full = super->s_segsize - LOGFS_SEGMENT_RESERVE;
272
273 if (cand->valid == 0) {
274 /* 100% free segments */
275 log_gc_noisy("add reserve segment %x (ec %x) at %llx\n",
276 cand->segno, cand->erase_count,
277 dev_ofs(sb, cand->segno, 0));
278 cand = add_list(cand, &super->s_reserve_list);
279 if (cand) {
280 log_gc_noisy("add free segment %x (ec %x) at %llx\n",
281 cand->segno, cand->erase_count,
282 dev_ofs(sb, cand->segno, 0));
283 cand = add_list(cand, &super->s_free_list);
284 }
285 } else {
286 /* good candidates for Garbage Collection */
287 if (cand->valid < full)
288 cand = add_list(cand, &super->s_low_list[cand->dist]);
289 /* good candidates for wear leveling,
290 * segments that were recently written get ignored */
291 if (cand)
292 cand = add_list(cand, &super->s_ec_list);
293 }
294 if (cand)
295 free_candidate(sb, cand);
296}
297
298static int add_candidate(struct super_block *sb, u32 segno, u32 valid, u32 ec,
299 u8 dist)
300{
301 struct logfs_super *super = logfs_super(sb);
302 struct gc_candidate *cand;
303
304 cand = kmalloc(sizeof(*cand), GFP_NOFS);
305 if (!cand)
306 return -ENOMEM;
307
308 cand->segno = segno;
309 cand->valid = valid;
310 cand->erase_count = ec;
311 cand->dist = dist;
312
313 btree_insert32(&super->s_cand_tree, segno, cand, GFP_NOFS);
314 __add_candidate(sb, cand);
315 return 0;
316}
317
318static void remove_segment_from_lists(struct super_block *sb, u32 segno)
319{
320 struct logfs_super *super = logfs_super(sb);
321 struct gc_candidate *cand;
322
323 cand = btree_lookup32(&super->s_cand_tree, segno);
324 if (cand) {
325 remove_from_list(cand);
326 free_candidate(sb, cand);
327 }
328}
329
330static void scan_segment(struct super_block *sb, u32 segno)
331{
332 u32 valid, ec = 0;
333 gc_level_t gc_level = 0;
334 u8 dist;
335
336 if (segment_is_reserved(sb, segno))
337 return;
338
339 remove_segment_from_lists(sb, segno);
340 valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
341 if (valid == RESERVED)
342 return;
343
344 dist = root_distance(sb, gc_level);
345 add_candidate(sb, segno, valid, ec, dist);
346}
347
348static struct gc_candidate *first_in_list(struct candidate_list *list)
349{
350 if (list->count == 0)
351 return NULL;
352 return rb_entry(rb_first(&list->rb_tree), struct gc_candidate, rb_node);
353}
354
355/*
356 * Find the best segment for garbage collection. Main criterion is
357 * the segment requiring the least effort to clean. Secondary
358 * criterion is to GC on the lowest level available.
359 *
360 * So we search the least effort segment on the lowest level first,
361 * then move up and pick another segment iff it requires significantly
362 * less effort. Hence the LOGFS_MAX_OBJECTSIZE in the comparison.
363 */
364static struct gc_candidate *get_candidate(struct super_block *sb)
365{
366 struct logfs_super *super = logfs_super(sb);
367 int i, max_dist;
368 struct gc_candidate *cand = NULL, *this;
369
370 max_dist = min(no_free_segments(sb), LOGFS_NO_AREAS - 1);
371
372 for (i = max_dist; i >= 0; i--) {
373 this = first_in_list(&super->s_low_list[i]);
374 if (!this)
375 continue;
376 if (!cand)
377 cand = this;
378 if (this->valid + LOGFS_MAX_OBJECTSIZE <= cand->valid)
379 cand = this;
380 }
381 return cand;
382}
383
384static int __logfs_gc_once(struct super_block *sb, struct gc_candidate *cand)
385{
386 struct logfs_super *super = logfs_super(sb);
387 gc_level_t gc_level;
388 u32 cleaned, valid, segno, ec;
389 u8 dist;
390
391 if (!cand) {
392 log_gc("GC attempted, but no candidate found\n");
393 return 0;
394 }
395
396 segno = cand->segno;
397 dist = cand->dist;
398 valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
399 free_candidate(sb, cand);
400 log_gc("GC segment #%02x at %llx, %x required, %x free, %x valid, %llx free\n",
401 segno, (u64)segno << super->s_segshift,
402 dist, no_free_segments(sb), valid,
403 super->s_free_bytes);
404 cleaned = logfs_gc_segment(sb, segno);
405 log_gc("GC segment #%02x complete - now %x valid\n", segno,
406 valid - cleaned);
407 BUG_ON(cleaned != valid);
408 return 1;
409}
410
411static int logfs_gc_once(struct super_block *sb)
412{
413 struct gc_candidate *cand;
414
415 cand = get_candidate(sb);
416 if (cand)
417 remove_from_list(cand);
418 return __logfs_gc_once(sb, cand);
419}
420
421/* returns 1 if a wrap occurs, 0 otherwise */
422static int logfs_scan_some(struct super_block *sb)
423{
424 struct logfs_super *super = logfs_super(sb);
425 u32 segno;
426 int i, ret = 0;
427
428 segno = super->s_sweeper;
429 for (i = SCAN_RATIO; i > 0; i--) {
430 segno++;
431 if (segno >= super->s_no_segs) {
432 segno = 0;
433 ret = 1;
434 /* Break out of the loop. We want to read a single
435 * block from the segfile on next invocation if
436 * SCAN_RATIO is set to match block size
437 */
438 break;
439 }
440
441 scan_segment(sb, segno);
442 }
443 super->s_sweeper = segno;
444 return ret;
445}
446
447/*
448 * In principle, this function should loop forever, looking for GC candidates
449 * and moving data. LogFS is designed in such a way that this loop is
450 * guaranteed to terminate.
451 *
452 * Limiting the loop to some iterations serves purely to catch cases when
453 * these guarantees have failed. An actual endless loop is an obvious bug
454 * and should be reported as such.
455 */
456static void __logfs_gc_pass(struct super_block *sb, int target)
457{
458 struct logfs_super *super = logfs_super(sb);
459 struct logfs_block *block;
460 int round, progress, last_progress = 0;
461
462 /*
463 * Doing too many changes to the segfile at once would result
464 * in a large number of aliases. Write the journal before
465 * things get out of hand.
466 */
467 if (super->s_shadow_tree.no_shadowed_segments >= MAX_OBJ_ALIASES)
468 logfs_write_anchor(sb);
469
470 if (no_free_segments(sb) >= target &&
471 super->s_no_object_aliases < MAX_OBJ_ALIASES)
472 return;
473
474 log_gc("__logfs_gc_pass(%x)\n", target);
475 for (round = 0; round < SCAN_ROUNDS; ) {
476 if (no_free_segments(sb) >= target)
477 goto write_alias;
478
479 /* Sync in-memory state with on-medium state in case they
480 * diverged */
481 logfs_write_anchor(sb);
482 round += logfs_scan_some(sb);
483 if (no_free_segments(sb) >= target)
484 goto write_alias;
485 progress = logfs_gc_once(sb);
486 if (progress)
487 last_progress = round;
488 else if (round - last_progress > 2)
489 break;
490 continue;
491
492 /*
493 * The goto logic is nasty, I just don't know a better way to
494 * code it. GC is supposed to ensure two things:
495 * 1. Enough free segments are available.
496 * 2. The number of aliases is bounded.
497 * When 1. is achieved, we take a look at 2. and write back
498 * some alias-containing blocks, if necessary. However, after
499 * each such write we need to go back to 1., as writes can
500 * consume free segments.
501 */
502write_alias:
503 if (super->s_no_object_aliases < MAX_OBJ_ALIASES)
504 return;
505 if (list_empty(&super->s_object_alias)) {
506 /* All aliases are still in btree */
507 return;
508 }
509 log_gc("Write back one alias\n");
510 block = list_entry(super->s_object_alias.next,
511 struct logfs_block, alias_list);
512 block->ops->write_block(block);
513 /*
514 * To round off the nasty goto logic, we reset round here. It
515 * is a safety-net for GC not making any progress and limited
516 * to something reasonably small. If it were incremented for every
517 * single alias, the loop could terminate rather quickly.
518 */
519 round = 0;
520 }
521 LOGFS_BUG(sb);
522}
523
524static int wl_ratelimit(struct super_block *sb, u64 *next_event)
525{
526 struct logfs_super *super = logfs_super(sb);
527
528 if (*next_event < super->s_gec) {
529 *next_event = super->s_gec + WL_RATELIMIT;
530 return 0;
531 }
532 return 1;
533}
534
535static void logfs_wl_pass(struct super_block *sb)
536{
537 struct logfs_super *super = logfs_super(sb);
538 struct gc_candidate *wl_cand, *free_cand;
539
540 if (wl_ratelimit(sb, &super->s_wl_gec_ostore))
541 return;
542
543 wl_cand = first_in_list(&super->s_ec_list);
544 if (!wl_cand)
545 return;
546 free_cand = first_in_list(&super->s_free_list);
547 if (!free_cand)
548 return;
549
550 if (wl_cand->erase_count < free_cand->erase_count + WL_DELTA) {
551 remove_from_list(wl_cand);
552 __logfs_gc_once(sb, wl_cand);
553 }
554}
555
556/*
557 * The journal needs wear leveling as well. But moving the journal is an
558 * expensive operation so we try to avoid it as much as possible. And if we
559 * have to do it, we move the whole journal, not individual segments.
560 *
561 * Ratelimiting is not strictly necessary here, it mainly serves to avoid the
562 * calculations. First we check whether moving the journal would be a
563 * significant improvement. That means that a) the current journal segments
564 * have more wear than the future journal segments and b) the current journal
565 * segments have more wear than normal ostore segments.
566 * Rationale for b) is that we don't have to move the journal if it is aging
567 * less than the ostore, even if the reserve segments age even less (they are
568 * excluded from wear leveling, after all).
569 * Next we check that the superblocks have less wear than the journal. Since
570 * moving the journal requires writing the superblocks, we have to protect the
571 * superblocks even more than the journal.
572 *
573 * Also we double the acceptable wear difference, compared to ostore wear
574 * leveling. Journal data is read and rewritten rapidly, comparatively. So
575 * soft errors have much less time to accumulate and we allow the journal to
576 * be a bit worse than the ostore.
577 */
578static void logfs_journal_wl_pass(struct super_block *sb)
579{
580 struct logfs_super *super = logfs_super(sb);
581 struct gc_candidate *cand;
582 u32 min_journal_ec = -1, max_reserve_ec = 0;
583 int i;
584
585 if (wl_ratelimit(sb, &super->s_wl_gec_journal))
586 return;
587
588 if (super->s_reserve_list.count < super->s_no_journal_segs) {
589 /* Reserve is not full enough to move complete journal */
590 return;
591 }
592
593 journal_for_each(i)
594 if (super->s_journal_seg[i])
595 min_journal_ec = min(min_journal_ec,
596 super->s_journal_ec[i]);
597 cand = rb_entry(rb_first(&super->s_free_list.rb_tree),
598 struct gc_candidate, rb_node);
599 max_reserve_ec = cand->erase_count;
600 for (i = 0; i < 2; i++) {
601 struct logfs_segment_entry se;
602 u32 segno = seg_no(sb, super->s_sb_ofs[i]);
603 u32 ec;
604
605 logfs_get_segment_entry(sb, segno, &se);
606 ec = be32_to_cpu(se.ec_level) >> 4;
607 max_reserve_ec = max(max_reserve_ec, ec);
608 }
609
610 if (min_journal_ec > max_reserve_ec + 2 * WL_DELTA) {
611 do_logfs_journal_wl_pass(sb);
612 }
613}
614
615void logfs_gc_pass(struct super_block *sb)
616{
617 struct logfs_super *super = logfs_super(sb);
618
619 //BUG_ON(mutex_trylock(&logfs_super(sb)->s_w_mutex));
620 /* Write journal before free space is getting saturated with dirty
621 * objects.
622 */
623 if (super->s_dirty_used_bytes + super->s_dirty_free_bytes
624 + LOGFS_MAX_OBJECTSIZE >= super->s_free_bytes)
625 logfs_write_anchor(sb);
626 __logfs_gc_pass(sb, super->s_total_levels);
627 logfs_wl_pass(sb);
628 logfs_journal_wl_pass(sb);
629}
630
631static int check_area(struct super_block *sb, int i)
632{
633 struct logfs_super *super = logfs_super(sb);
634 struct logfs_area *area = super->s_area[i];
635 gc_level_t gc_level;
636 u32 cleaned, valid, ec;
637 u32 segno = area->a_segno;
638 u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
639
640 if (!area->a_is_open)
641 return 0;
642
643 if (super->s_devops->can_write_buf(sb, ofs) == 0)
644 return 0;
645
646 printk(KERN_INFO"LogFS: Possibly incomplete write at %llx\n", ofs);
647 /*
648 * The device cannot write back the write buffer. Most likely the
649 * wbuf was already written out and the system crashed at some point
650 * before the journal commit happened. In that case we wouldn't have
651 * to do anything. But if the crash happened before the wbuf was
652 * written out correctly, we must GC this segment. So assume the
653 * worst and always do the GC run.
654 */
655 area->a_is_open = 0;
656 valid = logfs_valid_bytes(sb, segno, &ec, &gc_level);
657 cleaned = logfs_gc_segment(sb, segno);
658 if (cleaned != valid)
659 return -EIO;
660 return 0;
661}
662
663int logfs_check_areas(struct super_block *sb)
664{
665 int i, err;
666
667 for_each_area(i) {
668 err = check_area(sb, i);
669 if (err)
670 return err;
671 }
672 return 0;
673}
674
675static void logfs_init_candlist(struct candidate_list *list, int maxcount,
676 int sort_by_ec)
677{
678 list->count = 0;
679 list->maxcount = maxcount;
680 list->sort_by_ec = sort_by_ec;
681 list->rb_tree = RB_ROOT;
682}
683
684int logfs_init_gc(struct super_block *sb)
685{
686 struct logfs_super *super = logfs_super(sb);
687 int i;
688
689 btree_init_mempool32(&super->s_cand_tree, super->s_btree_pool);
690 logfs_init_candlist(&super->s_free_list, LIST_SIZE + SCAN_RATIO, 1);
691 logfs_init_candlist(&super->s_reserve_list,
692 super->s_bad_seg_reserve, 1);
693 for_each_area(i)
694 logfs_init_candlist(&super->s_low_list[i], LIST_SIZE, 0);
695 logfs_init_candlist(&super->s_ec_list, LIST_SIZE, 1);
696 return 0;
697}
698
699static void logfs_cleanup_list(struct super_block *sb,
700 struct candidate_list *list)
701{
702 struct gc_candidate *cand;
703
704 while (list->count) {
705 cand = rb_entry(list->rb_tree.rb_node, struct gc_candidate,
706 rb_node);
707 remove_from_list(cand);
708 free_candidate(sb, cand);
709 }
710 BUG_ON(list->rb_tree.rb_node);
711}
712
713void logfs_cleanup_gc(struct super_block *sb)
714{
715 struct logfs_super *super = logfs_super(sb);
716 int i;
717
718 if (!super->s_free_list.count)
719 return;
720
721 /*
722 * FIXME: The btree may still contain a single empty node. So we
723 * call the grim visitor to clean up that mess. Btree code should
724 * do it for us, really.
725 */
726 btree_grim_visitor32(&super->s_cand_tree, 0, NULL);
727 logfs_cleanup_list(sb, &super->s_free_list);
728 logfs_cleanup_list(sb, &super->s_reserve_list);
729 for_each_area(i)
730 logfs_cleanup_list(sb, &super->s_low_list[i]);
731 logfs_cleanup_list(sb, &super->s_ec_list);
732}
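
All of the candidate lists above share one behaviour, implemented in add_list(): a list keeps only its maxcount best entries, sorted by erase count or by valid bytes, and whatever no longer fits is handed back so __add_candidate() can offer it to the next list or drop it. An array-based userspace sketch of that bounded insert (illustrative only, standing in for the rbtree code above):

#include <stdio.h>

struct cand { unsigned int segno, key; };

struct bounded_list {
	struct cand slot[4];    /* kept sorted, best (lowest key) first */
	int count, max;
};

/* Insert c in sorted order, keeping at most l->max entries. Returns 1
 * and fills *evicted with the candidate that no longer fits (possibly c
 * itself), 0 when everything still fits. */
static int add_bounded(struct bounded_list *l, struct cand c,
		       struct cand *evicted)
{
	int i, dropped = 0;

	if (l->count == l->max) {
		if (c.key >= l->slot[l->count - 1].key) {
			*evicted = c;           /* no better than anything kept */
			return 1;
		}
		*evicted = l->slot[--l->count]; /* evict the current worst */
		dropped = 1;
	}
	for (i = l->count; i > 0 && l->slot[i - 1].key > c.key; i--)
		l->slot[i] = l->slot[i - 1];
	l->slot[i] = c;
	l->count++;
	return dropped;
}

int main(void)
{
	struct bounded_list l = { .max = 4 };
	unsigned int keys[] = { 30, 10, 50, 20, 40, 5 };
	struct cand out;
	unsigned int i;

	for (i = 0; i < sizeof(keys) / sizeof(keys[0]); i++) {
		struct cand c = { .segno = i, .key = keys[i] };
		if (add_bounded(&l, c, &out))
			printf("bounced segno %u (key %u)\n", out.segno, out.key);
	}
	return 0;
}
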
diff --git a/fs/logfs/inode.c b/fs/logfs/inode.c
deleted file mode 100644
index f440a1525da8..000000000000
--- a/fs/logfs/inode.c
+++ /dev/null
@@ -1,428 +0,0 @@
1/*
2 * fs/logfs/inode.c - inode handling code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/slab.h>
10#include <linux/writeback.h>
11#include <linux/backing-dev.h>
12
13/*
14 * How soon to reuse old inode numbers? LogFS doesn't store deleted inodes
15 * on the medium. It therefore also lacks a method to store the previous
16 * generation number for deleted inodes. Instead a single generation number
17 * is stored which will be used for new inodes. Being just a 32bit counter,
18 * this can obviously wrap relatively quickly. So we only reuse inodes if we
19 * know that a fair number of inodes can be created before we have to increment
20 * the generation again - effectively adding some bits to the counter.
21 * But being too aggressive here means we keep a very large and very sparse
22 * inode file, wasting space on indirect blocks.
23 * So what is a good value? Beats me. 64k seems moderately bad on both
24 * fronts, so let's use that for now...
25 *
26 * NFS sucks, as everyone already knows.
27 */
28#define INOS_PER_WRAP (0x10000)
29
30/*
31 * Logfs' requirement to read inodes for garbage collection makes life a bit
32 * harder. GC may have to read inodes that are in I_FREEING state, when they
33 * are being written out - and waiting for GC to make progress, naturally.
34 *
35 * So we cannot just call iget() or some variant of it, but first have to check
36 * whether the inode in question might be in I_FREEING state. Therefore we
37 * maintain our own per-sb list of "almost deleted" inodes and check against
38 * that list first. Normally this should be at most 1-2 entries long.
39 *
40 * Also, inodes have logfs-specific reference counting on top of what the vfs
41 * does. When .destroy_inode is called, normally the reference count will drop
42 * to zero and the inode gets deleted. But if GC accessed the inode, its
43 * refcount will remain nonzero and final deletion will have to wait.
44 *
45 * As a result we have two sets of functions to get/put inodes:
46 * logfs_safe_iget/logfs_safe_iput - safe to call from GC context
47 * logfs_iget/iput - normal version
48 */
49static struct kmem_cache *logfs_inode_cache;
50
51static DEFINE_SPINLOCK(logfs_inode_lock);
52
53static void logfs_inode_setops(struct inode *inode)
54{
55 switch (inode->i_mode & S_IFMT) {
56 case S_IFDIR:
57 inode->i_op = &logfs_dir_iops;
58 inode->i_fop = &logfs_dir_fops;
59 inode->i_mapping->a_ops = &logfs_reg_aops;
60 break;
61 case S_IFREG:
62 inode->i_op = &logfs_reg_iops;
63 inode->i_fop = &logfs_reg_fops;
64 inode->i_mapping->a_ops = &logfs_reg_aops;
65 break;
66 case S_IFLNK:
67 inode->i_op = &page_symlink_inode_operations;
68 inode_nohighmem(inode);
69 inode->i_mapping->a_ops = &logfs_reg_aops;
70 break;
71 case S_IFSOCK: /* fall through */
72 case S_IFBLK: /* fall through */
73 case S_IFCHR: /* fall through */
74 case S_IFIFO:
75 init_special_inode(inode, inode->i_mode, inode->i_rdev);
76 break;
77 default:
78 BUG();
79 }
80}
81
82static struct inode *__logfs_iget(struct super_block *sb, ino_t ino)
83{
84 struct inode *inode = iget_locked(sb, ino);
85 int err;
86
87 if (!inode)
88 return ERR_PTR(-ENOMEM);
89 if (!(inode->i_state & I_NEW))
90 return inode;
91
92 err = logfs_read_inode(inode);
93 if (err || inode->i_nlink == 0) {
94 /* inode->i_nlink == 0 can be true when called from
95 * block validator */
96 /* set i_nlink to 0 to prevent caching */
97 clear_nlink(inode);
98 logfs_inode(inode)->li_flags |= LOGFS_IF_ZOMBIE;
99 iget_failed(inode);
100 if (!err)
101 err = -ENOENT;
102 return ERR_PTR(err);
103 }
104
105 logfs_inode_setops(inode);
106 unlock_new_inode(inode);
107 return inode;
108}
109
110struct inode *logfs_iget(struct super_block *sb, ino_t ino)
111{
112 BUG_ON(ino == LOGFS_INO_MASTER);
113 BUG_ON(ino == LOGFS_INO_SEGFILE);
114 return __logfs_iget(sb, ino);
115}
116
117/*
118 * is_cached is set to 1 if we hand out a cached inode, 0 otherwise.
119 * This allows logfs_safe_iput to do the right thing later.
120 */
121struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *is_cached)
122{
123 struct logfs_super *super = logfs_super(sb);
124 struct logfs_inode *li;
125
126 if (ino == LOGFS_INO_MASTER)
127 return super->s_master_inode;
128 if (ino == LOGFS_INO_SEGFILE)
129 return super->s_segfile_inode;
130
131 spin_lock(&logfs_inode_lock);
132 list_for_each_entry(li, &super->s_freeing_list, li_freeing_list)
133 if (li->vfs_inode.i_ino == ino) {
134 li->li_refcount++;
135 spin_unlock(&logfs_inode_lock);
136 *is_cached = 1;
137 return &li->vfs_inode;
138 }
139 spin_unlock(&logfs_inode_lock);
140
141 *is_cached = 0;
142 return __logfs_iget(sb, ino);
143}
144
145static void logfs_i_callback(struct rcu_head *head)
146{
147 struct inode *inode = container_of(head, struct inode, i_rcu);
148 kmem_cache_free(logfs_inode_cache, logfs_inode(inode));
149}
150
151static void __logfs_destroy_inode(struct inode *inode)
152{
153 struct logfs_inode *li = logfs_inode(inode);
154
155 BUG_ON(li->li_block);
156 list_del(&li->li_freeing_list);
157 call_rcu(&inode->i_rcu, logfs_i_callback);
158}
159
160static void __logfs_destroy_meta_inode(struct inode *inode)
161{
162 struct logfs_inode *li = logfs_inode(inode);
163 BUG_ON(li->li_block);
164 call_rcu(&inode->i_rcu, logfs_i_callback);
165}
166
167static void logfs_destroy_inode(struct inode *inode)
168{
169 struct logfs_inode *li = logfs_inode(inode);
170
171 if (inode->i_ino < LOGFS_RESERVED_INOS) {
172 /*
173 * The reserved inodes are never destroyed unless we are in
174 * the unmount path.
175 */
176 __logfs_destroy_meta_inode(inode);
177 return;
178 }
179
180 BUG_ON(list_empty(&li->li_freeing_list));
181 spin_lock(&logfs_inode_lock);
182 li->li_refcount--;
183 if (li->li_refcount == 0)
184 __logfs_destroy_inode(inode);
185 spin_unlock(&logfs_inode_lock);
186}
187
188void logfs_safe_iput(struct inode *inode, int is_cached)
189{
190 if (inode->i_ino == LOGFS_INO_MASTER)
191 return;
192 if (inode->i_ino == LOGFS_INO_SEGFILE)
193 return;
194
195 if (is_cached) {
196 logfs_destroy_inode(inode);
197 return;
198 }
199
200 iput(inode);
201}
202
203static void logfs_init_inode(struct super_block *sb, struct inode *inode)
204{
205 struct logfs_inode *li = logfs_inode(inode);
206 int i;
207
208 li->li_flags = 0;
209 li->li_height = 0;
210 li->li_used_bytes = 0;
211 li->li_block = NULL;
212 i_uid_write(inode, 0);
213 i_gid_write(inode, 0);
214 inode->i_size = 0;
215 inode->i_blocks = 0;
216 inode->i_ctime = current_time(inode);
217 inode->i_mtime = current_time(inode);
218 li->li_refcount = 1;
219 INIT_LIST_HEAD(&li->li_freeing_list);
220
221 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
222 li->li_data[i] = 0;
223
224 return;
225}
226
227static struct inode *logfs_alloc_inode(struct super_block *sb)
228{
229 struct logfs_inode *li;
230
231 li = kmem_cache_alloc(logfs_inode_cache, GFP_NOFS);
232 if (!li)
233 return NULL;
234 logfs_init_inode(sb, &li->vfs_inode);
235 return &li->vfs_inode;
236}
237
238/*
239 * In logfs inodes are written to an inode file. The inode file, like any
240 * other file, is managed with an inode. The inode file's inode, aka master
241 * inode, requires special handling in several respects. First, it cannot be
242 * written to the inode file, so it is stored in the journal instead.
243 *
244 * Secondly, this inode cannot be written back and destroyed before all other
245 * inodes have been written. The ordering is important. Linux' VFS is happily
246 * unaware of the ordering constraint and would ordinarily destroy the master
247 * inode at umount time while other inodes are still in use and dirty. Not
248 * good.
249 *
250 * So logfs makes sure the master inode is not written until all other inodes
251 * have been destroyed. Sadly, this method has another side-effect. The VFS
252 * will notice one remaining inode and print a frightening warning message.
253 * Worse, it is impossible to judge whether such a warning was caused by the
254 * master inode or whether other inodes have leaked as well.
255 *
256 * Our attempt at solving this is logfs_new_meta_inode() below. Its
257 * purpose is to create a new inode that will not trigger the warning if such
258 * an inode is still in use. An ugly hack, no doubt. Suggestions for
259 * improvement are welcome.
260 *
261 * AV: that's what ->put_super() is for...
262 */
263struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino)
264{
265 struct inode *inode;
266
267 inode = new_inode(sb);
268 if (!inode)
269 return ERR_PTR(-ENOMEM);
270
271 inode->i_mode = S_IFREG;
272 inode->i_ino = ino;
273 inode->i_data.a_ops = &logfs_reg_aops;
274 mapping_set_gfp_mask(&inode->i_data, GFP_NOFS);
275
276 return inode;
277}
278
279struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino)
280{
281 struct inode *inode;
282 int err;
283
284 inode = logfs_new_meta_inode(sb, ino);
285 if (IS_ERR(inode))
286 return inode;
287
288 err = logfs_read_inode(inode);
289 if (err) {
290 iput(inode);
291 return ERR_PTR(err);
292 }
293 logfs_inode_setops(inode);
294 return inode;
295}
296
297static int logfs_write_inode(struct inode *inode, struct writeback_control *wbc)
298{
299 int ret;
300 long flags = WF_LOCK;
301
302 /* Can only happen if creat() failed. Safe to skip. */
303 if (logfs_inode(inode)->li_flags & LOGFS_IF_STILLBORN)
304 return 0;
305
306 ret = __logfs_write_inode(inode, NULL, flags);
307 LOGFS_BUG_ON(ret, inode->i_sb);
308 return ret;
309}
310
311/* called with inode->i_lock held */
312static int logfs_drop_inode(struct inode *inode)
313{
314 struct logfs_super *super = logfs_super(inode->i_sb);
315 struct logfs_inode *li = logfs_inode(inode);
316
317 spin_lock(&logfs_inode_lock);
318 list_move(&li->li_freeing_list, &super->s_freeing_list);
319 spin_unlock(&logfs_inode_lock);
320 return generic_drop_inode(inode);
321}
322
323static void logfs_set_ino_generation(struct super_block *sb,
324 struct inode *inode)
325{
326 struct logfs_super *super = logfs_super(sb);
327 u64 ino;
328
329 mutex_lock(&super->s_journal_mutex);
330 ino = logfs_seek_hole(super->s_master_inode, super->s_last_ino + 1);
331 super->s_last_ino = ino;
332 super->s_inos_till_wrap--;
333 if (super->s_inos_till_wrap < 0) {
334 super->s_last_ino = LOGFS_RESERVED_INOS;
335 super->s_generation++;
336 super->s_inos_till_wrap = INOS_PER_WRAP;
337 }
338 inode->i_ino = ino;
339 inode->i_generation = super->s_generation;
340 mutex_unlock(&super->s_journal_mutex);
341}
342
343struct inode *logfs_new_inode(struct inode *dir, umode_t mode)
344{
345 struct super_block *sb = dir->i_sb;
346 struct inode *inode;
347
348 inode = new_inode(sb);
349 if (!inode)
350 return ERR_PTR(-ENOMEM);
351
352 logfs_init_inode(sb, inode);
353
354 /* inherit parent flags */
355 logfs_inode(inode)->li_flags |=
356 logfs_inode(dir)->li_flags & LOGFS_FL_INHERITED;
357
358 inode->i_mode = mode;
359 logfs_set_ino_generation(sb, inode);
360
361 inode_init_owner(inode, dir, mode);
362 logfs_inode_setops(inode);
363 insert_inode_hash(inode);
364
365 return inode;
366}
367
368static void logfs_init_once(void *_li)
369{
370 struct logfs_inode *li = _li;
371 int i;
372
373 li->li_flags = 0;
374 li->li_used_bytes = 0;
375 li->li_refcount = 1;
376 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
377 li->li_data[i] = 0;
378 inode_init_once(&li->vfs_inode);
379}
380
381static int logfs_sync_fs(struct super_block *sb, int wait)
382{
383 logfs_get_wblocks(sb, NULL, WF_LOCK);
384 logfs_write_anchor(sb);
385 logfs_put_wblocks(sb, NULL, WF_LOCK);
386 return 0;
387}
388
389static void logfs_put_super(struct super_block *sb)
390{
391 struct logfs_super *super = logfs_super(sb);
392 /* kill the meta-inodes */
393 iput(super->s_segfile_inode);
394 iput(super->s_master_inode);
395 iput(super->s_mapping_inode);
396}
397
398const struct super_operations logfs_super_operations = {
399 .alloc_inode = logfs_alloc_inode,
400 .destroy_inode = logfs_destroy_inode,
401 .evict_inode = logfs_evict_inode,
402 .drop_inode = logfs_drop_inode,
403 .put_super = logfs_put_super,
404 .write_inode = logfs_write_inode,
405 .statfs = logfs_statfs,
406 .sync_fs = logfs_sync_fs,
407};
408
409int logfs_init_inode_cache(void)
410{
411 logfs_inode_cache = kmem_cache_create("logfs_inode_cache",
412 sizeof(struct logfs_inode), 0,
413 SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT,
414 logfs_init_once);
415 if (!logfs_inode_cache)
416 return -ENOMEM;
417 return 0;
418}
419
420void logfs_destroy_inode_cache(void)
421{
422 /*
423 * Make sure all delayed rcu free inodes are flushed before we
424 * destroy cache.
425 */
426 rcu_barrier();
427 kmem_cache_destroy(logfs_inode_cache);
428}
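
The INOS_PER_WRAP scheme described at the top of inode.c trades a sparse inode file against generation churn: logfs_set_ino_generation() keeps handing out increasing inode numbers and only restarts from the reserved range, under a bumped generation, after 64k allocations. A userspace sketch of that allocator (illustrative only; RESERVED_INOS and next_free() are invented stand-ins for LOGFS_RESERVED_INOS and logfs_seek_hole()):

#include <stdint.h>
#include <stdio.h>

#define INOS_PER_WRAP 0x10000
#define RESERVED_INOS 16        /* assumed value, for the sketch only */

struct ino_alloc {
	uint64_t last_ino;
	uint32_t generation;
	long inos_till_wrap;
};

/* Stand-in for logfs_seek_hole(): first unused inode number >= from. */
static uint64_t next_free(uint64_t from)
{
	return from;
}

static uint64_t new_ino(struct ino_alloc *a, uint32_t *generation)
{
	uint64_t ino = next_free(a->last_ino + 1);

	a->last_ino = ino;
	if (--a->inos_till_wrap < 0) {
		/* Wrap: reuse low inode numbers, but under a fresh
		 * generation so stale file handles can be told apart. */
		a->last_ino = RESERVED_INOS;
		a->generation++;
		a->inos_till_wrap = INOS_PER_WRAP;
	}
	*generation = a->generation;
	return ino;
}

int main(void)
{
	struct ino_alloc a = {
		.last_ino = RESERVED_INOS,
		.inos_till_wrap = INOS_PER_WRAP,
	};
	uint32_t gen;
	int i;

	for (i = 0; i < 3; i++)
		printf("ino %llu gen %u\n",
		       (unsigned long long)new_ino(&a, &gen), gen);
	return 0;
}
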
diff --git a/fs/logfs/journal.c b/fs/logfs/journal.c
deleted file mode 100644
index 2a09b8d73989..000000000000
--- a/fs/logfs/journal.c
+++ /dev/null
@@ -1,894 +0,0 @@
1/*
2 * fs/logfs/journal.c - journal handling code
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 */
8#include "logfs.h"
9#include <linux/slab.h>
10
11static void logfs_calc_free(struct super_block *sb)
12{
13 struct logfs_super *super = logfs_super(sb);
14 u64 reserve, no_segs = super->s_no_segs;
15 s64 free;
16 int i;
17
18 /* superblock segments */
19 no_segs -= 2;
20 super->s_no_journal_segs = 0;
21 /* journal */
22 journal_for_each(i)
23 if (super->s_journal_seg[i]) {
24 no_segs--;
25 super->s_no_journal_segs++;
26 }
27
28 /* open segments plus one extra per level for GC */
29 no_segs -= 2 * super->s_total_levels;
30
31 free = no_segs * (super->s_segsize - LOGFS_SEGMENT_RESERVE);
32 free -= super->s_used_bytes;
33 /* just a bit extra */
34 free -= super->s_total_levels * 4096;
35
36 /* Bad blocks are 'paid' for with speed reserve - the filesystem
37 * simply gets slower as bad blocks accumulate. Until the bad blocks
38 * exceed the speed reserve - then the filesystem gets smaller.
39 */
40 reserve = super->s_bad_segments + super->s_bad_seg_reserve;
41 reserve *= super->s_segsize - LOGFS_SEGMENT_RESERVE;
42 reserve = max(reserve, super->s_speed_reserve);
43 free -= reserve;
44 if (free < 0)
45 free = 0;
46
47 super->s_free_bytes = free;
48}
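/* A worked instance of the calculation above, with assumed numbers
 * (not taken from logfs: 1024 segments of 128KiB, 4 journal segments,
 * 7 total levels, nothing used yet, no bad segments, and a
 * LOGFS_SEGMENT_RESERVE of, say, 24 bytes):
 *
 *   no_segs = 1024 - 2 (superblocks) - 4 (journal)
 *                  - 2 * 7 (open + GC, per level)  = 1004
 *   free    = 1004 * (131072 - 24)                 = 131572192
 *   free   -= 7 * 4096 (extra)                     = 131543520
 *   free   -= max(bad-segment reserve, speed reserve)
 *
 * so roughly 125MiB of the 128MiB medium would be reported as free.
 */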
49
50static void reserve_sb_and_journal(struct super_block *sb)
51{
52 struct logfs_super *super = logfs_super(sb);
53 struct btree_head32 *head = &super->s_reserved_segments;
54 int i, err;
55
56 err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[0]), (void *)1,
57 GFP_KERNEL);
58 BUG_ON(err);
59
60 err = btree_insert32(head, seg_no(sb, super->s_sb_ofs[1]), (void *)1,
61 GFP_KERNEL);
62 BUG_ON(err);
63
64 journal_for_each(i) {
65 if (!super->s_journal_seg[i])
66 continue;
67 err = btree_insert32(head, super->s_journal_seg[i], (void *)1,
68 GFP_KERNEL);
69 BUG_ON(err);
70 }
71}
72
73static void read_dynsb(struct super_block *sb,
74 struct logfs_je_dynsb *dynsb)
75{
76 struct logfs_super *super = logfs_super(sb);
77
78 super->s_gec = be64_to_cpu(dynsb->ds_gec);
79 super->s_sweeper = be64_to_cpu(dynsb->ds_sweeper);
80 super->s_victim_ino = be64_to_cpu(dynsb->ds_victim_ino);
81 super->s_rename_dir = be64_to_cpu(dynsb->ds_rename_dir);
82 super->s_rename_pos = be64_to_cpu(dynsb->ds_rename_pos);
83 super->s_used_bytes = be64_to_cpu(dynsb->ds_used_bytes);
84 super->s_generation = be32_to_cpu(dynsb->ds_generation);
85}
86
87static void read_anchor(struct super_block *sb,
88 struct logfs_je_anchor *da)
89{
90 struct logfs_super *super = logfs_super(sb);
91 struct inode *inode = super->s_master_inode;
92 struct logfs_inode *li = logfs_inode(inode);
93 int i;
94
95 super->s_last_ino = be64_to_cpu(da->da_last_ino);
96 li->li_flags = 0;
97 li->li_height = da->da_height;
98 i_size_write(inode, be64_to_cpu(da->da_size));
99 li->li_used_bytes = be64_to_cpu(da->da_used_bytes);
100
101 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
102 li->li_data[i] = be64_to_cpu(da->da_data[i]);
103}
104
105static void read_erasecount(struct super_block *sb,
106 struct logfs_je_journal_ec *ec)
107{
108 struct logfs_super *super = logfs_super(sb);
109 int i;
110
111 journal_for_each(i)
112 super->s_journal_ec[i] = be32_to_cpu(ec->ec[i]);
113}
114
115static int read_area(struct super_block *sb, struct logfs_je_area *a)
116{
117 struct logfs_super *super = logfs_super(sb);
118 struct logfs_area *area = super->s_area[a->gc_level];
119 u64 ofs;
120 u32 writemask = ~(super->s_writesize - 1);
121
122 if (a->gc_level >= LOGFS_NO_AREAS)
123 return -EIO;
124 if (a->vim != VIM_DEFAULT)
125 return -EIO; /* TODO: close area and continue */
126
127 area->a_used_bytes = be32_to_cpu(a->used_bytes);
128 area->a_written_bytes = area->a_used_bytes & writemask;
129 area->a_segno = be32_to_cpu(a->segno);
130 if (area->a_segno)
131 area->a_is_open = 1;
132
133 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
134 if (super->s_writesize > 1)
135 return logfs_buf_recover(area, ofs, a + 1, super->s_writesize);
136 else
137 return logfs_buf_recover(area, ofs, NULL, 0);
138}
139
140static void *unpack(void *from, void *to)
141{
142 struct logfs_journal_header *jh = from;
143 void *data = from + sizeof(struct logfs_journal_header);
144 int err;
145 size_t inlen, outlen;
146
147 inlen = be16_to_cpu(jh->h_len);
148 outlen = be16_to_cpu(jh->h_datalen);
149
150 if (jh->h_compr == COMPR_NONE)
151 memcpy(to, data, inlen);
152 else {
153 err = logfs_uncompress(data, to, inlen, outlen);
154 BUG_ON(err);
155 }
156 return to;
157}
158
159static int __read_je_header(struct super_block *sb, u64 ofs,
160 struct logfs_journal_header *jh)
161{
162 struct logfs_super *super = logfs_super(sb);
163 size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize)
164 + MAX_JOURNAL_HEADER;
165 u16 type, len, datalen;
166 int err;
167
168 /* read header only */
169 err = wbuf_read(sb, ofs, sizeof(*jh), jh);
170 if (err)
171 return err;
172 type = be16_to_cpu(jh->h_type);
173 len = be16_to_cpu(jh->h_len);
174 datalen = be16_to_cpu(jh->h_datalen);
175 if (len > sb->s_blocksize)
176 return -EIO;
177 if ((type < JE_FIRST) || (type > JE_LAST))
178 return -EIO;
179 if (datalen > bufsize)
180 return -EIO;
181 return 0;
182}
183
184static int __read_je_payload(struct super_block *sb, u64 ofs,
185 struct logfs_journal_header *jh)
186{
187 u16 len;
188 int err;
189
190 len = be16_to_cpu(jh->h_len);
191 err = wbuf_read(sb, ofs + sizeof(*jh), len, jh + 1);
192 if (err)
193 return err;
194 if (jh->h_crc != logfs_crc32(jh, len + sizeof(*jh), 4)) {
195 /* Old code was confused. It forgot about the header length
196 * and stopped calculating the crc 16 bytes before the end
197 * of data - ick!
198 * FIXME: Remove this hack once the old code is fixed.
199 */
200 if (jh->h_crc == logfs_crc32(jh, len, 4))
201 WARN_ON_ONCE(1);
202 else
203 return -EIO;
204 }
205 return 0;
206}
207
208/*
209 * jh needs to be large enough to hold the complete entry, not just the header
210 */
211static int __read_je(struct super_block *sb, u64 ofs,
212 struct logfs_journal_header *jh)
213{
214 int err;
215
216 err = __read_je_header(sb, ofs, jh);
217 if (err)
218 return err;
219 return __read_je_payload(sb, ofs, jh);
220}
221
222static int read_je(struct super_block *sb, u64 ofs)
223{
224 struct logfs_super *super = logfs_super(sb);
225 struct logfs_journal_header *jh = super->s_compressed_je;
226 void *scratch = super->s_je;
227 u16 type, datalen;
228 int err;
229
230 err = __read_je(sb, ofs, jh);
231 if (err)
232 return err;
233 type = be16_to_cpu(jh->h_type);
234 datalen = be16_to_cpu(jh->h_datalen);
235
236 switch (type) {
237 case JE_DYNSB:
238 read_dynsb(sb, unpack(jh, scratch));
239 break;
240 case JE_ANCHOR:
241 read_anchor(sb, unpack(jh, scratch));
242 break;
243 case JE_ERASECOUNT:
244 read_erasecount(sb, unpack(jh, scratch));
245 break;
246 case JE_AREA:
247 err = read_area(sb, unpack(jh, scratch));
248 break;
249 case JE_OBJ_ALIAS:
250 err = logfs_load_object_aliases(sb, unpack(jh, scratch),
251 datalen);
252 break;
253 default:
254 WARN_ON_ONCE(1);
255 return -EIO;
256 }
257 return err;
258}
259
260static int logfs_read_segment(struct super_block *sb, u32 segno)
261{
262 struct logfs_super *super = logfs_super(sb);
263 struct logfs_journal_header *jh = super->s_compressed_je;
264 u64 ofs, seg_ofs = dev_ofs(sb, segno, 0);
265 u32 h_ofs, last_ofs = 0;
266 u16 len, datalen, last_len = 0;
267 int i, err;
268
269 /* search for most recent commit */
270 for (h_ofs = 0; h_ofs < super->s_segsize; h_ofs += sizeof(*jh)) {
271 ofs = seg_ofs + h_ofs;
272 err = __read_je_header(sb, ofs, jh);
273 if (err)
274 continue;
275 if (jh->h_type != cpu_to_be16(JE_COMMIT))
276 continue;
277 err = __read_je_payload(sb, ofs, jh);
278 if (err)
279 continue;
280 len = be16_to_cpu(jh->h_len);
281 datalen = be16_to_cpu(jh->h_datalen);
282 if ((datalen > sizeof(super->s_je_array)) ||
283 (datalen % sizeof(__be64)))
284 continue;
285 last_ofs = h_ofs;
286 last_len = datalen;
287 h_ofs += ALIGN(len, sizeof(*jh)) - sizeof(*jh);
288 }
289 /* read commit */
290 if (last_ofs == 0)
291 return -ENOENT;
292 ofs = seg_ofs + last_ofs;
293 log_journal("Read commit from %llx\n", ofs);
294 err = __read_je(sb, ofs, jh);
295 BUG_ON(err); /* We should have caught it in the scan loop already */
296 if (err)
297 return err;
298 /* uncompress */
299 unpack(jh, super->s_je_array);
300 super->s_no_je = last_len / sizeof(__be64);
301 /* iterate over array */
302 for (i = 0; i < super->s_no_je; i++) {
303 err = read_je(sb, be64_to_cpu(super->s_je_array[i]));
304 if (err)
305 return err;
306 }
307 super->s_journal_area->a_segno = segno;
308 return 0;
309}
310
311static u64 read_gec(struct super_block *sb, u32 segno)
312{
313 struct logfs_segment_header sh;
314 __be32 crc;
315 int err;
316
317 if (!segno)
318 return 0;
319 err = wbuf_read(sb, dev_ofs(sb, segno, 0), sizeof(sh), &sh);
320 if (err)
321 return 0;
322 crc = logfs_crc32(&sh, sizeof(sh), 4);
323 if (crc != sh.crc) {
324 WARN_ON(sh.gec != cpu_to_be64(0xffffffffffffffffull));
325 /* Most likely it was just erased */
326 return 0;
327 }
328 return be64_to_cpu(sh.gec);
329}
330
331static int logfs_read_journal(struct super_block *sb)
332{
333 struct logfs_super *super = logfs_super(sb);
334 u64 gec[LOGFS_JOURNAL_SEGS], max;
335 u32 segno;
336 int i, max_i;
337
338 max = 0;
339 max_i = -1;
340 journal_for_each(i) {
341 segno = super->s_journal_seg[i];
342 gec[i] = read_gec(sb, super->s_journal_seg[i]);
343 if (gec[i] > max) {
344 max = gec[i];
345 max_i = i;
346 }
347 }
348 if (max_i == -1)
349 return -EIO;
350 /* FIXME: Try older segments in case of error */
351 return logfs_read_segment(sb, super->s_journal_seg[max_i]);
352}
353
354/*
355 * First search the current segment (outer loop), then pick the next segment
356 * in the array, skipping any zero entries (inner loop).
357 */
358static void journal_get_free_segment(struct logfs_area *area)
359{
360 struct logfs_super *super = logfs_super(area->a_sb);
361 int i;
362
363 journal_for_each(i) {
364 if (area->a_segno != super->s_journal_seg[i])
365 continue;
366
367 do {
368 i++;
369 if (i == LOGFS_JOURNAL_SEGS)
370 i = 0;
371 } while (!super->s_journal_seg[i]);
372
373 area->a_segno = super->s_journal_seg[i];
374 area->a_erase_count = ++(super->s_journal_ec[i]);
375 log_journal("Journal now at %x (ec %x)\n", area->a_segno,
376 area->a_erase_count);
377 return;
378 }
379 BUG();
380}
381
382static void journal_get_erase_count(struct logfs_area *area)
383{
384 /* erase count is stored globally and incremented in
385 * journal_get_free_segment() - nothing to do here */
386}
387
388static int journal_erase_segment(struct logfs_area *area)
389{
390 struct super_block *sb = area->a_sb;
391 union {
392 struct logfs_segment_header sh;
393 unsigned char c[ALIGN(sizeof(struct logfs_segment_header), 16)];
394 } u;
395 u64 ofs;
396 int err;
397
398 err = logfs_erase_segment(sb, area->a_segno, 1);
399 if (err)
400 return err;
401
402 memset(&u, 0, sizeof(u));
403 u.sh.pad = 0;
404 u.sh.type = SEG_JOURNAL;
405 u.sh.level = 0;
406 u.sh.segno = cpu_to_be32(area->a_segno);
407 u.sh.ec = cpu_to_be32(area->a_erase_count);
408 u.sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
409 u.sh.crc = logfs_crc32(&u.sh, sizeof(u.sh), 4);
410
411 /* This causes a bug in segment.c. Not yet. */
412 //logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count, 0);
413
414 ofs = dev_ofs(sb, area->a_segno, 0);
415 area->a_used_bytes = sizeof(u);
416 logfs_buf_write(area, ofs, &u, sizeof(u));
417 return 0;
418}
419
420static size_t __logfs_write_header(struct logfs_super *super,
421 struct logfs_journal_header *jh, size_t len, size_t datalen,
422 u16 type, u8 compr)
423{
424 jh->h_len = cpu_to_be16(len);
425 jh->h_type = cpu_to_be16(type);
426 jh->h_datalen = cpu_to_be16(datalen);
427 jh->h_compr = compr;
428 jh->h_pad[0] = 'H';
429 jh->h_pad[1] = 'E';
430 jh->h_pad[2] = 'A';
431 jh->h_pad[3] = 'D';
432 jh->h_pad[4] = 'R';
433 jh->h_crc = logfs_crc32(jh, len + sizeof(*jh), 4);
434 return ALIGN(len, 16) + sizeof(*jh);
435}
436
437static size_t logfs_write_header(struct logfs_super *super,
438 struct logfs_journal_header *jh, size_t datalen, u16 type)
439{
440 size_t len = datalen;
441
442 return __logfs_write_header(super, jh, len, datalen, type, COMPR_NONE);
443}
444
445static inline size_t logfs_journal_erasecount_size(struct logfs_super *super)
446{
447 return LOGFS_JOURNAL_SEGS * sizeof(__be32);
448}
449
450static void *logfs_write_erasecount(struct super_block *sb, void *_ec,
451 u16 *type, size_t *len)
452{
453 struct logfs_super *super = logfs_super(sb);
454 struct logfs_je_journal_ec *ec = _ec;
455 int i;
456
457 journal_for_each(i)
458 ec->ec[i] = cpu_to_be32(super->s_journal_ec[i]);
459 *type = JE_ERASECOUNT;
460 *len = logfs_journal_erasecount_size(super);
461 return ec;
462}
463
464static void account_shadow(void *_shadow, unsigned long _sb, u64 ignore,
465 size_t ignore2)
466{
467 struct logfs_shadow *shadow = _shadow;
468 struct super_block *sb = (void *)_sb;
469 struct logfs_super *super = logfs_super(sb);
470
471 /* consume new space */
472 super->s_free_bytes -= shadow->new_len;
473 super->s_used_bytes += shadow->new_len;
474 super->s_dirty_used_bytes -= shadow->new_len;
475
476 /* free up old space */
477 super->s_free_bytes += shadow->old_len;
478 super->s_used_bytes -= shadow->old_len;
479 super->s_dirty_free_bytes -= shadow->old_len;
480
481 logfs_set_segment_used(sb, shadow->old_ofs, -shadow->old_len);
482 logfs_set_segment_used(sb, shadow->new_ofs, shadow->new_len);
483
484 log_journal("account_shadow(%llx, %llx, %x) %llx->%llx %x->%x\n",
485 shadow->ino, shadow->bix, shadow->gc_level,
486 shadow->old_ofs, shadow->new_ofs,
487 shadow->old_len, shadow->new_len);
488 mempool_free(shadow, super->s_shadow_pool);
489}
490
491static void account_shadows(struct super_block *sb)
492{
493 struct logfs_super *super = logfs_super(sb);
494 struct inode *inode = super->s_master_inode;
495 struct logfs_inode *li = logfs_inode(inode);
496 struct shadow_tree *tree = &super->s_shadow_tree;
497
498 btree_grim_visitor64(&tree->new, (unsigned long)sb, account_shadow);
499 btree_grim_visitor64(&tree->old, (unsigned long)sb, account_shadow);
500 btree_grim_visitor32(&tree->segment_map, 0, NULL);
501 tree->no_shadowed_segments = 0;
502
503 if (li->li_block) {
504 /*
505 * We never actually use the structure, when attached to the
506 * master inode. But it is easier to always free it here than
507 * to have checks in several places elsewhere when allocating
508 * it.
509 */
510 li->li_block->ops->free_block(sb, li->li_block);
511 }
512 BUG_ON((s64)li->li_used_bytes < 0);
513}
514
515static void *__logfs_write_anchor(struct super_block *sb, void *_da,
516 u16 *type, size_t *len)
517{
518 struct logfs_super *super = logfs_super(sb);
519 struct logfs_je_anchor *da = _da;
520 struct inode *inode = super->s_master_inode;
521 struct logfs_inode *li = logfs_inode(inode);
522 int i;
523
524 da->da_height = li->li_height;
525 da->da_last_ino = cpu_to_be64(super->s_last_ino);
526 da->da_size = cpu_to_be64(i_size_read(inode));
527 da->da_used_bytes = cpu_to_be64(li->li_used_bytes);
528 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
529 da->da_data[i] = cpu_to_be64(li->li_data[i]);
530 *type = JE_ANCHOR;
531 *len = sizeof(*da);
532 return da;
533}
534
535static void *logfs_write_dynsb(struct super_block *sb, void *_dynsb,
536 u16 *type, size_t *len)
537{
538 struct logfs_super *super = logfs_super(sb);
539 struct logfs_je_dynsb *dynsb = _dynsb;
540
541 dynsb->ds_gec = cpu_to_be64(super->s_gec);
542 dynsb->ds_sweeper = cpu_to_be64(super->s_sweeper);
543 dynsb->ds_victim_ino = cpu_to_be64(super->s_victim_ino);
544 dynsb->ds_rename_dir = cpu_to_be64(super->s_rename_dir);
545 dynsb->ds_rename_pos = cpu_to_be64(super->s_rename_pos);
546 dynsb->ds_used_bytes = cpu_to_be64(super->s_used_bytes);
547 dynsb->ds_generation = cpu_to_be32(super->s_generation);
548 *type = JE_DYNSB;
549 *len = sizeof(*dynsb);
550 return dynsb;
551}
552
553static void write_wbuf(struct super_block *sb, struct logfs_area *area,
554 void *wbuf)
555{
556 struct logfs_super *super = logfs_super(sb);
557 struct address_space *mapping = super->s_mapping_inode->i_mapping;
558 u64 ofs;
559 pgoff_t index;
560 int page_ofs;
561 struct page *page;
562
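	/* Round down to the start of the current, partially filled write unit. */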
563 ofs = dev_ofs(sb, area->a_segno,
564 area->a_used_bytes & ~(super->s_writesize - 1));
565 index = ofs >> PAGE_SHIFT;
566 page_ofs = ofs & (PAGE_SIZE - 1);
567
568 page = find_or_create_page(mapping, index, GFP_NOFS);
569 BUG_ON(!page);
570 memcpy(wbuf, page_address(page) + page_ofs, super->s_writesize);
571 unlock_page(page);
572}
573
574static void *logfs_write_area(struct super_block *sb, void *_a,
575 u16 *type, size_t *len)
576{
577 struct logfs_super *super = logfs_super(sb);
578 struct logfs_area *area = super->s_area[super->s_sum_index];
579 struct logfs_je_area *a = _a;
580
581 a->vim = VIM_DEFAULT;
582 a->gc_level = super->s_sum_index;
583 a->used_bytes = cpu_to_be32(area->a_used_bytes);
584 a->segno = cpu_to_be32(area->a_segno);
585 if (super->s_writesize > 1)
586 write_wbuf(sb, area, a + 1);
587
588 *type = JE_AREA;
589 *len = sizeof(*a) + super->s_writesize;
590 return a;
591}
592
593static void *logfs_write_commit(struct super_block *sb, void *h,
594 u16 *type, size_t *len)
595{
596 struct logfs_super *super = logfs_super(sb);
597
598 *type = JE_COMMIT;
599 *len = super->s_no_je * sizeof(__be64);
600 return super->s_je_array;
601}
602
603static size_t __logfs_write_je(struct super_block *sb, void *buf, u16 type,
604 size_t len)
605{
606 struct logfs_super *super = logfs_super(sb);
607 void *header = super->s_compressed_je;
608 void *data = header + sizeof(struct logfs_journal_header);
609 ssize_t compr_len, pad_len;
610 u8 compr = COMPR_ZLIB;
611
612 if (len == 0)
613 return logfs_write_header(super, header, 0, type);
614
615 compr_len = logfs_compress(buf, data, len, sb->s_blocksize);
616 if (compr_len < 0 || type == JE_ANCHOR) {
617 memcpy(data, buf, len);
618 compr_len = len;
619 compr = COMPR_NONE;
620 }
621
622 pad_len = ALIGN(compr_len, 16);
623 memset(data + compr_len, 0, pad_len - compr_len);
624
625 return __logfs_write_header(super, header, compr_len, len, type, compr);
626}
627
628static s64 logfs_get_free_bytes(struct logfs_area *area, size_t *bytes,
629 int must_pad)
630{
631 u32 writesize = logfs_super(area->a_sb)->s_writesize;
632 s32 ofs;
633 int ret;
634
635 ret = logfs_open_area(area, *bytes);
636 if (ret)
637 return -EAGAIN;
638
639 ofs = area->a_used_bytes;
640 area->a_used_bytes += *bytes;
641
642 if (must_pad) {
643 area->a_used_bytes = ALIGN(area->a_used_bytes, writesize);
644 *bytes = area->a_used_bytes - ofs;
645 }
646
647 return dev_ofs(area->a_sb, area->a_segno, ofs);
648}
649
650static int logfs_write_je_buf(struct super_block *sb, void *buf, u16 type,
651 size_t buf_len)
652{
653 struct logfs_super *super = logfs_super(sb);
654 struct logfs_area *area = super->s_journal_area;
655 struct logfs_journal_header *jh = super->s_compressed_je;
656 size_t len;
657 int must_pad = 0;
658 s64 ofs;
659
660 len = __logfs_write_je(sb, buf, type, buf_len);
661 if (jh->h_type == cpu_to_be16(JE_COMMIT))
662 must_pad = 1;
663
664 ofs = logfs_get_free_bytes(area, &len, must_pad);
665 if (ofs < 0)
666 return ofs;
667 logfs_buf_write(area, ofs, super->s_compressed_je, len);
668 BUG_ON(super->s_no_je >= MAX_JOURNAL_ENTRIES);
669 super->s_je_array[super->s_no_je++] = cpu_to_be64(ofs);
670 return 0;
671}
672
673static int logfs_write_je(struct super_block *sb,
674 void* (*write)(struct super_block *sb, void *scratch,
675 u16 *type, size_t *len))
676{
677 void *buf;
678 size_t len;
679 u16 type;
680
681 buf = write(sb, logfs_super(sb)->s_je, &type, &len);
682 return logfs_write_je_buf(sb, buf, type, len);
683}
684
685int write_alias_journal(struct super_block *sb, u64 ino, u64 bix,
686 level_t level, int child_no, __be64 val)
687{
688 struct logfs_super *super = logfs_super(sb);
689 struct logfs_obj_alias *oa = super->s_je;
690 int err = 0, fill = super->s_je_fill;
691
692 log_aliases("logfs_write_obj_aliases #%x(%llx, %llx, %x, %x) %llx\n",
693 fill, ino, bix, level, child_no, be64_to_cpu(val));
694 oa[fill].ino = cpu_to_be64(ino);
695 oa[fill].bix = cpu_to_be64(bix);
696 oa[fill].val = val;
697 oa[fill].level = (__force u8)level;
698 oa[fill].child_no = cpu_to_be16(child_no);
699 fill++;
700 if (fill >= sb->s_blocksize / sizeof(*oa)) {
701 err = logfs_write_je_buf(sb, oa, JE_OBJ_ALIAS, sb->s_blocksize);
702 fill = 0;
703 }
704
705 super->s_je_fill = fill;
706 return err;
707}
708
709static int logfs_write_obj_aliases(struct super_block *sb)
710{
711 struct logfs_super *super = logfs_super(sb);
712 int err;
713
714 log_journal("logfs_write_obj_aliases: %d aliases to write\n",
715 super->s_no_object_aliases);
716 super->s_je_fill = 0;
717 err = logfs_write_obj_aliases_pagecache(sb);
718 if (err)
719 return err;
720
721 if (super->s_je_fill)
722 err = logfs_write_je_buf(sb, super->s_je, JE_OBJ_ALIAS,
723 super->s_je_fill
724 * sizeof(struct logfs_obj_alias));
725 return err;
726}
727
728/*
729 * Write all journal entries. The goto logic ensures that all journal entries
730 * are written whenever a new segment is used. It is ugly and potentially a
731 * bit wasteful, but robustness is more important. With this we can *always*
732 * erase all journal segments except the one containing the most recent commit.
733 */
734void logfs_write_anchor(struct super_block *sb)
735{
736 struct logfs_super *super = logfs_super(sb);
737 struct logfs_area *area = super->s_journal_area;
738 int i, err;
739
740 if (!(super->s_flags & LOGFS_SB_FLAG_DIRTY))
741 return;
742 super->s_flags &= ~LOGFS_SB_FLAG_DIRTY;
743
744 BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
745 mutex_lock(&super->s_journal_mutex);
746
747 /* Do this first or suffer corruption */
748 logfs_sync_segments(sb);
749 account_shadows(sb);
750
751again:
752 super->s_no_je = 0;
753 for_each_area(i) {
754 if (!super->s_area[i]->a_is_open)
755 continue;
756 super->s_sum_index = i;
757 err = logfs_write_je(sb, logfs_write_area);
758 if (err)
759 goto again;
760 }
761 err = logfs_write_obj_aliases(sb);
762 if (err)
763 goto again;
764 err = logfs_write_je(sb, logfs_write_erasecount);
765 if (err)
766 goto again;
767 err = logfs_write_je(sb, __logfs_write_anchor);
768 if (err)
769 goto again;
770 err = logfs_write_je(sb, logfs_write_dynsb);
771 if (err)
772 goto again;
773 /*
774 * Order is imperative. First we sync all writes, including the
775 * non-committed journal writes. Then we write the final commit and
776 * sync the current journal segment.
777 * There is a theoretical bug here. Syncing the journal segment will
778 * write a number of journal entries and the final commit. All these
779 * are written in a single operation. If the device layer writes the
780 * data back-to-front, the commit will precede the other journal
781 * entries, leaving a race window.
782 * Two fixes are possible. Preferred is to fix the device layer to
783 * ensure writes happen front-to-back. Alternatively we can insert
784 * another logfs_sync_area() super->s_devops->sync() combo before
785 * writing the commit.
786 */
787 /*
788 * On another subject, super->s_devops->sync is usually not necessary.
789 * Unless called from sys_sync or friends, a barrier would suffice.
790 */
791 super->s_devops->sync(sb);
792 err = logfs_write_je(sb, logfs_write_commit);
793 if (err)
794 goto again;
795 log_journal("Write commit to %llx\n",
796 be64_to_cpu(super->s_je_array[super->s_no_je - 1]));
797 logfs_sync_area(area);
798 BUG_ON(area->a_used_bytes != area->a_written_bytes);
799 super->s_devops->sync(sb);
800
801 mutex_unlock(&super->s_journal_mutex);
802 return;
803}
804
805void do_logfs_journal_wl_pass(struct super_block *sb)
806{
807 struct logfs_super *super = logfs_super(sb);
808 struct logfs_area *area = super->s_journal_area;
809 struct btree_head32 *head = &super->s_reserved_segments;
810 u32 segno, ec;
811 int i, err;
812
813 log_journal("Journal requires wear-leveling.\n");
814 /* Drop old segments */
815 journal_for_each(i)
816 if (super->s_journal_seg[i]) {
817 btree_remove32(head, super->s_journal_seg[i]);
818 logfs_set_segment_unreserved(sb,
819 super->s_journal_seg[i],
820 super->s_journal_ec[i]);
821 super->s_journal_seg[i] = 0;
822 super->s_journal_ec[i] = 0;
823 }
824 /* Get new segments */
825 for (i = 0; i < super->s_no_journal_segs; i++) {
826 segno = get_best_cand(sb, &super->s_reserve_list, &ec);
827 super->s_journal_seg[i] = segno;
828 super->s_journal_ec[i] = ec;
829 logfs_set_segment_reserved(sb, segno);
830 err = btree_insert32(head, segno, (void *)1, GFP_NOFS);
831 BUG_ON(err); /* mempool should prevent this */
832 err = logfs_erase_segment(sb, segno, 1);
833 BUG_ON(err); /* FIXME: remount-ro would be nicer */
834 }
835 /* Manually move journal_area */
836 freeseg(sb, area->a_segno);
837 area->a_segno = super->s_journal_seg[0];
838 area->a_is_open = 0;
839 area->a_used_bytes = 0;
840 /* Write journal */
841 logfs_write_anchor(sb);
842 /* Write superblocks */
843 err = logfs_write_sb(sb);
844 BUG_ON(err);
845}
846
847static const struct logfs_area_ops journal_area_ops = {
848 .get_free_segment = journal_get_free_segment,
849 .get_erase_count = journal_get_erase_count,
850 .erase_segment = journal_erase_segment,
851};
852
853int logfs_init_journal(struct super_block *sb)
854{
855 struct logfs_super *super = logfs_super(sb);
856 size_t bufsize = max_t(size_t, sb->s_blocksize, super->s_writesize)
857 + MAX_JOURNAL_HEADER;
858 int ret = -ENOMEM;
859
860 mutex_init(&super->s_journal_mutex);
861 btree_init_mempool32(&super->s_reserved_segments, super->s_btree_pool);
862
863 super->s_je = kzalloc(bufsize, GFP_KERNEL);
864 if (!super->s_je)
865 return ret;
866
867 super->s_compressed_je = kzalloc(bufsize, GFP_KERNEL);
868 if (!super->s_compressed_je)
869 return ret;
870
871 super->s_master_inode = logfs_new_meta_inode(sb, LOGFS_INO_MASTER);
872 if (IS_ERR(super->s_master_inode))
873 return PTR_ERR(super->s_master_inode);
874
875 ret = logfs_read_journal(sb);
876 if (ret)
877 return -EIO;
878
879 reserve_sb_and_journal(sb);
880 logfs_calc_free(sb);
881
882 super->s_journal_area->a_ops = &journal_area_ops;
883 return 0;
884}
885
886void logfs_cleanup_journal(struct super_block *sb)
887{
888 struct logfs_super *super = logfs_super(sb);
889
890 btree_grim_visitor32(&super->s_reserved_segments, 0, NULL);
891
892 kfree(super->s_compressed_je);
893 kfree(super->s_je);
894}
diff --git a/fs/logfs/logfs.h b/fs/logfs/logfs.h
deleted file mode 100644
index 27d040e35faa..000000000000
--- a/fs/logfs/logfs.h
+++ /dev/null
@@ -1,735 +0,0 @@
1/*
2 * fs/logfs/logfs.h
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Private header for logfs.
9 */
10#ifndef FS_LOGFS_LOGFS_H
11#define FS_LOGFS_LOGFS_H
12
13#undef __CHECK_ENDIAN__
14#define __CHECK_ENDIAN__
15
16#include <linux/btree.h>
17#include <linux/crc32.h>
18#include <linux/fs.h>
19#include <linux/kernel.h>
20#include <linux/mempool.h>
21#include <linux/pagemap.h>
22#include <linux/mtd/mtd.h>
23#include "logfs_abi.h"
24
25#define LOGFS_DEBUG_SUPER (0x0001)
26#define LOGFS_DEBUG_SEGMENT (0x0002)
27#define LOGFS_DEBUG_JOURNAL (0x0004)
28#define LOGFS_DEBUG_DIR (0x0008)
29#define LOGFS_DEBUG_FILE (0x0010)
30#define LOGFS_DEBUG_INODE (0x0020)
31#define LOGFS_DEBUG_READWRITE (0x0040)
32#define LOGFS_DEBUG_GC (0x0080)
33#define LOGFS_DEBUG_GC_NOISY (0x0100)
34#define LOGFS_DEBUG_ALIASES (0x0200)
35#define LOGFS_DEBUG_BLOCKMOVE (0x0400)
36#define LOGFS_DEBUG_ALL (0xffffffff)
37
38#define LOGFS_DEBUG (0x01)
39/*
40 * To enable specific log messages, simply define LOGFS_DEBUG to match any
41 * or all of the above.
42 */
43#ifndef LOGFS_DEBUG
44#define LOGFS_DEBUG (0)
45#endif
46
47#define log_cond(cond, fmt, arg...) do { \
48 if (cond) \
49 printk(KERN_DEBUG fmt, ##arg); \
50} while (0)
51
52#define log_super(fmt, arg...) \
53 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SUPER, fmt, ##arg)
54#define log_segment(fmt, arg...) \
55 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_SEGMENT, fmt, ##arg)
56#define log_journal(fmt, arg...) \
57 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_JOURNAL, fmt, ##arg)
58#define log_dir(fmt, arg...) \
59 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_DIR, fmt, ##arg)
60#define log_file(fmt, arg...) \
61 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_FILE, fmt, ##arg)
62#define log_inode(fmt, arg...) \
63 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_INODE, fmt, ##arg)
64#define log_readwrite(fmt, arg...) \
65 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_READWRITE, fmt, ##arg)
66#define log_gc(fmt, arg...) \
67 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC, fmt, ##arg)
68#define log_gc_noisy(fmt, arg...) \
69 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_GC_NOISY, fmt, ##arg)
70#define log_aliases(fmt, arg...) \
71 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_ALIASES, fmt, ##arg)
72#define log_blockmove(fmt, arg...) \
73 log_cond(LOGFS_DEBUG & LOGFS_DEBUG_BLOCKMOVE, fmt, ##arg)
74
75#define PG_pre_locked PG_owner_priv_1
76#define PagePreLocked(page) test_bit(PG_pre_locked, &(page)->flags)
77#define SetPagePreLocked(page) set_bit(PG_pre_locked, &(page)->flags)
78#define ClearPagePreLocked(page) clear_bit(PG_pre_locked, &(page)->flags)
79
80/* FIXME: This should really be somewhere in the 64bit area. */
81#define LOGFS_LINK_MAX (1<<30)
82
83/* Read-only filesystem */
84#define LOGFS_SB_FLAG_RO 0x0001
85#define LOGFS_SB_FLAG_DIRTY 0x0002
86#define LOGFS_SB_FLAG_OBJ_ALIAS 0x0004
87#define LOGFS_SB_FLAG_SHUTDOWN 0x0008
88
89/* Write Control Flags */
90#define WF_LOCK 0x01 /* take write lock */
91#define WF_WRITE 0x02 /* write block */
92#define WF_DELETE 0x04 /* delete old block */
93
94typedef u8 __bitwise level_t;
95typedef u8 __bitwise gc_level_t;
96
97#define LEVEL(level) ((__force level_t)(level))
98#define GC_LEVEL(gc_level) ((__force gc_level_t)(gc_level))
99
100#define SUBLEVEL(level) ( (void)((level) == LEVEL(1)), \
101 (__force level_t)((__force u8)(level) - 1) )
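/*
 * Note on SUBLEVEL(): the "(void)((level) == LEVEL(1))" term generates no
 * code; it merely forces a typecheck so that sparse can warn when the
 * macro is applied to anything but a level_t.
 */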
102
103/**
104 * struct logfs_area - area management information
105 *
106 * @a_sb: the superblock this area belongs to
107 * @a_is_open: 1 if the area is currently open, else 0
108 * @a_segno: segment number of area
109 * @a_written_bytes: number of bytes already written back
110 * @a_used_bytes: number of used bytes
111 * @a_ops: area operations (either journal or ostore)
112 * @a_erase_count: erase count
113 * @a_level: GC level
114 */
115struct logfs_area { /* a segment open for writing */
116 struct super_block *a_sb;
117 int a_is_open;
118 u32 a_segno;
119 u32 a_written_bytes;
120 u32 a_used_bytes;
121 const struct logfs_area_ops *a_ops;
122 u32 a_erase_count;
123 gc_level_t a_level;
124};
125
126/**
127 * struct logfs_area_ops - area operations
128 *
129 * @get_free_segment: fill area->a_segno with the number of a free segment
130 * @get_erase_count: fill area->a_erase_count (needs area->a_segno)
131 * @erase_segment: erase and setup segment
132 */
133struct logfs_area_ops {
134 void (*get_free_segment)(struct logfs_area *area);
135 void (*get_erase_count)(struct logfs_area *area);
136 int (*erase_segment)(struct logfs_area *area);
137};
138
139struct logfs_super; /* forward */
140/**
141 * struct logfs_device_ops - device access operations
142 *
143 * @readpage: read one page (mm page)
144 * @writeseg: write one segment. may be a partial segment
145 * @erase: erase one segment
146 * @read: read from the device
147 * @erase: erase part of the device
148 * @can_write_buf: decide whether wbuf can be written to ofs
149 */
150struct logfs_device_ops {
151 struct page *(*find_first_sb)(struct super_block *sb, u64 *ofs);
152 struct page *(*find_last_sb)(struct super_block *sb, u64 *ofs);
153 int (*write_sb)(struct super_block *sb, struct page *page);
154 int (*readpage)(void *_sb, struct page *page);
155 void (*writeseg)(struct super_block *sb, u64 ofs, size_t len);
156 int (*erase)(struct super_block *sb, loff_t ofs, size_t len,
157 int ensure_write);
158 int (*can_write_buf)(struct super_block *sb, u64 ofs);
159 void (*sync)(struct super_block *sb);
160 void (*put_device)(struct logfs_super *s);
161};
162
163/**
164 * struct candidate_list - list of similar candidates
165 */
166struct candidate_list {
167 struct rb_root rb_tree;
168 int count;
169 int maxcount;
170 int sort_by_ec;
171};
172
173/**
174 * struct gc_candidate - "candidate" segment to be garbage collected next
175 *
176 * @list: list (either free or low)
177 * @segno: segment number
178 * @valid: number of valid bytes
179 * @erase_count: erase count of segment
180 * @dist: distance from tree root
181 *
182 * Candidates can be on two lists. The free list contains electees rather
183 * than candidates - segments that no longer contain any valid data. The
184 * low list contains candidates to be picked for GC. It should be kept
185 * short. It is not required to always pick a perfect candidate. In the
186 * worst case GC will have to move more data than absolutely necessary.
187 */
188struct gc_candidate {
189 struct rb_node rb_node;
190 struct candidate_list *list;
191 u32 segno;
192 u32 valid;
193 u32 erase_count;
194 u8 dist;
195};
196
197/**
198 * struct logfs_journal_entry - temporary structure used during journal scan
199 *
200 * @used: non-zero while the entry is in use
201 * @version: normalized version
202 * @len: length of the compressed entry
203 * @offset: device offset
204 */
205struct logfs_journal_entry {
206 int used;
207 s16 version;
208 u16 len;
209 u16 datalen;
210 u64 offset;
211};
212
213enum transaction_state {
214 CREATE_1 = 1,
215 CREATE_2,
216 UNLINK_1,
217 UNLINK_2,
218 CROSS_RENAME_1,
219 CROSS_RENAME_2,
220 TARGET_RENAME_1,
221 TARGET_RENAME_2,
222 TARGET_RENAME_3
223};
224
225/**
226 * struct logfs_transaction - essential fields to support atomic dirops
227 *
228 * @ino: target inode
229 * @dir: inode of directory containing dentry
230 * @pos: pos of dentry in directory
231 */
232struct logfs_transaction {
233 enum transaction_state state;
234 u64 ino;
235 u64 dir;
236 u64 pos;
237};
238
239/**
240 * struct logfs_shadow - old block in the shadow of a not-yet-committed new one
241 * @old_ofs: offset of old block on medium
242 * @new_ofs: offset of new block on medium
243 * @ino: inode number
244 * @bix: block index
245 * @old_len: size of old block, including header
246 * @new_len: size of new block, including header
247 * @level: block level
248 */
249struct logfs_shadow {
250 u64 old_ofs;
251 u64 new_ofs;
252 u64 ino;
253 u64 bix;
254 int old_len;
255 int new_len;
256 gc_level_t gc_level;
257};
258
259/**
260 * struct shadow_tree
261 * @new: shadows where old_ofs==0, indexed by new_ofs
262 * @old: shadows where old_ofs!=0, indexed by old_ofs
263 * @segment_map: bitfield of segments containing shadows
264 * @no_shadowed_segments: number of segments containing shadows
265 */
266struct shadow_tree {
267 struct btree_head64 new;
268 struct btree_head64 old;
269 struct btree_head32 segment_map;
270 int no_shadowed_segments;
271};
272
273struct object_alias_item {
274 struct list_head list;
275 __be64 val;
276 int child_no;
277};
278
279/**
280 * struct logfs_block - contains any block state
281 * @type: indirect block or inode
282 * @full: number of fully populated children
283 * @partial: number of partially populated children
284 *
285 * Most blocks are directly represented by page cache pages. But when a block
286 * becomes dirty, is part of a transaction, contains aliases or is otherwise
287 * special, a struct logfs_block is allocated to track the additional state.
288 * Inodes are very similar to indirect blocks, so they can also get one of
289 * these structures added when appropriate.
290 */
291#define BLOCK_INDIRECT 1 /* Indirect block */
292#define BLOCK_INODE 2 /* Inode */
293struct logfs_block_ops;
294struct logfs_block {
295 struct list_head alias_list;
296 struct list_head item_list;
297 struct super_block *sb;
298 u64 ino;
299 u64 bix;
300 level_t level;
301 struct page *page;
302 struct inode *inode;
303 struct logfs_transaction *ta;
304 unsigned long alias_map[LOGFS_BLOCK_FACTOR / BITS_PER_LONG];
305 const struct logfs_block_ops *ops;
306 int full;
307 int partial;
308 int reserved_bytes;
309};
310
311typedef int write_alias_t(struct super_block *sb, u64 ino, u64 bix,
312 level_t level, int child_no, __be64 val);
313struct logfs_block_ops {
314 void (*write_block)(struct logfs_block *block);
315 void (*free_block)(struct super_block *sb, struct logfs_block*block);
316 int (*write_alias)(struct super_block *sb,
317 struct logfs_block *block,
318 write_alias_t *write_one_alias);
319};
320
321#define MAX_JOURNAL_ENTRIES 256
322
323struct logfs_super {
324 struct mtd_info *s_mtd; /* underlying device */
325 struct block_device *s_bdev; /* underlying device */
326 const struct logfs_device_ops *s_devops;/* device access */
327 struct inode *s_master_inode; /* inode file */
328 struct inode *s_segfile_inode; /* segment file */
329 struct inode *s_mapping_inode; /* device mapping */
330 atomic_t s_pending_writes; /* outstanding bios */
331 long s_flags;
332 mempool_t *s_btree_pool; /* for btree nodes */
333 mempool_t *s_alias_pool; /* aliases in segment.c */
334 u64 s_feature_incompat;
335 u64 s_feature_ro_compat;
336 u64 s_feature_compat;
337 u64 s_feature_flags;
338 u64 s_sb_ofs[2];
339 struct page *s_erase_page; /* for dev_bdev.c */
340 /* alias.c fields */
341 struct btree_head32 s_segment_alias; /* remapped segments */
342 int s_no_object_aliases;
343 struct list_head s_object_alias; /* remapped objects */
344 struct btree_head128 s_object_alias_tree; /* remapped objects */
345 struct mutex s_object_alias_mutex;
346 /* dir.c fields */
347 struct mutex s_dirop_mutex; /* for creat/unlink/rename */
348 u64 s_victim_ino; /* used for atomic dir-ops */
349 u64 s_rename_dir; /* source directory ino */
350 u64 s_rename_pos; /* position of source dd */
351 /* gc.c fields */
352 long s_segsize; /* size of a segment */
353 int s_segshift; /* log2 of segment size */
354 long s_segmask; /* (1 << s_segshift) - 1 */
355 long s_no_segs; /* segments on device */
356 long s_no_journal_segs; /* segments used for journal */
357 long s_no_blocks; /* blocks per segment */
358 long s_writesize; /* minimum write size */
359 int s_writeshift; /* log2 of write size */
360 u64 s_size; /* filesystem size */
361 struct logfs_area *s_area[LOGFS_NO_AREAS]; /* open segment array */
362 u64 s_gec; /* global erase count */
363 u64 s_wl_gec_ostore; /* time of last wl event */
364 u64 s_wl_gec_journal; /* time of last wl event */
365 u64 s_sweeper; /* current sweeper pos */
366 u8 s_ifile_levels; /* max level of ifile */
367 u8 s_iblock_levels; /* max level of regular files */
368 u8 s_data_levels; /* # of segments to leaf block*/
369 u8 s_total_levels; /* sum of above three */
370 struct btree_head32 s_cand_tree; /* all candidates */
371 struct candidate_list s_free_list; /* 100% free segments */
372 struct candidate_list s_reserve_list; /* Bad segment reserve */
373 struct candidate_list s_low_list[LOGFS_NO_AREAS];/* good candidates */
374 struct candidate_list s_ec_list; /* wear level candidates */
375 struct btree_head32 s_reserved_segments;/* sb, journal, bad, etc. */
376 /* inode.c fields */
377 u64 s_last_ino; /* highest ino used */
378 long s_inos_till_wrap;
379 u32 s_generation; /* i_generation for new files */
380 struct list_head s_freeing_list; /* inodes being freed */
381 /* journal.c fields */
382 struct mutex s_journal_mutex;
383 void *s_je; /* journal entry to compress */
384 void *s_compressed_je; /* block to write to journal */
385 u32 s_journal_seg[LOGFS_JOURNAL_SEGS]; /* journal segments */
386 u32 s_journal_ec[LOGFS_JOURNAL_SEGS]; /* journal erasecounts */
387 u64 s_last_version;
388 struct logfs_area *s_journal_area; /* open journal segment */
389 __be64 s_je_array[MAX_JOURNAL_ENTRIES];
390 int s_no_je;
391
392 int s_sum_index; /* for the 12 summaries */
393 struct shadow_tree s_shadow_tree;
394 int s_je_fill; /* index of current je */
395 /* readwrite.c fields */
396 struct mutex s_write_mutex;
397 int s_lock_count;
398 mempool_t *s_block_pool; /* struct logfs_block pool */
399 mempool_t *s_shadow_pool; /* struct logfs_shadow pool */
400 struct list_head s_writeback_list; /* writeback pages */
401 /*
402 * Space accounting:
403 * - s_used_bytes specifies space used to store valid data objects.
404 * - s_dirty_used_bytes is space used to store non-committed data
405 * objects. Those objects have already been written themselves,
406 * but they don't become valid until all indirect blocks up to the
407 * journal have been written as well.
408 * - s_dirty_free_bytes is space used to store the old copy of a
409 * replaced object, as long as the replacement is non-committed.
410 * In other words, it is the amount of space freed when all dirty
411 * blocks are written back.
412 * - s_free_bytes is the amount of free space available for any
413 * purpose.
414 * - s_root_reserve is the amount of free space available only to
415 * the root user. Non-privileged users can no longer write once
416 * this watermark has been reached.
417 * - s_speed_reserve is space which remains unused to speed up
418 * garbage collection performance.
419 * - s_dirty_pages is the space reserved for currently dirty pages.
420 * It is a pessimistic estimate, so some/most will get freed on
421 * page writeback.
422 *
423 * s_used_bytes + s_free_bytes + s_speed_reserve = total usable size
424 */
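	/*
	 * Worked example of the identity above: on a filesystem with 1000
	 * usable bytes, s_speed_reserve == 100 and s_used_bytes == 300
	 * leave s_free_bytes == 600, since 300 + 600 + 100 == 1000.
	 */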
425 u64 s_free_bytes;
426 u64 s_used_bytes;
427 u64 s_dirty_free_bytes;
428 u64 s_dirty_used_bytes;
429 u64 s_root_reserve;
430 u64 s_speed_reserve;
431 u64 s_dirty_pages;
432 /* Bad block handling:
433 * - s_bad_seg_reserve is a number of segments usually kept
434 * free. When encountering bad blocks, the affected segment's data
435 * is _temporarily_ moved to a reserved segment.
436 * - s_bad_segments is the number of known bad segments.
437 */
438 u32 s_bad_seg_reserve;
439 u32 s_bad_segments;
440};
441
442/**
443 * struct logfs_inode - in-memory inode
444 *
445 * @vfs_inode: struct inode
446 * @li_data: data pointers
447 * @li_used_bytes: number of used bytes
448 * @li_freeing_list: used to track inodes currently being freed
449 * @li_flags: inode flags
450 * @li_refcount: number of internal (GC-induced) references
451 */
452struct logfs_inode {
453 struct inode vfs_inode;
454 u64 li_data[LOGFS_EMBEDDED_FIELDS];
455 u64 li_used_bytes;
456 struct list_head li_freeing_list;
457 struct logfs_block *li_block;
458 u32 li_flags;
459 u8 li_height;
460 int li_refcount;
461};
462
463#define journal_for_each(__i) for (__i = 0; __i < LOGFS_JOURNAL_SEGS; __i++)
464#define for_each_area(__i) for (__i = 0; __i < LOGFS_NO_AREAS; __i++)
465#define for_each_area_down(__i) for (__i = LOGFS_NO_AREAS - 1; __i >= 0; __i--)
466
467/* compr.c */
468int logfs_compress(void *in, void *out, size_t inlen, size_t outlen);
469int logfs_uncompress(void *in, void *out, size_t inlen, size_t outlen);
470int __init logfs_compr_init(void);
471void logfs_compr_exit(void);
472
473/* dev_bdev.c */
474#ifdef CONFIG_BLOCK
475int logfs_get_sb_bdev(struct logfs_super *s,
476 struct file_system_type *type,
477 const char *devname);
478#else
479static inline int logfs_get_sb_bdev(struct logfs_super *s,
480 struct file_system_type *type,
481 const char *devname)
482{
483 return -ENODEV;
484}
485#endif
486
487/* dev_mtd.c */
488#if IS_ENABLED(CONFIG_MTD)
489int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr);
490#else
491static inline int logfs_get_sb_mtd(struct logfs_super *s, int mtdnr)
492{
493 return -ENODEV;
494}
495#endif
496
497/* dir.c */
498extern const struct inode_operations logfs_dir_iops;
499extern const struct file_operations logfs_dir_fops;
500int logfs_replay_journal(struct super_block *sb);
501
502/* file.c */
503extern const struct inode_operations logfs_reg_iops;
504extern const struct file_operations logfs_reg_fops;
505extern const struct address_space_operations logfs_reg_aops;
506int logfs_readpage(struct file *file, struct page *page);
507long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
508int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync);
509
510/* gc.c */
511u32 get_best_cand(struct super_block *sb, struct candidate_list *list, u32 *ec);
512void logfs_gc_pass(struct super_block *sb);
513int logfs_check_areas(struct super_block *sb);
514int logfs_init_gc(struct super_block *sb);
515void logfs_cleanup_gc(struct super_block *sb);
516
517/* inode.c */
518extern const struct super_operations logfs_super_operations;
519struct inode *logfs_iget(struct super_block *sb, ino_t ino);
520struct inode *logfs_safe_iget(struct super_block *sb, ino_t ino, int *cookie);
521void logfs_safe_iput(struct inode *inode, int cookie);
522struct inode *logfs_new_inode(struct inode *dir, umode_t mode);
523struct inode *logfs_new_meta_inode(struct super_block *sb, u64 ino);
524struct inode *logfs_read_meta_inode(struct super_block *sb, u64 ino);
525int logfs_init_inode_cache(void);
526void logfs_destroy_inode_cache(void);
527void logfs_set_blocks(struct inode *inode, u64 no);
528/* these logically belong into inode.c but actually reside in readwrite.c */
529int logfs_read_inode(struct inode *inode);
530int __logfs_write_inode(struct inode *inode, struct page *, long flags);
531void logfs_evict_inode(struct inode *inode);
532
533/* journal.c */
534void logfs_write_anchor(struct super_block *sb);
535int logfs_init_journal(struct super_block *sb);
536void logfs_cleanup_journal(struct super_block *sb);
537int write_alias_journal(struct super_block *sb, u64 ino, u64 bix,
538 level_t level, int child_no, __be64 val);
539void do_logfs_journal_wl_pass(struct super_block *sb);
540
541/* readwrite.c */
542pgoff_t logfs_pack_index(u64 bix, level_t level);
543void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level);
544int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
545 loff_t bix, long flags, struct shadow_tree *shadow_tree);
546int logfs_readpage_nolock(struct page *page);
547int logfs_write_buf(struct inode *inode, struct page *page, long flags);
548int logfs_delete(struct inode *inode, pgoff_t index,
549 struct shadow_tree *shadow_tree);
550int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
551 gc_level_t gc_level, long flags);
552int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix,
553 gc_level_t gc_level);
554int logfs_truncate(struct inode *inode, u64 size);
555u64 logfs_seek_hole(struct inode *inode, u64 bix);
556u64 logfs_seek_data(struct inode *inode, u64 bix);
557int logfs_open_segfile(struct super_block *sb);
558int logfs_init_rw(struct super_block *sb);
559void logfs_cleanup_rw(struct super_block *sb);
560void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta);
561void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta);
562void logfs_write_block(struct logfs_block *block, long flags);
563int logfs_write_obj_aliases_pagecache(struct super_block *sb);
564void logfs_get_segment_entry(struct super_block *sb, u32 segno,
565 struct logfs_segment_entry *se);
566void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment);
567void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec,
568 gc_level_t gc_level);
569void logfs_set_segment_reserved(struct super_block *sb, u32 segno);
570void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec);
571struct logfs_block *__alloc_block(struct super_block *sb,
572 u64 ino, u64 bix, level_t level);
573void __free_block(struct super_block *sb, struct logfs_block *block);
574void btree_write_block(struct logfs_block *block);
575void initialize_block_counters(struct page *page, struct logfs_block *block,
576 __be64 *array, int page_is_empty);
577int logfs_exist_block(struct inode *inode, u64 bix);
578int get_page_reserve(struct inode *inode, struct page *page);
579void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock);
580void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock);
581extern const struct logfs_block_ops indirect_block_ops;
582
583/* segment.c */
584int logfs_erase_segment(struct super_block *sb, u32 ofs, int ensure_erase);
585int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf);
586int logfs_segment_read(struct inode *inode, struct page *page, u64 ofs, u64 bix,
587 level_t level);
588int logfs_segment_write(struct inode *inode, struct page *page,
589 struct logfs_shadow *shadow);
590int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow);
591int logfs_load_object_aliases(struct super_block *sb,
592 struct logfs_obj_alias *oa, int count);
593void move_page_to_btree(struct page *page);
594int logfs_init_mapping(struct super_block *sb);
595void logfs_sync_area(struct logfs_area *area);
596void logfs_sync_segments(struct super_block *sb);
597void freeseg(struct super_block *sb, u32 segno);
598void free_areas(struct super_block *sb);
599
600/* area handling */
601int logfs_init_areas(struct super_block *sb);
602void logfs_cleanup_areas(struct super_block *sb);
603int logfs_open_area(struct logfs_area *area, size_t bytes);
604int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
605 int use_filler);
606
607static inline int logfs_buf_write(struct logfs_area *area, u64 ofs,
608 void *buf, size_t len)
609{
610 return __logfs_buf_write(area, ofs, buf, len, 0);
611}
612
613static inline int logfs_buf_recover(struct logfs_area *area, u64 ofs,
614 void *buf, size_t len)
615{
616 return __logfs_buf_write(area, ofs, buf, len, 1);
617}
618
619/* super.c */
620struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index);
621void emergency_read_end(struct page *page);
622void logfs_crash_dump(struct super_block *sb);
623int logfs_statfs(struct dentry *dentry, struct kstatfs *stats);
624int logfs_check_ds(struct logfs_disk_super *ds);
625int logfs_write_sb(struct super_block *sb);
626
627static inline struct logfs_super *logfs_super(struct super_block *sb)
628{
629 return sb->s_fs_info;
630}
631
632static inline struct logfs_inode *logfs_inode(struct inode *inode)
633{
634 return container_of(inode, struct logfs_inode, vfs_inode);
635}
636
637static inline void logfs_set_ro(struct super_block *sb)
638{
639 logfs_super(sb)->s_flags |= LOGFS_SB_FLAG_RO;
640}
641
642#define LOGFS_BUG(sb) do { \
643 struct super_block *__sb = sb; \
644 logfs_crash_dump(__sb); \
645 logfs_super(__sb)->s_flags |= LOGFS_SB_FLAG_RO; \
646 BUG(); \
647} while (0)
648
649#define LOGFS_BUG_ON(condition, sb) \
650 do { if (unlikely(condition)) LOGFS_BUG((sb)); } while (0)
651
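/*
 * Callers pass skip == 4 so that the checksum covers everything after the
 * leading 32bit crc field itself (see the segment and journal headers).
 */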
652static inline __be32 logfs_crc32(void *data, size_t len, size_t skip)
653{
654 return cpu_to_be32(crc32(~0, data+skip, len-skip));
655}
656
657static inline u8 logfs_type(struct inode *inode)
658{
659 return (inode->i_mode >> 12) & 15;
660}
661
662static inline pgoff_t logfs_index(struct super_block *sb, u64 pos)
663{
664 return pos >> sb->s_blocksize_bits;
665}
666
667static inline u64 dev_ofs(struct super_block *sb, u32 segno, u32 ofs)
668{
669 return ((u64)segno << logfs_super(sb)->s_segshift) + ofs;
670}
671
672static inline u32 seg_no(struct super_block *sb, u64 ofs)
673{
674 return ofs >> logfs_super(sb)->s_segshift;
675}
676
677static inline u32 seg_ofs(struct super_block *sb, u64 ofs)
678{
679 return ofs & logfs_super(sb)->s_segmask;
680}
681
682static inline u64 seg_align(struct super_block *sb, u64 ofs)
683{
684 return ofs & ~logfs_super(sb)->s_segmask;
685}
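
/*
 * Worked example for the helpers above, assuming 128KiB segments
 * (s_segshift == 17): dev_ofs(sb, 3, 100) yields (3 << 17) + 100 ==
 * 0x60064, and seg_no() and seg_ofs() recover 3 and 100 from that value.
 */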
686
687static inline struct logfs_block *logfs_block(struct page *page)
688{
689 return (void *)page->private;
690}
691
692static inline level_t shrink_level(gc_level_t __level)
693{
694 u8 level = (__force u8)__level;
695
696 if (level >= LOGFS_MAX_LEVELS)
697 level -= LOGFS_MAX_LEVELS;
698 return (__force level_t)level;
699}
700
701static inline gc_level_t expand_level(u64 ino, level_t __level)
702{
703 u8 level = (__force u8)__level;
704
705 if (ino == LOGFS_INO_MASTER) {
706 /* ifile has separate areas */
707 level += LOGFS_MAX_LEVELS;
708 }
709 return (__force gc_level_t)level;
710}
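
/*
 * Example for the two helpers above: expand_level(LOGFS_INO_MASTER,
 * LEVEL(0)) yields GC level 6, because the ifile has separate areas,
 * and shrink_level() maps that 6 back to level 0.
 */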
711
712static inline int logfs_block_shift(struct super_block *sb, level_t level)
713{
714 level = shrink_level((__force gc_level_t)level);
715 return (__force int)level * (sb->s_blocksize_bits - 3);
716}
717
718static inline u64 logfs_block_mask(struct super_block *sb, level_t level)
719{
720 return ~0ull << logfs_block_shift(sb, level);
721}
722
723static inline struct logfs_area *get_area(struct super_block *sb,
724 gc_level_t gc_level)
725{
726 return logfs_super(sb)->s_area[(__force u8)gc_level];
727}
728
729static inline void logfs_mempool_destroy(mempool_t *pool)
730{
731 if (pool)
732 mempool_destroy(pool);
733}
734
735#endif
diff --git a/fs/logfs/logfs_abi.h b/fs/logfs/logfs_abi.h
deleted file mode 100644
index ae960519c54a..000000000000
--- a/fs/logfs/logfs_abi.h
+++ /dev/null
@@ -1,629 +0,0 @@
1/*
2 * fs/logfs/logfs_abi.h
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Public header for logfs.
9 */
10#ifndef FS_LOGFS_LOGFS_ABI_H
11#define FS_LOGFS_LOGFS_ABI_H
12
13/* For out-of-kernel compiles */
14#ifndef BUILD_BUG_ON
15#define BUILD_BUG_ON(condition) /**/
16#endif
17
18#define SIZE_CHECK(type, size) \
19static inline void check_##type(void) \
20{ \
21 BUILD_BUG_ON(sizeof(struct type) != (size)); \
22}
23
24/*
25 * Throughout the logfs code, we're constantly dealing with blocks at
26 * various positions or offsets. To remove confusion, we strictly
27 * distinguish between a "position" - the logical position within a
28 * file and an "offset" - the physical location within the device.
29 *
30 * Any usage of the term offset for a logical location or position for
31 * a physical one is a bug and should get fixed.
32 */
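
/*
 * Example: with 4KiB blocks, byte 8192 of a file sits at position 8192
 * (block index 2) no matter where on the device the object holding it
 * currently resides; that device location is its offset.
 */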
33
34/*
35 * Block are allocated in one of several segments depending on their
36 * level. The following levels are used:
37 * 0 - regular data block
38 * 1 - i1 indirect blocks
39 * 2 - i2 indirect blocks
40 * 3 - i3 indirect blocks
41 * 4 - i4 indirect blocks
42 * 5 - i5 indirect blocks
43 * 6 - ifile data blocks
44 * 7 - ifile i1 indirect blocks
45 * 8 - ifile i2 indirect blocks
46 * 9 - ifile i3 indirect blocks
47 * 10 - ifile i4 indirect blocks
48 * 11 - ifile i5 indirect blocks
49 * Potential levels to be used in the future:
50 * 12 - gc recycled blocks, long-lived data
51 * 13 - replacement blocks, short-lived data
52 *
53 * Levels 1-11 are necessary for robust gc operations and help separate
54 * short-lived metadata from longer-lived file data. In the future,
55 * file data should get separated into several segments based on simple
56 * heuristics. Old data recycled during gc operation is expected to be
57 * long-lived. New data is of uncertain life expectancy. New data
58 * used to replace older blocks in existing files is expected to be
59 * short-lived.
60 */
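
/*
 * Example: a data block of a regular file is written to a level 0
 * segment and the i1 indirect block pointing to it to a level 1
 * segment, while the inode itself belongs to the ifile and therefore
 * ends up in a level 6 segment.
 */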
61
62
63/* Magic numbers. 64bit for superblock, 32bit for statfs f_type */
64#define LOGFS_MAGIC 0x7a3a8e5cb9d5bf67ull
65#define LOGFS_MAGIC_U32 0xc97e8168u
66
67/*
68 * Various blocksize related macros. Blocksize is currently fixed at 4KiB.
69 * Sooner or later that should become configurable and the macros replaced
70 * by something superblock-dependent. Pointers in indirect blocks are and
71 * will remain 64bit.
72 *
73 * LOGFS_BLOCKSIZE - self-explanatory
74 * LOGFS_BLOCK_FACTOR - number of pointers per indirect block
75 * LOGFS_BLOCK_BITS - log2 of LOGFS_BLOCK_FACTOR, used for shifts
76 */
77#define LOGFS_BLOCKSIZE (4096ull)
78#define LOGFS_BLOCK_FACTOR (LOGFS_BLOCKSIZE / sizeof(u64))
79#define LOGFS_BLOCK_BITS (9)
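
/*
 * With 4KiB blocks and 64bit pointers this works out to
 * LOGFS_BLOCK_FACTOR == 4096 / 8 == 512 pointers per indirect block and
 * LOGFS_BLOCK_BITS == 9, since 1 << 9 == 512.
 */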
80
81/*
82 * Number of blocks at various levels of indirection. There are 16 direct
83 * block pointers plus a single indirect pointer.
84 */
85#define I0_BLOCKS (16)
86#define I1_BLOCKS LOGFS_BLOCK_FACTOR
87#define I2_BLOCKS (LOGFS_BLOCK_FACTOR * I1_BLOCKS)
88#define I3_BLOCKS (LOGFS_BLOCK_FACTOR * I2_BLOCKS)
89#define I4_BLOCKS (LOGFS_BLOCK_FACTOR * I3_BLOCKS)
90#define I5_BLOCKS (LOGFS_BLOCK_FACTOR * I4_BLOCKS)
91
92#define INDIRECT_INDEX I0_BLOCKS
93#define LOGFS_EMBEDDED_FIELDS (I0_BLOCKS + 1)
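
/*
 * Resulting counts: I1_BLOCKS == 512, I2_BLOCKS == 512 * 512 == 262144,
 * and so on up to I5_BLOCKS == 512^5. LOGFS_EMBEDDED_FIELDS == 17
 * covers the 16 direct pointers plus the single indirect pointer.
 */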
94
95/*
96 * Sizes at which files require another level of indirection. Files smaller
97 * than LOGFS_EMBEDDED_SIZE can be completely stored in the inode itself,
98 * similar to ext2 fast symlinks.
99 *
100 * Data at a position smaller than LOGFS_I0_SIZE is accessed through the
101 * direct pointers, else through the 1x indirect pointer and so forth.
102 */
103#define LOGFS_EMBEDDED_SIZE (LOGFS_EMBEDDED_FIELDS * sizeof(u64))
104#define LOGFS_I0_SIZE (I0_BLOCKS * LOGFS_BLOCKSIZE)
105#define LOGFS_I1_SIZE (I1_BLOCKS * LOGFS_BLOCKSIZE)
106#define LOGFS_I2_SIZE (I2_BLOCKS * LOGFS_BLOCKSIZE)
107#define LOGFS_I3_SIZE (I3_BLOCKS * LOGFS_BLOCKSIZE)
108#define LOGFS_I4_SIZE (I4_BLOCKS * LOGFS_BLOCKSIZE)
109#define LOGFS_I5_SIZE (I5_BLOCKS * LOGFS_BLOCKSIZE)
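
/*
 * Concretely, LOGFS_EMBEDDED_SIZE == 17 * 8 == 136 bytes and
 * LOGFS_I0_SIZE == 16 * 4KiB == 64KiB. A 100KiB file therefore needs
 * the 1x indirect pointer: it exceeds LOGFS_I0_SIZE but stays well
 * below LOGFS_I1_SIZE (2MiB).
 */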
110
111/*
112 * Each indirect block pointer must have this flag set, if all block pointers
113 * behind it are set, i.e. there is no hole hidden in the shadow of this
114 * indirect block pointer.
115 */
116#define LOGFS_FULLY_POPULATED (1ULL << 63)
117#define pure_ofs(ofs) (ofs & ~LOGFS_FULLY_POPULATED)
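
/*
 * Usage sketch: given a raw pointer value read from an indirect block,
 * "ofs & LOGFS_FULLY_POPULATED" tests whether the subtree behind it is
 * fully populated, and pure_ofs(ofs) yields the plain device offset.
 */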
118
119/*
120 * LogFS needs to separate data into levels. Each level is defined as the
121 * maximal possible distance from the master inode (inode of the inode file).
122 * Data blocks reside on level 0, 1x indirect block on level 1, etc.
123 * Inodes reside on level 6, indirect blocks for the inode file on levels 7-11.
124 * This effort is necessary to guarantee garbage collection to always make
125 * progress.
126 *
127 * LOGFS_MAX_INDIRECT is the maximal indirection through indirect blocks,
128 * LOGFS_MAX_LEVELS is one more for the actual data level of a file. It is
129 * the maximal number of levels for one file.
130 * LOGFS_NO_AREAS is twice that, as the inode file and regular files are
131 * effectively stacked on top of each other.
132 */
133#define LOGFS_MAX_INDIRECT (5)
134#define LOGFS_MAX_LEVELS (LOGFS_MAX_INDIRECT + 1)
135#define LOGFS_NO_AREAS (2 * LOGFS_MAX_LEVELS)
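
/*
 * With LOGFS_MAX_INDIRECT == 5 this gives LOGFS_MAX_LEVELS == 6 and
 * LOGFS_NO_AREAS == 12, matching the level list above: levels 0-5 for
 * regular file data, 6-11 for the ifile.
 */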
136
137/* Maximum size of filenames */
138#define LOGFS_MAX_NAMELEN (255)
139
140/* Number of segments in the primary journal. */
141#define LOGFS_JOURNAL_SEGS (16)
142
143/* Maximum number of free/erased/etc. segments in journal entries */
144#define MAX_CACHED_SEGS (64)
145
146
147/*
148 * LOGFS_OBJECT_HEADERSIZE is the size of a single header in the object store,
149 * LOGFS_MAX_OBJECTSIZE the size of the largest possible object, including
150 * its header,
151 * LOGFS_SEGMENT_RESERVE is the amount of space reserved for each segment for
152 * its segment header and the padded space at the end when no further objects
153 * fit.
154 */
155#define LOGFS_OBJECT_HEADERSIZE (0x1c)
156#define LOGFS_SEGMENT_HEADERSIZE (0x18)
157#define LOGFS_MAX_OBJECTSIZE (LOGFS_OBJECT_HEADERSIZE + LOGFS_BLOCKSIZE)
158#define LOGFS_SEGMENT_RESERVE \
159 (LOGFS_SEGMENT_HEADERSIZE + LOGFS_MAX_OBJECTSIZE - 1)
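
/*
 * Numerically: LOGFS_MAX_OBJECTSIZE == 0x1c + 4096 == 4124 bytes and
 * LOGFS_SEGMENT_RESERVE == 0x18 + 4124 - 1 == 4147 bytes per segment.
 */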
160
161/*
162 * Segment types:
163 * SEG_SUPER - superblock segment
164 * SEG_JOURNAL - journal segment
165 * SEG_OSTORE - object store segment
166 */
167enum {
168 SEG_SUPER = 0x01,
169 SEG_JOURNAL = 0x02,
170 SEG_OSTORE = 0x03,
171};
172
173/**
174 * struct logfs_segment_header - per-segment header in the ostore
175 *
176 * @crc: crc32 of header (there is no data)
177 * @pad: unused, must be 0
178 * @type: segment type, see above
179 * @level: GC level for all objects in this segment
180 * @segno: segment number
181 * @ec: erase count for this segment
182 * @gec: global erase count at time of writing
183 */
184struct logfs_segment_header {
185 __be32 crc;
186 __be16 pad;
187 __u8 type;
188 __u8 level;
189 __be32 segno;
190 __be32 ec;
191 __be64 gec;
192};
193
194SIZE_CHECK(logfs_segment_header, LOGFS_SEGMENT_HEADERSIZE);
195
196#define LOGFS_FEATURES_INCOMPAT (0ull)
197#define LOGFS_FEATURES_RO_COMPAT (0ull)
198#define LOGFS_FEATURES_COMPAT (0ull)
199
200/**
201 * struct logfs_disk_super - on-medium superblock
202 *
203 * @ds_magic: magic number, must equal LOGFS_MAGIC
204 * @ds_crc: crc32 of structure starting with the next field
205 * @ds_ifile_levels: maximum number of levels for ifile
206 * @ds_iblock_levels: maximum number of levels for regular files
207 * @ds_data_levels: number of separate levels for data
208 * @pad0: reserved, must be 0
209 * @ds_feature_incompat: incompatible filesystem features
210 * @ds_feature_ro_compat: read-only compatible filesystem features
211 * @ds_feature_compat: compatible filesystem features
212 * @ds_flags: flags
213 * @ds_segment_shift: log2 of segment size
214 * @ds_block_shift: log2 of block size
215 * @ds_write_shift: log2 of write size
216 * @pad1: reserved, must be 0
217 * @ds_journal_seg: segments used by primary journal
218 * @ds_root_reserve: bytes reserved for the superuser
219 * @ds_speed_reserve: bytes reserved to speed up GC
220 * @ds_bad_seg_reserve: number of segments reserved to handle bad blocks
221 * @pad2: reserved, must be 0
222 * @pad3: reserved, must be 0
223 *
224 * Contains only read-only fields. Read-write fields like the amount of used
225 * space is tracked in the dynamic superblock, which is stored in the journal.
226 */
227struct logfs_disk_super {
228 struct logfs_segment_header ds_sh;
229 __be64 ds_magic;
230
231 __be32 ds_crc;
232 __u8 ds_ifile_levels;
233 __u8 ds_iblock_levels;
234 __u8 ds_data_levels;
235 __u8 ds_segment_shift;
236 __u8 ds_block_shift;
237 __u8 ds_write_shift;
238 __u8 pad0[6];
239
240 __be64 ds_filesystem_size;
241 __be32 ds_segment_size;
242 __be32 ds_bad_seg_reserve;
243
244 __be64 ds_feature_incompat;
245 __be64 ds_feature_ro_compat;
246
247 __be64 ds_feature_compat;
248 __be64 ds_feature_flags;
249
250 __be64 ds_root_reserve;
251 __be64 ds_speed_reserve;
252
253 __be32 ds_journal_seg[LOGFS_JOURNAL_SEGS];
254
255 __be64 ds_super_ofs[2];
256 __be64 pad3[8];
257};
258
259SIZE_CHECK(logfs_disk_super, 256);
260
261/*
262 * Object types:
263 * OBJ_BLOCK - Data or indirect block
264 * OBJ_INODE - Inode
265 * OBJ_DENTRY - Dentry
266 */
267enum {
268 OBJ_BLOCK = 0x04,
269 OBJ_INODE = 0x05,
270 OBJ_DENTRY = 0x06,
271};
272
273/**
274 * struct logfs_object_header - per-object header in the ostore
275 *
276 * @crc: crc32 of header, excluding data_crc
277 * @len: length of data
278 * @type: object type, see above
279 * @compr: compression type
280 * @ino: inode number
281 * @bix: block index
282 * @data_crc: crc32 of payload
283 */
284struct logfs_object_header {
285 __be32 crc;
286 __be16 len;
287 __u8 type;
288 __u8 compr;
289 __be64 ino;
290 __be64 bix;
291 __be32 data_crc;
292} __attribute__((packed));
293
294SIZE_CHECK(logfs_object_header, LOGFS_OBJECT_HEADERSIZE);
295
296/*
297 * Reserved inode numbers:
298 * LOGFS_INO_MASTER - master inode (for inode file)
299 * LOGFS_INO_ROOT - root directory
300 * LOGFS_INO_SEGFILE - per-segment used bytes and erase count
301 */
302enum {
303 LOGFS_INO_MAPPING = 0x00,
304 LOGFS_INO_MASTER = 0x01,
305 LOGFS_INO_ROOT = 0x02,
306 LOGFS_INO_SEGFILE = 0x03,
307 LOGFS_RESERVED_INOS = 0x10,
308};
309
310/*
311 * Inode flags. High bits should never be written to the medium. They are
312 * reserved for in-memory usage.
313 * Low bits should either remain in sync with the corresponding FS_*_FL or
314 * reuse slots that obviously don't make sense for logfs.
315 *
316 * LOGFS_IF_DIRTY Inode must be written back
317 * LOGFS_IF_ZOMBIE Inode has been deleted
318 * LOGFS_IF_STILLBORN -ENOSPC happened when creating inode
319 */
320#define LOGFS_IF_COMPRESSED 0x00000004 /* == FS_COMPR_FL */
321#define LOGFS_IF_DIRTY 0x20000000
322#define LOGFS_IF_ZOMBIE 0x40000000
323#define LOGFS_IF_STILLBORN 0x80000000
324
325/* Flags available to chattr */
326#define LOGFS_FL_USER_VISIBLE (LOGFS_IF_COMPRESSED)
327#define LOGFS_FL_USER_MODIFIABLE (LOGFS_IF_COMPRESSED)
328/* Flags inherited from parent directory on file/directory creation */
329#define LOGFS_FL_INHERITED (LOGFS_IF_COMPRESSED)
330
331/**
332 * struct logfs_disk_inode - on-medium inode
333 *
334 * @di_mode: file mode
335 * @di_pad: reserved, must be 0
336 * @di_flags: inode flags, see above
337 * @di_uid: user id
338 * @di_gid: group id
339 * @di_ctime: change time
340 * @di_mtime: modify time
341 * @di_refcount: reference count (aka nlink or link count)
342 * @di_generation: inode generation, for nfs
343 * @di_used_bytes: number of bytes used
344 * @di_size: file size
345 * @di_data: data pointers
346 */
347struct logfs_disk_inode {
348 __be16 di_mode;
349 __u8 di_height;
350 __u8 di_pad;
351 __be32 di_flags;
352 __be32 di_uid;
353 __be32 di_gid;
354
355 __be64 di_ctime;
356 __be64 di_mtime;
357
358 __be64 di_atime;
359 __be32 di_refcount;
360 __be32 di_generation;
361
362 __be64 di_used_bytes;
363 __be64 di_size;
364
365 __be64 di_data[LOGFS_EMBEDDED_FIELDS];
366};
367
368SIZE_CHECK(logfs_disk_inode, 200);
369
370#define INODE_POINTER_OFS \
371 (offsetof(struct logfs_disk_inode, di_data) / sizeof(__be64))
372#define INODE_USED_OFS \
373 (offsetof(struct logfs_disk_inode, di_used_bytes) / sizeof(__be64))
374#define INODE_SIZE_OFS \
375 (offsetof(struct logfs_disk_inode, di_size) / sizeof(__be64))
376#define INODE_HEIGHT_OFS (0)
377
378/**
379 * struct logfs_disk_dentry - on-medium dentry structure
380 *
381 * @ino: inode number
382 * @namelen: length of file name
383 * @type: file type, identical to bits 12..15 of mode
384 * @name: file name
385 */
386/* FIXME: add 6 bytes of padding to remove the __packed */
387struct logfs_disk_dentry {
388 __be64 ino;
389 __be16 namelen;
390 __u8 type;
391 __u8 name[LOGFS_MAX_NAMELEN];
392} __attribute__((packed));
393
394SIZE_CHECK(logfs_disk_dentry, 266);
395
396#define RESERVED 0xffffffff
397#define BADSEG 0xffffffff
398/**
399 * struct logfs_segment_entry - segment file entry
400 *
401 * @ec_level: erase count and level
402 * @valid: number of valid bytes
403 *
404 * Segment file contains one entry for every segment. ec_level contains the
405 * erasecount in the upper 28 bits and the level in the lower 4 bits. An
406 * ec_level of BADSEG (-1) identifies bad segments. valid contains the number
407 * of valid bytes or RESERVED (-1 again) if the segment is used for either the
408 * superblock or the journal, or when the segment is bad.
409 */
410struct logfs_segment_entry {
411 __be32 ec_level;
412 __be32 valid;
413};
414
415SIZE_CHECK(logfs_segment_entry, 8);
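
/*
 * A minimal sketch of the ec_level encoding described above; these
 * helper names are illustrative and were not part of the original
 * interface. The packed value is stored on the medium as __be32.
 */
static inline u32 pack_ec_level(u32 ec, u8 level)
{
	return (ec << 4) | (level & 0xf);	/* 28bit ec, 4bit level */
}

static inline u32 unpack_ec(u32 ec_level)
{
	return ec_level >> 4;			/* drop the 4 level bits */
}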
416
417/**
418 * struct logfs_journal_header - header for journal entries (JEs)
419 *
420 * @h_crc: crc32 of journal entry
421 * @h_len: length of compressed journal entry,
422 * not including header
423 * @h_datalen: length of uncompressed data
424 * @h_type: JE type
425 * @h_compr: compression type
426 * @h_pad: reserved
427 */
428struct logfs_journal_header {
429 __be32 h_crc;
430 __be16 h_len;
431 __be16 h_datalen;
432 __be16 h_type;
433 __u8 h_compr;
434 __u8 h_pad[5];
435};
436
437SIZE_CHECK(logfs_journal_header, 16);
438
439/*
440 * Life expectancy of data.
441 * VIM_DEFAULT - default vim
442 * VIM_SEGFILE - for segment file only - very short-living
443 * VIM_GC - GC'd data - likely long-living
444 */
445enum logfs_vim {
446 VIM_DEFAULT = 0,
447 VIM_SEGFILE = 1,
448};
449
450/**
451 * struct logfs_je_area - wbuf header
452 *
453 * @segno: segment number of area
454 * @used_bytes: number of bytes already used
455 * @gc_level: GC level
456 * @vim: life expectancy of data
457 *
458 * "Areas" are segments currently being used for writing. There is at least
459 * one area per GC level. Several may be used to separate long-living from
460 * short-living data. If an area with unknown vim is encountered, it can
461 * simply be closed.
462 * The write buffer immediately follows this header.
463 */
464struct logfs_je_area {
465 __be32 segno;
466 __be32 used_bytes;
467 __u8 gc_level;
468 __u8 vim;
469} __attribute__((packed));
470
471SIZE_CHECK(logfs_je_area, 10);
472
473#define MAX_JOURNAL_HEADER \
474 (sizeof(struct logfs_journal_header) + sizeof(struct logfs_je_area))
475
476/**
477 * struct logfs_je_dynsb - dynamic superblock
478 *
479 * @ds_gec: global erase count
480 * @ds_sweeper: current position of GC "sweeper"
481 * @ds_rename_dir: source directory ino (see dir.c documentation)
482 * @ds_rename_pos: position of source dd (see dir.c documentation)
483 * @ds_victim_ino: victims of incomplete dir operation (see dir.c)
484 * @ds_victim_parent: parent inode of victim (see dir.c)
485 * @ds_used_bytes: number of used bytes
486 */
487struct logfs_je_dynsb {
488 __be64 ds_gec;
489 __be64 ds_sweeper;
490
491 __be64 ds_rename_dir;
492 __be64 ds_rename_pos;
493
494 __be64 ds_victim_ino;
495 __be64 ds_victim_parent; /* XXX */
496
497 __be64 ds_used_bytes;
498 __be32 ds_generation;
499 __be32 pad;
500};
501
502SIZE_CHECK(logfs_je_dynsb, 64);
503
504/**
505 * struct logfs_je_anchor - anchor of filesystem tree, aka master inode
506 *
507 * @da_size: size of inode file
508 * @da_last_ino: last created inode
509 * @da_used_bytes: number of bytes used
510 * @da_data: data pointers
511 */
512struct logfs_je_anchor {
513 __be64 da_size;
514 __be64 da_last_ino;
515
516 __be64 da_used_bytes;
517 u8 da_height;
518 u8 pad[7];
519
520 __be64 da_data[LOGFS_EMBEDDED_FIELDS];
521};
522
523SIZE_CHECK(logfs_je_anchor, 168);
524
525/**
526 * struct logfs_je_spillout - spillout entry (from 1st to 2nd journal)
527 *
528 * @so_segment: segments used for 2nd journal
529 *
530 * Length of the array is given by h_len field in the header.
531 */
532struct logfs_je_spillout {
533 __be64 so_segment[0];
534};
535
536SIZE_CHECK(logfs_je_spillout, 0);
537
538/**
539 * struct logfs_je_journal_ec - erase counts for all journal segments
540 *
541 * @ec: erase count
542 *
543 * Length of the array is given by the h_len field in the header.
544 */
545struct logfs_je_journal_ec {
546 __be32 ec[0];
547};
548
549SIZE_CHECK(logfs_je_journal_ec, 0);
550
551/**
552 * struct logfs_je_free_segments - list of free segments with erase counts
553 */
554struct logfs_je_free_segments {
555 __be32 segno;
556 __be32 ec;
557};
558
559SIZE_CHECK(logfs_je_free_segments, 8);
560
561/**
562 * struct logfs_seg_alias - list of segment aliases
563 */
564struct logfs_seg_alias {
565 __be32 old_segno;
566 __be32 new_segno;
567};
568
569SIZE_CHECK(logfs_seg_alias, 8);
570
571/**
572 * struct logfs_obj_alias - list of object aliases
573 */
574struct logfs_obj_alias {
575 __be64 ino;
576 __be64 bix;
577 __be64 val;
578 u8 level;
579 u8 pad[5];
580 __be16 child_no;
581};
582
583SIZE_CHECK(logfs_obj_alias, 32);
584
585/**
586 * Compression types.
587 *
588 * COMPR_NONE - uncompressed
589 * COMPR_ZLIB - compressed with zlib
590 */
591enum {
592 COMPR_NONE = 0,
593 COMPR_ZLIB = 1,
594};
595
596/*
597 * Journal entries come in groups of 16. The first group contains unique
598 * entries, the next groups contain one entry per level.
599 *
600 * JE_FIRST - smallest possible journal entry number
601 *
602 * JEG_BASE - base group, containing unique entries
603 * JE_COMMIT - commit entry, validates all previous entries
604 * JE_DYNSB - dynamic superblock, anything that ought to be in the
605 * superblock but cannot because it is read-write data
606 * JE_ANCHOR - anchor aka master inode aka inode file's inode
607 * JE_ERASECOUNT - erase counts for all journal segments
608 * JE_SPILLOUT - unused
609 * JE_OBJ_ALIAS - object aliases
610 * JE_AREA - area description
611 *
612 * JE_LAST - largest possible journal entry number
613 */
614enum {
615 JE_FIRST = 0x01,
616
617 JEG_BASE = 0x00,
618 JE_COMMIT = 0x02,
619 JE_DYNSB = 0x03,
620 JE_ANCHOR = 0x04,
621 JE_ERASECOUNT = 0x05,
622 JE_SPILLOUT = 0x06,
623 JE_OBJ_ALIAS = 0x0d,
624 JE_AREA = 0x0e,
625
626 JE_LAST = 0x0e,
627};
628
629#endif
diff --git a/fs/logfs/readwrite.c b/fs/logfs/readwrite.c
deleted file mode 100644
index bf19bf4a243f..000000000000
--- a/fs/logfs/readwrite.c
+++ /dev/null
@@ -1,2298 +0,0 @@
1/*
2 * fs/logfs/readwrite.c
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 *
9 * Actually contains eight sets of very similar functions:
10 * read read blocks from a file
11 * seek_hole find next hole
12 * seek_data find next data block
13 * valid check whether a block still belongs to a file
14 * write write blocks to a file
15 * delete delete a block (for directories and ifile)
16 * rewrite move existing blocks of a file to a new location (gc helper)
17 * truncate truncate a file
18 */
19#include "logfs.h"
20#include <linux/sched.h>
21#include <linux/slab.h>
22
23static u64 adjust_bix(u64 bix, level_t level)
24{
25 switch (level) {
26 case 0:
27 return bix;
28 case LEVEL(1):
29 return max_t(u64, bix, I0_BLOCKS);
30 case LEVEL(2):
31 return max_t(u64, bix, I1_BLOCKS);
32 case LEVEL(3):
33 return max_t(u64, bix, I2_BLOCKS);
34 case LEVEL(4):
35 return max_t(u64, bix, I3_BLOCKS);
36 case LEVEL(5):
37 return max_t(u64, bix, I4_BLOCKS);
38 default:
39 WARN_ON(1);
40 return bix;
41 }
42}
43
44static inline u64 maxbix(u8 height)
45{
46 return 1ULL << (LOGFS_BLOCK_BITS * height);
47}
48
49/**
50 * The inode address space is cut in two halves. Lower half belongs to data
51 * pages, upper half to indirect blocks. If the high bit (INDIRECT_BIT) is
52 * set, the actual block index (bix) and level can be derived from the page
53 * index.
54 *
55 * The lowest three bits of the block index are set to 0 after packing and
56 * unpacking. Since the lowest n bits (9 for 4KiB blocksize) are ignored
57 * anyway, this is harmless.
58 */
59#define ARCH_SHIFT (BITS_PER_LONG - 32)
60#define INDIRECT_BIT (0x80000000UL << ARCH_SHIFT)
61#define LEVEL_SHIFT (28 + ARCH_SHIFT)
62static inline pgoff_t first_indirect_block(void)
63{
64 return INDIRECT_BIT | (1ULL << LEVEL_SHIFT);
65}
66
67pgoff_t logfs_pack_index(u64 bix, level_t level)
68{
69 pgoff_t index;
70
71 BUG_ON(bix >= INDIRECT_BIT);
72 if (level == 0)
73 return bix;
74
75 index = INDIRECT_BIT;
76 index |= (__force long)level << LEVEL_SHIFT;
77 index |= bix >> ((__force u8)level * LOGFS_BLOCK_BITS);
78 return index;
79}
80
81void logfs_unpack_index(pgoff_t index, u64 *bix, level_t *level)
82{
83 u8 __level;
84
85 if (!(index & INDIRECT_BIT)) {
86 *bix = index;
87 *level = 0;
88 return;
89 }
90
91 __level = (index & ~INDIRECT_BIT) >> LEVEL_SHIFT;
92 *level = LEVEL(__level);
93 *bix = (index << (__level * LOGFS_BLOCK_BITS)) & ~INDIRECT_BIT;
94 *bix = adjust_bix(*bix, *level);
95 return;
96}
97#undef ARCH_SHIFT
98#undef INDIRECT_BIT
99#undef LEVEL_SHIFT
100
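/*
 * Illustrative sketch, not part of the original file: a pack/unpack round
 * trip for a level-1 index. Packing shifts the lowest LOGFS_BLOCK_BITS (9)
 * bits of bix out of the index, so a 9-bit-aligned bix such as 0xa00
 * survives the round trip unchanged; adjust_bix() only clamps values
 * below I0_BLOCKS.
 */
static inline void example_index_roundtrip(void)
{
	u64 bix = 0xa00, bix2;
	level_t level;
	pgoff_t index;

	index = logfs_pack_index(bix, LEVEL(1));
	logfs_unpack_index(index, &bix2, &level);
	/* bix2 == 0xa00 and level == LEVEL(1) at this point */
}
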
101/*
102 * Time is stored as nanoseconds since the epoch.
103 */
104static struct timespec be64_to_timespec(__be64 betime)
105{
106 return ns_to_timespec(be64_to_cpu(betime));
107}
108
109static __be64 timespec_to_be64(struct timespec tsp)
110{
111 return cpu_to_be64((u64)tsp.tv_sec * NSEC_PER_SEC + tsp.tv_nsec);
112}
113
114static void logfs_disk_to_inode(struct logfs_disk_inode *di, struct inode *inode)
115{
116 struct logfs_inode *li = logfs_inode(inode);
117 int i;
118
119 inode->i_mode = be16_to_cpu(di->di_mode);
120 li->li_height = di->di_height;
121 li->li_flags = be32_to_cpu(di->di_flags);
122 i_uid_write(inode, be32_to_cpu(di->di_uid));
123 i_gid_write(inode, be32_to_cpu(di->di_gid));
124 inode->i_size = be64_to_cpu(di->di_size);
125 logfs_set_blocks(inode, be64_to_cpu(di->di_used_bytes));
126 inode->i_atime = be64_to_timespec(di->di_atime);
127 inode->i_ctime = be64_to_timespec(di->di_ctime);
128 inode->i_mtime = be64_to_timespec(di->di_mtime);
129 set_nlink(inode, be32_to_cpu(di->di_refcount));
130 inode->i_generation = be32_to_cpu(di->di_generation);
131
132 switch (inode->i_mode & S_IFMT) {
133 case S_IFSOCK: /* fall through */
134 case S_IFBLK: /* fall through */
135 case S_IFCHR: /* fall through */
136 case S_IFIFO:
137 inode->i_rdev = be64_to_cpu(di->di_data[0]);
138 break;
139 case S_IFDIR: /* fall through */
140 case S_IFREG: /* fall through */
141 case S_IFLNK:
142 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
143 li->li_data[i] = be64_to_cpu(di->di_data[i]);
144 break;
145 default:
146 BUG();
147 }
148}
149
150static void logfs_inode_to_disk(struct inode *inode, struct logfs_disk_inode *di)
151{
152 struct logfs_inode *li = logfs_inode(inode);
153 int i;
154
155 di->di_mode = cpu_to_be16(inode->i_mode);
156 di->di_height = li->li_height;
157 di->di_pad = 0;
158 di->di_flags = cpu_to_be32(li->li_flags);
159 di->di_uid = cpu_to_be32(i_uid_read(inode));
160 di->di_gid = cpu_to_be32(i_gid_read(inode));
161 di->di_size = cpu_to_be64(i_size_read(inode));
162 di->di_used_bytes = cpu_to_be64(li->li_used_bytes);
163 di->di_atime = timespec_to_be64(inode->i_atime);
164 di->di_ctime = timespec_to_be64(inode->i_ctime);
165 di->di_mtime = timespec_to_be64(inode->i_mtime);
166 di->di_refcount = cpu_to_be32(inode->i_nlink);
167 di->di_generation = cpu_to_be32(inode->i_generation);
168
169 switch (inode->i_mode & S_IFMT) {
170 case S_IFSOCK: /* fall through */
171 case S_IFBLK: /* fall through */
172 case S_IFCHR: /* fall through */
173 case S_IFIFO:
174 di->di_data[0] = cpu_to_be64(inode->i_rdev);
175 break;
176 case S_IFDIR: /* fall through */
177 case S_IFREG: /* fall through */
178 case S_IFLNK:
179 for (i = 0; i < LOGFS_EMBEDDED_FIELDS; i++)
180 di->di_data[i] = cpu_to_be64(li->li_data[i]);
181 break;
182 default:
183 BUG();
184 }
185}
186
187static void __logfs_set_blocks(struct inode *inode)
188{
189 struct super_block *sb = inode->i_sb;
190 struct logfs_inode *li = logfs_inode(inode);
191
192 inode->i_blocks = ULONG_MAX;
193 if (li->li_used_bytes >> sb->s_blocksize_bits < ULONG_MAX)
194 inode->i_blocks = ALIGN(li->li_used_bytes, 512) >> 9;
195}
196
197void logfs_set_blocks(struct inode *inode, u64 bytes)
198{
199 struct logfs_inode *li = logfs_inode(inode);
200
201 li->li_used_bytes = bytes;
202 __logfs_set_blocks(inode);
203}
204
205static void prelock_page(struct super_block *sb, struct page *page, int lock)
206{
207 struct logfs_super *super = logfs_super(sb);
208
209 BUG_ON(!PageLocked(page));
210 if (lock) {
211 BUG_ON(PagePreLocked(page));
212 SetPagePreLocked(page);
213 } else {
214 /* We are in GC path. */
215 if (PagePreLocked(page))
216 super->s_lock_count++;
217 else
218 SetPagePreLocked(page);
219 }
220}
221
222static void preunlock_page(struct super_block *sb, struct page *page, int lock)
223{
224 struct logfs_super *super = logfs_super(sb);
225
226 BUG_ON(!PageLocked(page));
227 if (lock)
228 ClearPagePreLocked(page);
229 else {
230 /* We are in GC path. */
231 BUG_ON(!PagePreLocked(page));
232 if (super->s_lock_count)
233 super->s_lock_count--;
234 else
235 ClearPagePreLocked(page);
236 }
237}
238
239/*
240 * Logfs is prone to an AB-BA deadlock where one task tries to acquire
241 * s_write_mutex with a locked page and GC tries to get that page while holding
242 * s_write_mutex.
243 * To solve this issue logfs will ignore the page lock iff the page in question
244 * is waiting for s_write_mutex. We annotate this fact by setting PG_pre_locked
245 * in addition to PG_locked.
246 */
247void logfs_get_wblocks(struct super_block *sb, struct page *page, int lock)
248{
249 struct logfs_super *super = logfs_super(sb);
250
251 if (page)
252 prelock_page(sb, page, lock);
253
254 if (lock) {
255 mutex_lock(&super->s_write_mutex);
256 logfs_gc_pass(sb);
257 /* FIXME: We also have to check for shadowed space
258 * and mempool fill grade */
259 }
260}
261
262void logfs_put_wblocks(struct super_block *sb, struct page *page, int lock)
263{
264 struct logfs_super *super = logfs_super(sb);
265
266 if (page)
267 preunlock_page(sb, page, lock);
268 /* Order matters - we must clear PG_pre_locked before releasing
269 * s_write_mutex or we could race against another task. */
270 if (lock)
271 mutex_unlock(&super->s_write_mutex);
272}
273
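/*
 * Illustrative sketch, not part of the original file: the GC path passes
 * lock=0, which only maintains the PG_pre_locked annotation (and nesting
 * count) without touching s_write_mutex. The page must already be locked,
 * typically by the writer stalled on s_write_mutex.
 */
static inline void example_gc_annotation(struct super_block *sb,
		struct page *locked_page)
{
	logfs_get_wblocks(sb, locked_page, 0);
	/* ... GC moves blocks while the annotation is held ... */
	logfs_put_wblocks(sb, locked_page, 0);
}
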
274static struct page *logfs_get_read_page(struct inode *inode, u64 bix,
275 level_t level)
276{
277 return find_or_create_page(inode->i_mapping,
278 logfs_pack_index(bix, level), GFP_NOFS);
279}
280
281static void logfs_put_read_page(struct page *page)
282{
283 unlock_page(page);
284 put_page(page);
285}
286
287static void logfs_lock_write_page(struct page *page)
288{
289 int loop = 0;
290
291 while (unlikely(!trylock_page(page))) {
292 if (loop++ > 0x1000) {
293 /* Has been observed once so far... */
294 printk(KERN_ERR "stack at %p\n", &loop);
295 BUG();
296 }
297 if (PagePreLocked(page)) {
298 /* Holder of page lock is waiting for us, it
299 * is safe to use this page. */
300 break;
301 }
302 /* Some other process has this page locked and has
303 * nothing to do with us. Wait for it to finish.
304 */
305 schedule();
306 }
307 BUG_ON(!PageLocked(page));
308}
309
310static struct page *logfs_get_write_page(struct inode *inode, u64 bix,
311 level_t level)
312{
313 struct address_space *mapping = inode->i_mapping;
314 pgoff_t index = logfs_pack_index(bix, level);
315 struct page *page;
316 int err;
317
318repeat:
319 page = find_get_page(mapping, index);
320 if (!page) {
321 page = __page_cache_alloc(GFP_NOFS);
322 if (!page)
323 return NULL;
324 err = add_to_page_cache_lru(page, mapping, index, GFP_NOFS);
325 if (unlikely(err)) {
326 put_page(page);
327 if (err == -EEXIST)
328 goto repeat;
329 return NULL;
330 }
331 } else logfs_lock_write_page(page);
332 BUG_ON(!PageLocked(page));
333 return page;
334}
335
336static void logfs_unlock_write_page(struct page *page)
337{
338 if (!PagePreLocked(page))
339 unlock_page(page);
340}
341
342static void logfs_put_write_page(struct page *page)
343{
344 logfs_unlock_write_page(page);
345 put_page(page);
346}
347
348static struct page *logfs_get_page(struct inode *inode, u64 bix, level_t level,
349 int rw)
350{
351 if (rw == READ)
352 return logfs_get_read_page(inode, bix, level);
353 else
354 return logfs_get_write_page(inode, bix, level);
355}
356
357static void logfs_put_page(struct page *page, int rw)
358{
359 if (rw == READ)
360 logfs_put_read_page(page);
361 else
362 logfs_put_write_page(page);
363}
364
365static unsigned long __get_bits(u64 val, int skip, int no)
366{
367 u64 ret = val;
368
369 ret >>= skip * no;
370 ret <<= 64 - no;
371 ret >>= 64 - no;
372 return ret;
373}
374
375static unsigned long get_bits(u64 val, level_t skip)
376{
377 return __get_bits(val, (__force int)skip, LOGFS_BLOCK_BITS);
378}
379
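/*
 * Illustrative example, not part of the original file: with
 * LOGFS_BLOCK_BITS == 9, get_bits(bix, LEVEL(1)) extracts bits 9..17 of
 * bix - the slot inside a level-1 indirect block on the path to data
 * block bix. E.g. get_bits(0x12345, LEVEL(1)) == 0x91.
 */
static inline unsigned long example_slot_of(u64 bix)
{
	return get_bits(bix, LEVEL(1));
}
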
380static inline void init_shadow_tree(struct super_block *sb,
381 struct shadow_tree *tree)
382{
383 struct logfs_super *super = logfs_super(sb);
384
385 btree_init_mempool64(&tree->new, super->s_btree_pool);
386 btree_init_mempool64(&tree->old, super->s_btree_pool);
387}
388
389static void indirect_write_block(struct logfs_block *block)
390{
391 struct page *page;
392 struct inode *inode;
393 int ret;
394
395 page = block->page;
396 inode = page->mapping->host;
397 logfs_lock_write_page(page);
398 ret = logfs_write_buf(inode, page, 0);
399 logfs_unlock_write_page(page);
400 /*
401 * This needs some rework. Unless you want your filesystem to run
402 * completely synchronously (you don't), the filesystem will always
403 * report writes as 'successful' before the actual work has been
404 * done. The actual work gets done here and this is where any errors
405 * will show up. And there isn't much we can do about it, really.
406 *
407 * Some attempts to fix the errors (move from bad blocks, retry I/O, ...)
408 * have already been done, so anything left should be either a broken
409 * device or a bug somewhere in logfs itself. As logfs is relatively new,
410 * the odds currently favor a bug, so for now the line below isn't
411 * entirely tasteless.
412 */
413 BUG_ON(ret);
414}
415
416static void inode_write_block(struct logfs_block *block)
417{
418 struct inode *inode;
419 int ret;
420
421 inode = block->inode;
422 if (inode->i_ino == LOGFS_INO_MASTER)
423 logfs_write_anchor(inode->i_sb);
424 else {
425 ret = __logfs_write_inode(inode, NULL, 0);
426 /* see indirect_write_block comment */
427 BUG_ON(ret);
428 }
429}
430
431/*
432 * This silences a false, yet annoying gcc warning. I hate it when my editor
433 * jumps into bitops.h each time I recompile this file.
434 * TODO: Complain to gcc folks about this and upgrade compiler.
435 */
436static unsigned long fnb(const unsigned long *addr,
437 unsigned long size, unsigned long offset)
438{
439 return find_next_bit(addr, size, offset);
440}
441
442static __be64 inode_val0(struct inode *inode)
443{
444 struct logfs_inode *li = logfs_inode(inode);
445 u64 val;
446
447 /*
448 * Explicit shifting generates good code, but must match the format
449 * of the structure. Add some paranoia just in case.
450 */
451 BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_mode) != 0);
452 BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_height) != 2);
453 BUILD_BUG_ON(offsetof(struct logfs_disk_inode, di_flags) != 4);
454
455 val = (u64)inode->i_mode << 48 |
456 (u64)li->li_height << 40 |
457 (u64)li->li_flags;
458 return cpu_to_be64(val);
459}
460
461static int inode_write_alias(struct super_block *sb,
462 struct logfs_block *block, write_alias_t *write_one_alias)
463{
464 struct inode *inode = block->inode;
465 struct logfs_inode *li = logfs_inode(inode);
466 unsigned long pos;
467	u64 ino, bix;
468 __be64 val;
469 level_t level;
470 int err;
471
472 for (pos = 0; ; pos++) {
473 pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
474 if (pos >= LOGFS_EMBEDDED_FIELDS + INODE_POINTER_OFS)
475 return 0;
476
477 switch (pos) {
478 case INODE_HEIGHT_OFS:
479 val = inode_val0(inode);
480 break;
481 case INODE_USED_OFS:
482 val = cpu_to_be64(li->li_used_bytes);
483 break;
484 case INODE_SIZE_OFS:
485 val = cpu_to_be64(i_size_read(inode));
486 break;
487 case INODE_POINTER_OFS ... INODE_POINTER_OFS + LOGFS_EMBEDDED_FIELDS - 1:
488 val = cpu_to_be64(li->li_data[pos - INODE_POINTER_OFS]);
489 break;
490 default:
491 BUG();
492 }
493
494 ino = LOGFS_INO_MASTER;
495 bix = inode->i_ino;
496 level = LEVEL(0);
497 err = write_one_alias(sb, ino, bix, level, pos, val);
498 if (err)
499 return err;
500 }
501}
502
503static int indirect_write_alias(struct super_block *sb,
504 struct logfs_block *block, write_alias_t *write_one_alias)
505{
506 unsigned long pos;
507 struct page *page = block->page;
508	u64 ino, bix;
509 __be64 *child, val;
510 level_t level;
511 int err;
512
513 for (pos = 0; ; pos++) {
514 pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
515 if (pos >= LOGFS_BLOCK_FACTOR)
516 return 0;
517
518 ino = page->mapping->host->i_ino;
519 logfs_unpack_index(page->index, &bix, &level);
520 child = kmap_atomic(page);
521 val = child[pos];
522 kunmap_atomic(child);
523 err = write_one_alias(sb, ino, bix, level, pos, val);
524 if (err)
525 return err;
526 }
527}
528
529int logfs_write_obj_aliases_pagecache(struct super_block *sb)
530{
531 struct logfs_super *super = logfs_super(sb);
532 struct logfs_block *block;
533 int err;
534
535 list_for_each_entry(block, &super->s_object_alias, alias_list) {
536 err = block->ops->write_alias(sb, block, write_alias_journal);
537 if (err)
538 return err;
539 }
540 return 0;
541}
542
543void __free_block(struct super_block *sb, struct logfs_block *block)
544{
545 BUG_ON(!list_empty(&block->item_list));
546 list_del(&block->alias_list);
547 mempool_free(block, logfs_super(sb)->s_block_pool);
548}
549
550static void inode_free_block(struct super_block *sb, struct logfs_block *block)
551{
552 struct inode *inode = block->inode;
553
554 logfs_inode(inode)->li_block = NULL;
555 __free_block(sb, block);
556}
557
558static void indirect_free_block(struct super_block *sb,
559 struct logfs_block *block)
560{
561 struct page *page = block->page;
562
563 if (PagePrivate(page)) {
564 ClearPagePrivate(page);
565 put_page(page);
566 set_page_private(page, 0);
567 }
568 __free_block(sb, block);
569}
570
571
572static const struct logfs_block_ops inode_block_ops = {
573 .write_block = inode_write_block,
574 .free_block = inode_free_block,
575 .write_alias = inode_write_alias,
576};
577
578const struct logfs_block_ops indirect_block_ops = {
579 .write_block = indirect_write_block,
580 .free_block = indirect_free_block,
581 .write_alias = indirect_write_alias,
582};
583
584struct logfs_block *__alloc_block(struct super_block *sb,
585 u64 ino, u64 bix, level_t level)
586{
587 struct logfs_super *super = logfs_super(sb);
588 struct logfs_block *block;
589
590 block = mempool_alloc(super->s_block_pool, GFP_NOFS);
591 memset(block, 0, sizeof(*block));
592 INIT_LIST_HEAD(&block->alias_list);
593 INIT_LIST_HEAD(&block->item_list);
594 block->sb = sb;
595 block->ino = ino;
596 block->bix = bix;
597 block->level = level;
598 return block;
599}
600
601static void alloc_inode_block(struct inode *inode)
602{
603 struct logfs_inode *li = logfs_inode(inode);
604 struct logfs_block *block;
605
606 if (li->li_block)
607 return;
608
609 block = __alloc_block(inode->i_sb, LOGFS_INO_MASTER, inode->i_ino, 0);
610 block->inode = inode;
611 li->li_block = block;
612 block->ops = &inode_block_ops;
613}
614
615void initialize_block_counters(struct page *page, struct logfs_block *block,
616 __be64 *array, int page_is_empty)
617{
618 u64 ptr;
619 int i, start;
620
621 block->partial = 0;
622 block->full = 0;
623 start = 0;
624 if (page->index < first_indirect_block()) {
625 /* Counters are pointless on level 0 */
626 return;
627 }
628 if (page->index == first_indirect_block()) {
629 /* Skip unused pointers */
630 start = I0_BLOCKS;
631 block->full = I0_BLOCKS;
632 }
633 if (!page_is_empty) {
634 for (i = start; i < LOGFS_BLOCK_FACTOR; i++) {
635 ptr = be64_to_cpu(array[i]);
636 if (ptr)
637 block->partial++;
638 if (ptr & LOGFS_FULLY_POPULATED)
639 block->full++;
640 }
641 }
642}
643
644static void alloc_data_block(struct inode *inode, struct page *page)
645{
646 struct logfs_block *block;
647 u64 bix;
648 level_t level;
649
650 if (PagePrivate(page))
651 return;
652
653 logfs_unpack_index(page->index, &bix, &level);
654 block = __alloc_block(inode->i_sb, inode->i_ino, bix, level);
655 block->page = page;
656
657 SetPagePrivate(page);
658 get_page(page);
659 set_page_private(page, (unsigned long) block);
660
661 block->ops = &indirect_block_ops;
662}
663
664static void alloc_indirect_block(struct inode *inode, struct page *page,
665 int page_is_empty)
666{
667 struct logfs_block *block;
668 __be64 *array;
669
670 if (PagePrivate(page))
671 return;
672
673 alloc_data_block(inode, page);
674
675 block = logfs_block(page);
676 array = kmap_atomic(page);
677 initialize_block_counters(page, block, array, page_is_empty);
678 kunmap_atomic(array);
679}
680
681static void block_set_pointer(struct page *page, int index, u64 ptr)
682{
683 struct logfs_block *block = logfs_block(page);
684 __be64 *array;
685 u64 oldptr;
686
687 BUG_ON(!block);
688 array = kmap_atomic(page);
689 oldptr = be64_to_cpu(array[index]);
690 array[index] = cpu_to_be64(ptr);
691 kunmap_atomic(array);
692 SetPageUptodate(page);
693
694 block->full += !!(ptr & LOGFS_FULLY_POPULATED)
695 - !!(oldptr & LOGFS_FULLY_POPULATED);
696 block->partial += !!ptr - !!oldptr;
697}
698
699static u64 block_get_pointer(struct page *page, int index)
700{
701 __be64 *block;
702 u64 ptr;
703
704 block = kmap_atomic(page);
705 ptr = be64_to_cpu(block[index]);
706 kunmap_atomic(block);
707 return ptr;
708}
709
710static int logfs_read_empty(struct page *page)
711{
712 zero_user_segment(page, 0, PAGE_SIZE);
713 return 0;
714}
715
716static int logfs_read_direct(struct inode *inode, struct page *page)
717{
718 struct logfs_inode *li = logfs_inode(inode);
719 pgoff_t index = page->index;
720 u64 block;
721
722 block = li->li_data[index];
723 if (!block)
724 return logfs_read_empty(page);
725
726 return logfs_segment_read(inode, page, block, index, 0);
727}
728
729static int logfs_read_loop(struct inode *inode, struct page *page,
730 int rw_context)
731{
732 struct logfs_inode *li = logfs_inode(inode);
733 u64 bix, bofs = li->li_data[INDIRECT_INDEX];
734 level_t level, target_level;
735 int ret;
736 struct page *ipage;
737
738 logfs_unpack_index(page->index, &bix, &target_level);
739 if (!bofs)
740 return logfs_read_empty(page);
741
742 if (bix >= maxbix(li->li_height))
743 return logfs_read_empty(page);
744
745 for (level = LEVEL(li->li_height);
746 (__force u8)level > (__force u8)target_level;
747			level = SUBLEVEL(level)) {
748 ipage = logfs_get_page(inode, bix, level, rw_context);
749 if (!ipage)
750 return -ENOMEM;
751
752 ret = logfs_segment_read(inode, ipage, bofs, bix, level);
753 if (ret) {
754 logfs_put_read_page(ipage);
755 return ret;
756 }
757
758 bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level)));
759 logfs_put_page(ipage, rw_context);
760 if (!bofs)
761 return logfs_read_empty(page);
762 }
763
764 return logfs_segment_read(inode, page, bofs, bix, 0);
765}
766
767static int logfs_read_block(struct inode *inode, struct page *page,
768 int rw_context)
769{
770 pgoff_t index = page->index;
771
772 if (index < I0_BLOCKS)
773 return logfs_read_direct(inode, page);
774 return logfs_read_loop(inode, page, rw_context);
775}
776
777static int logfs_exist_loop(struct inode *inode, u64 bix)
778{
779 struct logfs_inode *li = logfs_inode(inode);
780 u64 bofs = li->li_data[INDIRECT_INDEX];
781 level_t level;
782 int ret;
783 struct page *ipage;
784
785 if (!bofs)
786 return 0;
787 if (bix >= maxbix(li->li_height))
788 return 0;
789
790 for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
791 ipage = logfs_get_read_page(inode, bix, level);
792 if (!ipage)
793 return -ENOMEM;
794
795 ret = logfs_segment_read(inode, ipage, bofs, bix, level);
796 if (ret) {
797 logfs_put_read_page(ipage);
798 return ret;
799 }
800
801 bofs = block_get_pointer(ipage, get_bits(bix, SUBLEVEL(level)));
802 logfs_put_read_page(ipage);
803 if (!bofs)
804 return 0;
805 }
806
807 return 1;
808}
809
810int logfs_exist_block(struct inode *inode, u64 bix)
811{
812 struct logfs_inode *li = logfs_inode(inode);
813
814 if (bix < I0_BLOCKS)
815 return !!li->li_data[bix];
816 return logfs_exist_loop(inode, bix);
817}
818
819static u64 seek_holedata_direct(struct inode *inode, u64 bix, int data)
820{
821 struct logfs_inode *li = logfs_inode(inode);
822
823 for (; bix < I0_BLOCKS; bix++)
824 if (data ^ (li->li_data[bix] == 0))
825 return bix;
826 return I0_BLOCKS;
827}
828
829static u64 seek_holedata_loop(struct inode *inode, u64 bix, int data)
830{
831 struct logfs_inode *li = logfs_inode(inode);
832 __be64 *rblock;
833 u64 increment, bofs = li->li_data[INDIRECT_INDEX];
834 level_t level;
835 int ret, slot;
836 struct page *page;
837
838 BUG_ON(!bofs);
839
840 for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
841 increment = 1 << (LOGFS_BLOCK_BITS * ((__force u8)level-1));
842 page = logfs_get_read_page(inode, bix, level);
843 if (!page)
844 return bix;
845
846 ret = logfs_segment_read(inode, page, bofs, bix, level);
847 if (ret) {
848 logfs_put_read_page(page);
849 return bix;
850 }
851
852 slot = get_bits(bix, SUBLEVEL(level));
853 rblock = kmap_atomic(page);
854 while (slot < LOGFS_BLOCK_FACTOR) {
855 if (data && (rblock[slot] != 0))
856 break;
857 if (!data && !(be64_to_cpu(rblock[slot]) & LOGFS_FULLY_POPULATED))
858 break;
859 slot++;
860 bix += increment;
861 bix &= ~(increment - 1);
862 }
863 if (slot >= LOGFS_BLOCK_FACTOR) {
864 kunmap_atomic(rblock);
865 logfs_put_read_page(page);
866 return bix;
867 }
868 bofs = be64_to_cpu(rblock[slot]);
869 kunmap_atomic(rblock);
870 logfs_put_read_page(page);
871 if (!bofs) {
872 BUG_ON(data);
873 return bix;
874 }
875 }
876 return bix;
877}
878
879/**
880 * logfs_seek_hole - find next hole starting at a given block index
881 * @inode: inode to search in
882 * @bix: block index to start searching
883 *
884 * Returns next hole. If the file doesn't contain any further holes, the
885 * block address next to eof is returned instead.
886 */
887u64 logfs_seek_hole(struct inode *inode, u64 bix)
888{
889 struct logfs_inode *li = logfs_inode(inode);
890
891 if (bix < I0_BLOCKS) {
892 bix = seek_holedata_direct(inode, bix, 0);
893 if (bix < I0_BLOCKS)
894 return bix;
895 }
896
897 if (!li->li_data[INDIRECT_INDEX])
898 return bix;
899 else if (li->li_data[INDIRECT_INDEX] & LOGFS_FULLY_POPULATED)
900 bix = maxbix(li->li_height);
901 else if (bix >= maxbix(li->li_height))
902 return bix;
903 else {
904 bix = seek_holedata_loop(inode, bix, 0);
905 if (bix < maxbix(li->li_height))
906 return bix;
907 /* Should not happen anymore. But if some port writes semi-
908 * corrupt images (as this one used to) we might run into it.
909 */
910 WARN_ON_ONCE(bix == maxbix(li->li_height));
911 }
912
913 return bix;
914}
915
916static u64 __logfs_seek_data(struct inode *inode, u64 bix)
917{
918 struct logfs_inode *li = logfs_inode(inode);
919
920 if (bix < I0_BLOCKS) {
921 bix = seek_holedata_direct(inode, bix, 1);
922 if (bix < I0_BLOCKS)
923 return bix;
924 }
925
926 if (bix < maxbix(li->li_height)) {
927 if (!li->li_data[INDIRECT_INDEX])
928 bix = maxbix(li->li_height);
929 else
930 return seek_holedata_loop(inode, bix, 1);
931 }
932
933 return bix;
934}
935
936/**
937 * logfs_seek_data - find next data block after a given block index
938 * @inode: inode to search in
939 * @bix: block index to start searching
940 *
941 * Returns next data block. If the file doesn't contain any further data
942 * blocks, the last block in the file is returned instead.
943 */
944u64 logfs_seek_data(struct inode *inode, u64 bix)
945{
946 struct super_block *sb = inode->i_sb;
947 u64 ret, end;
948
949 ret = __logfs_seek_data(inode, bix);
950 end = i_size_read(inode) >> sb->s_blocksize_bits;
951 if (ret >= end)
952 ret = max(bix, end);
953 return ret;
954}
955
956static int logfs_is_valid_direct(struct logfs_inode *li, u64 bix, u64 ofs)
957{
958 return pure_ofs(li->li_data[bix]) == ofs;
959}
960
961static int __logfs_is_valid_loop(struct inode *inode, u64 bix,
962 u64 ofs, u64 bofs)
963{
964 struct logfs_inode *li = logfs_inode(inode);
965 level_t level;
966 int ret;
967 struct page *page;
968
969	for (level = LEVEL(li->li_height); level != 0; level = SUBLEVEL(level)) {
970 page = logfs_get_write_page(inode, bix, level);
971 BUG_ON(!page);
972
973 ret = logfs_segment_read(inode, page, bofs, bix, level);
974 if (ret) {
975 logfs_put_write_page(page);
976 return 0;
977 }
978
979 bofs = block_get_pointer(page, get_bits(bix, SUBLEVEL(level)));
980 logfs_put_write_page(page);
981 if (!bofs)
982 return 0;
983
984 if (pure_ofs(bofs) == ofs)
985 return 1;
986 }
987 return 0;
988}
989
990static int logfs_is_valid_loop(struct inode *inode, u64 bix, u64 ofs)
991{
992 struct logfs_inode *li = logfs_inode(inode);
993 u64 bofs = li->li_data[INDIRECT_INDEX];
994
995 if (!bofs)
996 return 0;
997
998 if (bix >= maxbix(li->li_height))
999 return 0;
1000
1001 if (pure_ofs(bofs) == ofs)
1002 return 1;
1003
1004 return __logfs_is_valid_loop(inode, bix, ofs, bofs);
1005}
1006
1007static int __logfs_is_valid_block(struct inode *inode, u64 bix, u64 ofs)
1008{
1009 struct logfs_inode *li = logfs_inode(inode);
1010
1011 if ((inode->i_nlink == 0) && atomic_read(&inode->i_count) == 1)
1012 return 0;
1013
1014 if (bix < I0_BLOCKS)
1015 return logfs_is_valid_direct(li, bix, ofs);
1016 return logfs_is_valid_loop(inode, bix, ofs);
1017}
1018
1019/**
1020 * logfs_is_valid_block - check whether this block is still valid
1021 *
1022 * @sb: superblock
1023 * @ofs: block physical offset
1024 * @ino: block inode number
1025 * @bix: block index
1026 * @gc_level: block level
1027 *
1028 * Returns 0 if the block is invalid, 1 if it is valid and 2 if it will
1029 * become invalid once the journal is written.
1030 */
1031int logfs_is_valid_block(struct super_block *sb, u64 ofs, u64 ino, u64 bix,
1032 gc_level_t gc_level)
1033{
1034 struct logfs_super *super = logfs_super(sb);
1035 struct inode *inode;
1036 int ret, cookie;
1037
1038 /* Umount closes a segment with free blocks remaining. Those
1039 * blocks are by definition invalid. */
1040 if (ino == -1)
1041 return 0;
1042
1043 LOGFS_BUG_ON((u64)(u_long)ino != ino, sb);
1044
1045 inode = logfs_safe_iget(sb, ino, &cookie);
1046 if (IS_ERR(inode))
1047 goto invalid;
1048
1049 ret = __logfs_is_valid_block(inode, bix, ofs);
1050 logfs_safe_iput(inode, cookie);
1051 if (ret)
1052 return ret;
1053
1054invalid:
1055 /* Block is nominally invalid, but may still sit in the shadow tree,
1056 * waiting for a journal commit.
1057 */
1058 if (btree_lookup64(&super->s_shadow_tree.old, ofs))
1059 return 2;
1060 return 0;
1061}
1062
1063int logfs_readpage_nolock(struct page *page)
1064{
1065 struct inode *inode = page->mapping->host;
1066	int ret;
1067
1068 ret = logfs_read_block(inode, page, READ);
1069
1070 if (ret) {
1071 ClearPageUptodate(page);
1072 SetPageError(page);
1073 } else {
1074 SetPageUptodate(page);
1075 ClearPageError(page);
1076 }
1077 flush_dcache_page(page);
1078
1079 return ret;
1080}
1081
1082static int logfs_reserve_bytes(struct inode *inode, int bytes)
1083{
1084 struct logfs_super *super = logfs_super(inode->i_sb);
1085 u64 available = super->s_free_bytes + super->s_dirty_free_bytes
1086 - super->s_dirty_used_bytes - super->s_dirty_pages;
1087
1088 if (!bytes)
1089 return 0;
1090
1091 if (available < bytes)
1092 return -ENOSPC;
1093
1094 if (available < bytes + super->s_root_reserve &&
1095 !capable(CAP_SYS_RESOURCE))
1096 return -ENOSPC;
1097
1098 return 0;
1099}
1100
1101int get_page_reserve(struct inode *inode, struct page *page)
1102{
1103 struct logfs_super *super = logfs_super(inode->i_sb);
1104 struct logfs_block *block = logfs_block(page);
1105 int ret;
1106
1107 if (block && block->reserved_bytes)
1108 return 0;
1109
1110 logfs_get_wblocks(inode->i_sb, page, WF_LOCK);
1111 while ((ret = logfs_reserve_bytes(inode, 6 * LOGFS_MAX_OBJECTSIZE)) &&
1112 !list_empty(&super->s_writeback_list)) {
1113 block = list_entry(super->s_writeback_list.next,
1114 struct logfs_block, alias_list);
1115 block->ops->write_block(block);
1116 }
1117 if (!ret) {
1118 alloc_data_block(inode, page);
1119 block = logfs_block(page);
1120 block->reserved_bytes += 6 * LOGFS_MAX_OBJECTSIZE;
1121 super->s_dirty_pages += 6 * LOGFS_MAX_OBJECTSIZE;
1122 list_move_tail(&block->alias_list, &super->s_writeback_list);
1123 }
1124 logfs_put_wblocks(inode->i_sb, page, WF_LOCK);
1125 return ret;
1126}
1127
1128/*
1129 * We are protected by write lock. Push victims up to superblock level
1130 * and release transaction when appropriate.
1131 */
1132/* FIXME: This is currently called from the wrong spots. */
1133static void logfs_handle_transaction(struct inode *inode,
1134 struct logfs_transaction *ta)
1135{
1136 struct logfs_super *super = logfs_super(inode->i_sb);
1137
1138 if (!ta)
1139 return;
1140 logfs_inode(inode)->li_block->ta = NULL;
1141
1142 if (inode->i_ino != LOGFS_INO_MASTER) {
1143 BUG(); /* FIXME: Yes, this needs more thought */
1144 /* just remember the transaction until inode is written */
1145 //BUG_ON(logfs_inode(inode)->li_transaction);
1146 //logfs_inode(inode)->li_transaction = ta;
1147 return;
1148 }
1149
1150 switch (ta->state) {
1151 case CREATE_1: /* fall through */
1152 case UNLINK_1:
1153 BUG_ON(super->s_victim_ino);
1154 super->s_victim_ino = ta->ino;
1155 break;
1156 case CREATE_2: /* fall through */
1157 case UNLINK_2:
1158 BUG_ON(super->s_victim_ino != ta->ino);
1159 super->s_victim_ino = 0;
1160 /* transaction ends here - free it */
1161 kfree(ta);
1162 break;
1163 case CROSS_RENAME_1:
1164 BUG_ON(super->s_rename_dir);
1165 BUG_ON(super->s_rename_pos);
1166 super->s_rename_dir = ta->dir;
1167 super->s_rename_pos = ta->pos;
1168 break;
1169 case CROSS_RENAME_2:
1170 BUG_ON(super->s_rename_dir != ta->dir);
1171 BUG_ON(super->s_rename_pos != ta->pos);
1172 super->s_rename_dir = 0;
1173 super->s_rename_pos = 0;
1174 kfree(ta);
1175 break;
1176 case TARGET_RENAME_1:
1177 BUG_ON(super->s_rename_dir);
1178 BUG_ON(super->s_rename_pos);
1179 BUG_ON(super->s_victim_ino);
1180 super->s_rename_dir = ta->dir;
1181 super->s_rename_pos = ta->pos;
1182 super->s_victim_ino = ta->ino;
1183 break;
1184 case TARGET_RENAME_2:
1185 BUG_ON(super->s_rename_dir != ta->dir);
1186 BUG_ON(super->s_rename_pos != ta->pos);
1187 BUG_ON(super->s_victim_ino != ta->ino);
1188 super->s_rename_dir = 0;
1189 super->s_rename_pos = 0;
1190 break;
1191 case TARGET_RENAME_3:
1192 BUG_ON(super->s_rename_dir);
1193 BUG_ON(super->s_rename_pos);
1194 BUG_ON(super->s_victim_ino != ta->ino);
1195 super->s_victim_ino = 0;
1196 kfree(ta);
1197 break;
1198 default:
1199 BUG();
1200 }
1201}
1202
1203/*
1204 * Not strictly a reservation, but rather a check that we still have enough
1205 * space to satisfy the write.
1206 */
1207static int logfs_reserve_blocks(struct inode *inode, int blocks)
1208{
1209 return logfs_reserve_bytes(inode, blocks * LOGFS_MAX_OBJECTSIZE);
1210}
1211
1212struct write_control {
1213 u64 ofs;
1214 long flags;
1215};
1216
1217static struct logfs_shadow *alloc_shadow(struct inode *inode, u64 bix,
1218 level_t level, u64 old_ofs)
1219{
1220 struct logfs_super *super = logfs_super(inode->i_sb);
1221 struct logfs_shadow *shadow;
1222
1223 shadow = mempool_alloc(super->s_shadow_pool, GFP_NOFS);
1224 memset(shadow, 0, sizeof(*shadow));
1225 shadow->ino = inode->i_ino;
1226 shadow->bix = bix;
1227 shadow->gc_level = expand_level(inode->i_ino, level);
1228 shadow->old_ofs = old_ofs & ~LOGFS_FULLY_POPULATED;
1229 return shadow;
1230}
1231
1232static void free_shadow(struct inode *inode, struct logfs_shadow *shadow)
1233{
1234 struct logfs_super *super = logfs_super(inode->i_sb);
1235
1236 mempool_free(shadow, super->s_shadow_pool);
1237}
1238
1239static void mark_segment(struct shadow_tree *tree, u32 segno)
1240{
1241 int err;
1242
1243 if (!btree_lookup32(&tree->segment_map, segno)) {
1244 err = btree_insert32(&tree->segment_map, segno, (void *)1,
1245 GFP_NOFS);
1246 BUG_ON(err);
1247 tree->no_shadowed_segments++;
1248 }
1249}
1250
1251/**
1252 * fill_shadow_tree - Propagate shadow tree changes due to a write
1253 * @inode: Inode owning the page
1254 * @page: Struct page that was written
1255 * @shadow: Shadow for the current write
1256 *
1257 * Writes in logfs can result in two semi-valid objects. The old object
1258 * is still valid as long as it can be reached by following pointers on
1259 * the medium. Only when writes propagate all the way up to the journal
1260 * has the new object safely replaced the old one.
1261 *
1262 * To handle this problem, a struct logfs_shadow is used to represent
1263 * every single write. It is attached to the indirect block, which is
1264 * marked dirty. When the indirect block is written, its shadows are
1265 * handed up to the next indirect block (or inode). Ultimately they
1266 * will reach the master inode and be freed upon journal commit.
1267 *
1268 * This function handles a single step in the propagation. It adds the
1269 * shadow for the current write to the tree, along with any shadows in
1270 * the page's tree, in case it was an indirect block. If a page is
1271 * written, the inode parameter is left NULL; if an inode is written,
1272 * the page parameter is left NULL.
1273 */
1274static void fill_shadow_tree(struct inode *inode, struct page *page,
1275 struct logfs_shadow *shadow)
1276{
1277 struct logfs_super *super = logfs_super(inode->i_sb);
1278 struct logfs_block *block = logfs_block(page);
1279 struct shadow_tree *tree = &super->s_shadow_tree;
1280
1281 if (PagePrivate(page)) {
1282 if (block->alias_map)
1283 super->s_no_object_aliases -= bitmap_weight(
1284 block->alias_map, LOGFS_BLOCK_FACTOR);
1285 logfs_handle_transaction(inode, block->ta);
1286 block->ops->free_block(inode->i_sb, block);
1287 }
1288 if (shadow) {
1289 if (shadow->old_ofs)
1290 btree_insert64(&tree->old, shadow->old_ofs, shadow,
1291 GFP_NOFS);
1292 else
1293 btree_insert64(&tree->new, shadow->new_ofs, shadow,
1294 GFP_NOFS);
1295
1296 super->s_dirty_used_bytes += shadow->new_len;
1297 super->s_dirty_free_bytes += shadow->old_len;
1298 mark_segment(tree, shadow->old_ofs >> super->s_segshift);
1299 mark_segment(tree, shadow->new_ofs >> super->s_segshift);
1300 }
1301}
1302
1303static void logfs_set_alias(struct super_block *sb, struct logfs_block *block,
1304 long child_no)
1305{
1306 struct logfs_super *super = logfs_super(sb);
1307
1308 if (block->inode && block->inode->i_ino == LOGFS_INO_MASTER) {
1309 /* Aliases in the master inode are pointless. */
1310 return;
1311 }
1312
1313 if (!test_bit(child_no, block->alias_map)) {
1314 set_bit(child_no, block->alias_map);
1315 super->s_no_object_aliases++;
1316 }
1317 list_move_tail(&block->alias_list, &super->s_object_alias);
1318}
1319
1320/*
1321 * Object aliases can and often do change the size and occupied space of a
1322 * file. So not only do we have to change the pointers, we also have to
1323 * change inode->i_size and li->li_used_bytes. This is done by setting
1324 * two more object aliases for the inode itself.
1325 */
1326static void set_iused(struct inode *inode, struct logfs_shadow *shadow)
1327{
1328 struct logfs_inode *li = logfs_inode(inode);
1329
1330 if (shadow->new_len == shadow->old_len)
1331 return;
1332
1333 alloc_inode_block(inode);
1334 li->li_used_bytes += shadow->new_len - shadow->old_len;
1335 __logfs_set_blocks(inode);
1336 logfs_set_alias(inode->i_sb, li->li_block, INODE_USED_OFS);
1337 logfs_set_alias(inode->i_sb, li->li_block, INODE_SIZE_OFS);
1338}
1339
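/*
 * Note on the wc protocol: wc->ofs carries the old block offset on entry
 * and is replaced by the new offset on return. LOGFS_FULLY_POPULATED is
 * or'ed into the result when nothing below the block can require another
 * write - always for data blocks, and for indirect blocks once every
 * pointer in them is set.
 */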
1340static int logfs_write_i0(struct inode *inode, struct page *page,
1341 struct write_control *wc)
1342{
1343 struct logfs_shadow *shadow;
1344 u64 bix;
1345 level_t level;
1346 int full, err = 0;
1347
1348 logfs_unpack_index(page->index, &bix, &level);
1349 if (wc->ofs == 0)
1350 if (logfs_reserve_blocks(inode, 1))
1351 return -ENOSPC;
1352
1353 shadow = alloc_shadow(inode, bix, level, wc->ofs);
1354 if (wc->flags & WF_WRITE)
1355 err = logfs_segment_write(inode, page, shadow);
1356 if (wc->flags & WF_DELETE)
1357 logfs_segment_delete(inode, shadow);
1358 if (err) {
1359 free_shadow(inode, shadow);
1360 return err;
1361 }
1362
1363 set_iused(inode, shadow);
1364 full = 1;
1365 if (level != 0) {
1366 alloc_indirect_block(inode, page, 0);
1367 full = logfs_block(page)->full == LOGFS_BLOCK_FACTOR;
1368 }
1369 fill_shadow_tree(inode, page, shadow);
1370 wc->ofs = shadow->new_ofs;
1371 if (wc->ofs && full)
1372 wc->ofs |= LOGFS_FULLY_POPULATED;
1373 return 0;
1374}
1375
1376static int logfs_write_direct(struct inode *inode, struct page *page,
1377 long flags)
1378{
1379 struct logfs_inode *li = logfs_inode(inode);
1380 struct write_control wc = {
1381 .ofs = li->li_data[page->index],
1382 .flags = flags,
1383 };
1384 int err;
1385
1386 alloc_inode_block(inode);
1387
1388 err = logfs_write_i0(inode, page, &wc);
1389 if (err)
1390 return err;
1391
1392 li->li_data[page->index] = wc.ofs;
1393 logfs_set_alias(inode->i_sb, li->li_block,
1394 page->index + INODE_POINTER_OFS);
1395 return 0;
1396}
1397
1398static int ptr_change(u64 ofs, struct page *page)
1399{
1400 struct logfs_block *block = logfs_block(page);
1401 int empty0, empty1, full0, full1;
1402
1403 empty0 = ofs == 0;
1404 empty1 = block->partial == 0;
1405 if (empty0 != empty1)
1406 return 1;
1407
1408 /* The !! is necessary to shrink result to int */
1409 full0 = !!(ofs & LOGFS_FULLY_POPULATED);
1410 full1 = block->full == LOGFS_BLOCK_FACTOR;
1411 if (full0 != full1)
1412 return 1;
1413 return 0;
1414}
1415
1416static int __logfs_write_rec(struct inode *inode, struct page *page,
1417 struct write_control *this_wc,
1418 pgoff_t bix, level_t target_level, level_t level)
1419{
1420 int ret, page_empty = 0;
1421 int child_no = get_bits(bix, SUBLEVEL(level));
1422 struct page *ipage;
1423 struct write_control child_wc = {
1424 .flags = this_wc->flags,
1425 };
1426
1427 ipage = logfs_get_write_page(inode, bix, level);
1428 if (!ipage)
1429 return -ENOMEM;
1430
1431 if (this_wc->ofs) {
1432 ret = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level);
1433 if (ret)
1434 goto out;
1435 } else if (!PageUptodate(ipage)) {
1436 page_empty = 1;
1437 logfs_read_empty(ipage);
1438 }
1439
1440 child_wc.ofs = block_get_pointer(ipage, child_no);
1441
1442 if ((__force u8)level-1 > (__force u8)target_level)
1443 ret = __logfs_write_rec(inode, page, &child_wc, bix,
1444 target_level, SUBLEVEL(level));
1445 else
1446 ret = logfs_write_i0(inode, page, &child_wc);
1447
1448 if (ret)
1449 goto out;
1450
1451 alloc_indirect_block(inode, ipage, page_empty);
1452 block_set_pointer(ipage, child_no, child_wc.ofs);
1453 /* FIXME: first condition seems superfluous */
1454 if (child_wc.ofs || logfs_block(ipage)->partial)
1455 this_wc->flags |= WF_WRITE;
1456 /* the condition on this_wc->ofs ensures that we won't consume extra
1457 * space for indirect blocks in the future, which we cannot reserve */
1458 if (!this_wc->ofs || ptr_change(this_wc->ofs, ipage))
1459 ret = logfs_write_i0(inode, ipage, this_wc);
1460 else
1461 logfs_set_alias(inode->i_sb, logfs_block(ipage), child_no);
1462out:
1463 logfs_put_write_page(ipage);
1464 return ret;
1465}
1466
1467static int logfs_write_rec(struct inode *inode, struct page *page,
1468 pgoff_t bix, level_t target_level, long flags)
1469{
1470 struct logfs_inode *li = logfs_inode(inode);
1471 struct write_control wc = {
1472 .ofs = li->li_data[INDIRECT_INDEX],
1473 .flags = flags,
1474 };
1475 int ret;
1476
1477 alloc_inode_block(inode);
1478
1479 if (li->li_height > (__force u8)target_level)
1480 ret = __logfs_write_rec(inode, page, &wc, bix, target_level,
1481 LEVEL(li->li_height));
1482 else
1483 ret = logfs_write_i0(inode, page, &wc);
1484 if (ret)
1485 return ret;
1486
1487 if (li->li_data[INDIRECT_INDEX] != wc.ofs) {
1488 li->li_data[INDIRECT_INDEX] = wc.ofs;
1489 logfs_set_alias(inode->i_sb, li->li_block,
1490 INDIRECT_INDEX + INODE_POINTER_OFS);
1491 }
1492 return ret;
1493}
1494
1495void logfs_add_transaction(struct inode *inode, struct logfs_transaction *ta)
1496{
1497 alloc_inode_block(inode);
1498 logfs_inode(inode)->li_block->ta = ta;
1499}
1500
1501void logfs_del_transaction(struct inode *inode, struct logfs_transaction *ta)
1502{
1503 struct logfs_block *block = logfs_inode(inode)->li_block;
1504
1505 if (block && block->ta)
1506 block->ta = NULL;
1507}
1508
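/*
 * Grow the indirect tree one level at a time until bix fits, hooking the
 * old top-level block in as slot 0 of each new root.
 */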
1509static int grow_inode(struct inode *inode, u64 bix, level_t level)
1510{
1511 struct logfs_inode *li = logfs_inode(inode);
1512 u8 height = (__force u8)level;
1513 struct page *page;
1514 struct write_control wc = {
1515 .flags = WF_WRITE,
1516 };
1517 int err;
1518
1519 BUG_ON(height > 5 || li->li_height > 5);
1520 while (height > li->li_height || bix >= maxbix(li->li_height)) {
1521 page = logfs_get_write_page(inode, I0_BLOCKS + 1,
1522 LEVEL(li->li_height + 1));
1523 if (!page)
1524 return -ENOMEM;
1525 logfs_read_empty(page);
1526 alloc_indirect_block(inode, page, 1);
1527 block_set_pointer(page, 0, li->li_data[INDIRECT_INDEX]);
1528 err = logfs_write_i0(inode, page, &wc);
1529 logfs_put_write_page(page);
1530 if (err)
1531 return err;
1532 li->li_data[INDIRECT_INDEX] = wc.ofs;
1533 wc.ofs = 0;
1534 li->li_height++;
1535 logfs_set_alias(inode->i_sb, li->li_block, INODE_HEIGHT_OFS);
1536 }
1537 return 0;
1538}
1539
1540static int __logfs_write_buf(struct inode *inode, struct page *page, long flags)
1541{
1542 struct logfs_super *super = logfs_super(inode->i_sb);
1543 pgoff_t index = page->index;
1544 u64 bix;
1545 level_t level;
1546 int err;
1547
1548 flags |= WF_WRITE | WF_DELETE;
1549 inode->i_ctime = inode->i_mtime = current_time(inode);
1550
1551 logfs_unpack_index(index, &bix, &level);
1552 if (logfs_block(page) && logfs_block(page)->reserved_bytes)
1553 super->s_dirty_pages -= logfs_block(page)->reserved_bytes;
1554
1555 if (index < I0_BLOCKS)
1556 return logfs_write_direct(inode, page, flags);
1557
1558 bix = adjust_bix(bix, level);
1559 err = grow_inode(inode, bix, level);
1560 if (err)
1561 return err;
1562 return logfs_write_rec(inode, page, bix, level, flags);
1563}
1564
1565int logfs_write_buf(struct inode *inode, struct page *page, long flags)
1566{
1567 struct super_block *sb = inode->i_sb;
1568 int ret;
1569
1570 logfs_get_wblocks(sb, page, flags & WF_LOCK);
1571 ret = __logfs_write_buf(inode, page, flags);
1572 logfs_put_wblocks(sb, page, flags & WF_LOCK);
1573 return ret;
1574}
1575
1576static int __logfs_delete(struct inode *inode, struct page *page)
1577{
1578 long flags = WF_DELETE;
1579 int err;
1580
1581 inode->i_ctime = inode->i_mtime = current_time(inode);
1582
1583 if (page->index < I0_BLOCKS)
1584 return logfs_write_direct(inode, page, flags);
1585 err = grow_inode(inode, page->index, 0);
1586 if (err)
1587 return err;
1588 return logfs_write_rec(inode, page, page->index, 0, flags);
1589}
1590
1591int logfs_delete(struct inode *inode, pgoff_t index,
1592 struct shadow_tree *shadow_tree)
1593{
1594 struct super_block *sb = inode->i_sb;
1595 struct page *page;
1596 int ret;
1597
1598 page = logfs_get_read_page(inode, index, 0);
1599 if (!page)
1600 return -ENOMEM;
1601
1602 logfs_get_wblocks(sb, page, 1);
1603 ret = __logfs_delete(inode, page);
1604 logfs_put_wblocks(sb, page, 1);
1605
1606 logfs_put_read_page(page);
1607
1608 return ret;
1609}
1610
1611int logfs_rewrite_block(struct inode *inode, u64 bix, u64 ofs,
1612 gc_level_t gc_level, long flags)
1613{
1614 level_t level = shrink_level(gc_level);
1615 struct page *page;
1616 int err;
1617
1618 page = logfs_get_write_page(inode, bix, level);
1619 if (!page)
1620 return -ENOMEM;
1621
1622 err = logfs_segment_read(inode, page, ofs, bix, level);
1623 if (!err) {
1624 if (level != 0)
1625 alloc_indirect_block(inode, page, 0);
1626 err = logfs_write_buf(inode, page, flags);
1627 if (!err && shrink_level(gc_level) == 0) {
1628 /* Rewrite cannot mark the inode dirty but has to
1629 * write it immediately.
1630 * Q: Can't we just create an alias for the inode
1631 * instead? And if not, why not?
1632 */
1633 if (inode->i_ino == LOGFS_INO_MASTER)
1634 logfs_write_anchor(inode->i_sb);
1635 else {
1636 err = __logfs_write_inode(inode, page, flags);
1637 }
1638 }
1639 }
1640 logfs_put_write_page(page);
1641 return err;
1642}
1643
1644static int truncate_data_block(struct inode *inode, struct page *page,
1645 u64 ofs, struct logfs_shadow *shadow, u64 size)
1646{
1647	loff_t pageofs = (loff_t)page->index << inode->i_sb->s_blocksize_bits;
1648 u64 bix;
1649 level_t level;
1650 int err;
1651
1652 /* Does truncation happen within this page? */
1653 if (size <= pageofs || size - pageofs >= PAGE_SIZE)
1654 return 0;
1655
1656 logfs_unpack_index(page->index, &bix, &level);
1657 BUG_ON(level != 0);
1658
1659 err = logfs_segment_read(inode, page, ofs, bix, level);
1660 if (err)
1661 return err;
1662
1663 zero_user_segment(page, size - pageofs, PAGE_SIZE);
1664 return logfs_segment_write(inode, page, shadow);
1665}
1666
1667static int logfs_truncate_i0(struct inode *inode, struct page *page,
1668 struct write_control *wc, u64 size)
1669{
1670 struct logfs_shadow *shadow;
1671 u64 bix;
1672 level_t level;
1673 int err = 0;
1674
1675 logfs_unpack_index(page->index, &bix, &level);
1676 BUG_ON(level != 0);
1677 shadow = alloc_shadow(inode, bix, level, wc->ofs);
1678
1679 err = truncate_data_block(inode, page, wc->ofs, shadow, size);
1680 if (err) {
1681 free_shadow(inode, shadow);
1682 return err;
1683 }
1684
1685 logfs_segment_delete(inode, shadow);
1686 set_iused(inode, shadow);
1687 fill_shadow_tree(inode, page, shadow);
1688 wc->ofs = shadow->new_ofs;
1689 return 0;
1690}
1691
1692static int logfs_truncate_direct(struct inode *inode, u64 size)
1693{
1694 struct logfs_inode *li = logfs_inode(inode);
1695 struct write_control wc;
1696 struct page *page;
1697 int e;
1698 int err;
1699
1700 alloc_inode_block(inode);
1701
1702 for (e = I0_BLOCKS - 1; e >= 0; e--) {
1703 if (size > (e+1) * LOGFS_BLOCKSIZE)
1704 break;
1705
1706 wc.ofs = li->li_data[e];
1707 if (!wc.ofs)
1708 continue;
1709
1710 page = logfs_get_write_page(inode, e, 0);
1711 if (!page)
1712 return -ENOMEM;
1713 err = logfs_segment_read(inode, page, wc.ofs, e, 0);
1714 if (err) {
1715 logfs_put_write_page(page);
1716 return err;
1717 }
1718 err = logfs_truncate_i0(inode, page, &wc, size);
1719 logfs_put_write_page(page);
1720 if (err)
1721 return err;
1722
1723 li->li_data[e] = wc.ofs;
1724 }
1725 return 0;
1726}
1727
1728/* FIXME: these need to become per-sb once we support different blocksizes */
1729static u64 __logfs_step[] = {
1730 1,
1731 I1_BLOCKS,
1732 I2_BLOCKS,
1733 I3_BLOCKS,
1734};
1735
1736static u64 __logfs_start_index[] = {
1737 I0_BLOCKS,
1738 I1_BLOCKS,
1739 I2_BLOCKS,
1740 I3_BLOCKS
1741};
1742
1743static inline u64 logfs_step(level_t level)
1744{
1745 return __logfs_step[(__force u8)level];
1746}
1747
1748static inline u64 logfs_factor(u8 level)
1749{
1750 return __logfs_step[level] * LOGFS_BLOCKSIZE;
1751}
1752
1753static inline u64 logfs_start_index(level_t level)
1754{
1755 return __logfs_start_index[(__force u8)level];
1756}
1757
1758static void logfs_unpack_raw_index(pgoff_t index, u64 *bix, level_t *level)
1759{
1760 logfs_unpack_index(index, bix, level);
1761 if (*bix <= logfs_start_index(SUBLEVEL(*level)))
1762 *bix = 0;
1763}
1764
1765static int __logfs_truncate_rec(struct inode *inode, struct page *ipage,
1766 struct write_control *this_wc, u64 size)
1767{
1768 int truncate_happened = 0;
1769 int e, err = 0;
1770 u64 bix, child_bix, next_bix;
1771 level_t level;
1772 struct page *page;
1773 struct write_control child_wc = { /* FIXME: flags */ };
1774
1775 logfs_unpack_raw_index(ipage->index, &bix, &level);
1776 err = logfs_segment_read(inode, ipage, this_wc->ofs, bix, level);
1777 if (err)
1778 return err;
1779
1780 for (e = LOGFS_BLOCK_FACTOR - 1; e >= 0; e--) {
1781 child_bix = bix + e * logfs_step(SUBLEVEL(level));
1782 next_bix = child_bix + logfs_step(SUBLEVEL(level));
1783 if (size > next_bix * LOGFS_BLOCKSIZE)
1784 break;
1785
1786 child_wc.ofs = pure_ofs(block_get_pointer(ipage, e));
1787 if (!child_wc.ofs)
1788 continue;
1789
1790 page = logfs_get_write_page(inode, child_bix, SUBLEVEL(level));
1791 if (!page)
1792 return -ENOMEM;
1793
1794 if ((__force u8)level > 1)
1795 err = __logfs_truncate_rec(inode, page, &child_wc, size);
1796 else
1797 err = logfs_truncate_i0(inode, page, &child_wc, size);
1798 logfs_put_write_page(page);
1799 if (err)
1800 return err;
1801
1802 truncate_happened = 1;
1803 alloc_indirect_block(inode, ipage, 0);
1804 block_set_pointer(ipage, e, child_wc.ofs);
1805 }
1806
1807 if (!truncate_happened) {
1808		printk(KERN_WARNING "ineffectual truncate (%lx, %lx, %llx)\n", inode->i_ino, ipage->index, size);
1809 return 0;
1810 }
1811
1812 this_wc->flags = WF_DELETE;
1813 if (logfs_block(ipage)->partial)
1814 this_wc->flags |= WF_WRITE;
1815
1816 return logfs_write_i0(inode, ipage, this_wc);
1817}
1818
1819static int logfs_truncate_rec(struct inode *inode, u64 size)
1820{
1821 struct logfs_inode *li = logfs_inode(inode);
1822 struct write_control wc = {
1823 .ofs = li->li_data[INDIRECT_INDEX],
1824 };
1825 struct page *page;
1826 int err;
1827
1828 alloc_inode_block(inode);
1829
1830 if (!wc.ofs)
1831 return 0;
1832
1833 page = logfs_get_write_page(inode, 0, LEVEL(li->li_height));
1834 if (!page)
1835 return -ENOMEM;
1836
1837 err = __logfs_truncate_rec(inode, page, &wc, size);
1838 logfs_put_write_page(page);
1839 if (err)
1840 return err;
1841
1842 if (li->li_data[INDIRECT_INDEX] != wc.ofs)
1843 li->li_data[INDIRECT_INDEX] = wc.ofs;
1844 return 0;
1845}
1846
1847static int __logfs_truncate(struct inode *inode, u64 size)
1848{
1849 int ret;
1850
1851 if (size >= logfs_factor(logfs_inode(inode)->li_height))
1852 return 0;
1853
1854 ret = logfs_truncate_rec(inode, size);
1855 if (ret)
1856 return ret;
1857
1858 return logfs_truncate_direct(inode, size);
1859}
1860
1861/*
1862 * Truncate, by changing the segment file, can consume a fair amount
1863 * of resources. So back off from time to time and do some GC.
1864 * 8MiB, i.e. 2048 blocks at 4KiB blocksize, should be well within safety
1865 * limits even if every single block resided in a different segment.
1866 */
1867#define TRUNCATE_STEP (8 * 1024 * 1024)
1868int logfs_truncate(struct inode *inode, u64 target)
1869{
1870 struct super_block *sb = inode->i_sb;
1871 u64 size = i_size_read(inode);
1872 int err = 0;
1873
1874 size = ALIGN(size, TRUNCATE_STEP);
1875 while (size > target) {
1876 if (size > TRUNCATE_STEP)
1877 size -= TRUNCATE_STEP;
1878 else
1879 size = 0;
1880 if (size < target)
1881 size = target;
1882
1883 logfs_get_wblocks(sb, NULL, 1);
1884 err = __logfs_truncate(inode, size);
1885 if (!err)
1886 err = __logfs_write_inode(inode, NULL, 0);
1887 logfs_put_wblocks(sb, NULL, 1);
1888 }
1889
1890 if (!err) {
1891 err = inode_newsize_ok(inode, target);
1892 if (err)
1893 goto out;
1894
1895 truncate_setsize(inode, target);
1896 }
1897
1898 out:
1899 /* I don't trust error recovery yet. */
1900 WARN_ON(err);
1901 return err;
1902}
1903
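/*
 * A struct logfs_block is owned either by a page-cache page (for
 * indirect blocks) or by the in-memory inode itself. The two helpers
 * below transfer that ownership when an inode is read from or written
 * back through the master inode's page cache.
 */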
1904static void move_page_to_inode(struct inode *inode, struct page *page)
1905{
1906 struct logfs_inode *li = logfs_inode(inode);
1907 struct logfs_block *block = logfs_block(page);
1908
1909 if (!block)
1910 return;
1911
1912 log_blockmove("move_page_to_inode(%llx, %llx, %x)\n",
1913 block->ino, block->bix, block->level);
1914 BUG_ON(li->li_block);
1915 block->ops = &inode_block_ops;
1916 block->inode = inode;
1917 li->li_block = block;
1918
1919 block->page = NULL;
1920 if (PagePrivate(page)) {
1921 ClearPagePrivate(page);
1922 put_page(page);
1923 set_page_private(page, 0);
1924 }
1925}
1926
1927static void move_inode_to_page(struct page *page, struct inode *inode)
1928{
1929 struct logfs_inode *li = logfs_inode(inode);
1930 struct logfs_block *block = li->li_block;
1931
1932 if (!block)
1933 return;
1934
1935 log_blockmove("move_inode_to_page(%llx, %llx, %x)\n",
1936 block->ino, block->bix, block->level);
1937 BUG_ON(PagePrivate(page));
1938 block->ops = &indirect_block_ops;
1939 block->page = page;
1940
1941 if (!PagePrivate(page)) {
1942 SetPagePrivate(page);
1943 get_page(page);
1944 set_page_private(page, (unsigned long) block);
1945 }
1946
1947 block->inode = NULL;
1948 li->li_block = NULL;
1949}
1950
1951int logfs_read_inode(struct inode *inode)
1952{
1953 struct super_block *sb = inode->i_sb;
1954 struct logfs_super *super = logfs_super(sb);
1955 struct inode *master_inode = super->s_master_inode;
1956 struct page *page;
1957 struct logfs_disk_inode *di;
1958 u64 ino = inode->i_ino;
1959
1960 if (ino << sb->s_blocksize_bits > i_size_read(master_inode))
1961 return -ENODATA;
1962 if (!logfs_exist_block(master_inode, ino))
1963 return -ENODATA;
1964
1965 page = read_cache_page(master_inode->i_mapping, ino,
1966 (filler_t *)logfs_readpage, NULL);
1967 if (IS_ERR(page))
1968 return PTR_ERR(page);
1969
1970 di = kmap_atomic(page);
1971 logfs_disk_to_inode(di, inode);
1972 kunmap_atomic(di);
1973 move_page_to_inode(inode, page);
1974 put_page(page);
1975 return 0;
1976}
1977
1978/* Caller must logfs_put_write_page(page); */
1979static struct page *inode_to_page(struct inode *inode)
1980{
1981 struct inode *master_inode = logfs_super(inode->i_sb)->s_master_inode;
1982 struct logfs_disk_inode *di;
1983 struct page *page;
1984
1985 BUG_ON(inode->i_ino == LOGFS_INO_MASTER);
1986
1987 page = logfs_get_write_page(master_inode, inode->i_ino, 0);
1988 if (!page)
1989 return NULL;
1990
1991 di = kmap_atomic(page);
1992 logfs_inode_to_disk(inode, di);
1993 kunmap_atomic(di);
1994 move_inode_to_page(page, inode);
1995 return page;
1996}
1997
1998static int do_write_inode(struct inode *inode)
1999{
2000 struct super_block *sb = inode->i_sb;
2001 struct inode *master_inode = logfs_super(sb)->s_master_inode;
2002 loff_t size = (inode->i_ino + 1) << inode->i_sb->s_blocksize_bits;
2003 struct page *page;
2004 int err;
2005
2006 BUG_ON(inode->i_ino == LOGFS_INO_MASTER);
2007 /* FIXME: lock inode */
2008
2009 if (i_size_read(master_inode) < size)
2010 i_size_write(master_inode, size);
2011
2012 /* TODO: Tell vfs this inode is clean now */
2013
2014 page = inode_to_page(inode);
2015 if (!page)
2016 return -ENOMEM;
2017
2018 /* FIXME: transaction is part of logfs_block now. Is that enough? */
2019 err = logfs_write_buf(master_inode, page, 0);
2020 if (err)
2021 move_page_to_inode(inode, page);
2022
2023 logfs_put_write_page(page);
2024 return err;
2025}
2026
2027static void logfs_mod_segment_entry(struct super_block *sb, u32 segno,
2028 int write,
2029 void (*change_se)(struct logfs_segment_entry *, long),
2030 long arg)
2031{
2032 struct logfs_super *super = logfs_super(sb);
2033 struct inode *inode;
2034 struct page *page;
2035 struct logfs_segment_entry *se;
2036 pgoff_t page_no;
2037 int child_no;
2038
2039 page_no = segno >> (sb->s_blocksize_bits - 3);
2040 child_no = segno & ((sb->s_blocksize >> 3) - 1);
2041
2042 inode = super->s_segfile_inode;
2043 page = logfs_get_write_page(inode, page_no, 0);
2044 BUG_ON(!page); /* FIXME: We need some reserve page for this case */
2045 if (!PageUptodate(page))
2046 logfs_read_block(inode, page, WRITE);
2047
2048 if (write)
2049 alloc_indirect_block(inode, page, 0);
2050 se = kmap_atomic(page);
2051 change_se(se + child_no, arg);
2052 if (write) {
2053 logfs_set_alias(sb, logfs_block(page), child_no);
2054 BUG_ON((int)be32_to_cpu(se[child_no].valid) > super->s_segsize);
2055 }
2056 kunmap_atomic(se);
2057
2058 logfs_put_write_page(page);
2059}
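
The page_no/child_no computation above is plain fixed-size-record indexing: a segment entry is 8 bytes (hence the '>> 3'), so each block of the segment file holds blocksize/8 entries, and a segment number splits into a page index plus a slot within that page. A standalone sketch, assuming a 4KiB block size:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
    	const unsigned blocksize_bits = 12;	/* assumed 4KiB blocks */
    	const unsigned entry_shift = 3;		/* 8-byte segment entries */
    	uint32_t segno = 1234;

    	uint32_t page_no  = segno >> (blocksize_bits - entry_shift);
    	uint32_t child_no = segno &
    			((1u << (blocksize_bits - entry_shift)) - 1);

    	/* 512 entries per 4KiB page: 1234 = 2 * 512 + 210 */
    	printf("segno %u -> page %u, slot %u\n", segno, page_no, child_no);
    	return 0;
    }
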
2060
2061static void __get_segment_entry(struct logfs_segment_entry *se, long _target)
2062{
2063 struct logfs_segment_entry *target = (void *)_target;
2064
2065 *target = *se;
2066}
2067
2068void logfs_get_segment_entry(struct super_block *sb, u32 segno,
2069 struct logfs_segment_entry *se)
2070{
2071 logfs_mod_segment_entry(sb, segno, 0, __get_segment_entry, (long)se);
2072}
2073
2074static void __set_segment_used(struct logfs_segment_entry *se, long increment)
2075{
2076 u32 valid;
2077
2078 valid = be32_to_cpu(se->valid);
2079 valid += increment;
2080 se->valid = cpu_to_be32(valid);
2081}
2082
2083void logfs_set_segment_used(struct super_block *sb, u64 ofs, int increment)
2084{
2085 struct logfs_super *super = logfs_super(sb);
2086 u32 segno = ofs >> super->s_segshift;
2087
2088 if (!increment)
2089 return;
2090
2091 logfs_mod_segment_entry(sb, segno, 1, __set_segment_used, increment);
2092}
2093
2094static void __set_segment_erased(struct logfs_segment_entry *se, long ec_level)
2095{
2096 se->ec_level = cpu_to_be32(ec_level);
2097}
2098
2099void logfs_set_segment_erased(struct super_block *sb, u32 segno, u32 ec,
2100 gc_level_t gc_level)
2101{
2102 u32 ec_level = ec << 4 | (__force u8)gc_level;
2103
2104 logfs_mod_segment_entry(sb, segno, 1, __set_segment_erased, ec_level);
2105}
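
The ec_level field packs two values into 32 bits: the erase count in the upper 28 bits and the GC level in the low 4 bits, which is why ostore_get_erase_count() later recovers the count with 'ec_level >> 4'. A small pack/unpack sketch of that encoding, ignoring the on-disk endianness conversion:

    #include <assert.h>
    #include <stdint.h>

    static uint32_t pack_ec_level(uint32_t ec, uint8_t gc_level)
    {
    	return ec << 4 | (gc_level & 0xf);
    }

    static void unpack_ec_level(uint32_t ec_level, uint32_t *ec,
    			    uint8_t *gc_level)
    {
    	*ec = ec_level >> 4;
    	*gc_level = ec_level & 0xf;
    }

    int main(void)
    {
    	uint32_t ec;
    	uint8_t level;

    	unpack_ec_level(pack_ec_level(1000, 3), &ec, &level);
    	assert(ec == 1000 && level == 3);
    	return 0;
    }
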
2106
2107static void __set_segment_reserved(struct logfs_segment_entry *se, long ignore)
2108{
2109 se->valid = cpu_to_be32(RESERVED);
2110}
2111
2112void logfs_set_segment_reserved(struct super_block *sb, u32 segno)
2113{
2114 logfs_mod_segment_entry(sb, segno, 1, __set_segment_reserved, 0);
2115}
2116
2117static void __set_segment_unreserved(struct logfs_segment_entry *se,
2118 long ec_level)
2119{
2120 se->valid = 0;
2121 se->ec_level = cpu_to_be32(ec_level);
2122}
2123
2124void logfs_set_segment_unreserved(struct super_block *sb, u32 segno, u32 ec)
2125{
2126 u32 ec_level = ec << 4;
2127
2128 logfs_mod_segment_entry(sb, segno, 1, __set_segment_unreserved,
2129 ec_level);
2130}
2131
2132int __logfs_write_inode(struct inode *inode, struct page *page, long flags)
2133{
2134 struct super_block *sb = inode->i_sb;
2135 int ret;
2136
2137 logfs_get_wblocks(sb, page, flags & WF_LOCK);
2138 ret = do_write_inode(inode);
2139 logfs_put_wblocks(sb, page, flags & WF_LOCK);
2140 return ret;
2141}
2142
2143static int do_delete_inode(struct inode *inode)
2144{
2145 struct super_block *sb = inode->i_sb;
2146 struct inode *master_inode = logfs_super(sb)->s_master_inode;
2147 struct page *page;
2148 int ret;
2149
2150 page = logfs_get_write_page(master_inode, inode->i_ino, 0);
2151 if (!page)
2152 return -ENOMEM;
2153
2154 move_inode_to_page(page, inode);
2155
2156 logfs_get_wblocks(sb, page, 1);
2157 ret = __logfs_delete(master_inode, page);
2158 logfs_put_wblocks(sb, page, 1);
2159
2160 logfs_put_write_page(page);
2161 return ret;
2162}
2163
2164/*
2165 * ZOMBIE inodes have already been deleted and would remain dead, were
2166 * it not for the valid checking. No need to kill them again here.
2167 */
2168void logfs_evict_inode(struct inode *inode)
2169{
2170 struct super_block *sb = inode->i_sb;
2171 struct logfs_inode *li = logfs_inode(inode);
2172 struct logfs_block *block = li->li_block;
2173 struct page *page;
2174
2175 if (!inode->i_nlink) {
2176 if (!(li->li_flags & LOGFS_IF_ZOMBIE)) {
2177 li->li_flags |= LOGFS_IF_ZOMBIE;
2178 if (i_size_read(inode) > 0)
2179 logfs_truncate(inode, 0);
2180 do_delete_inode(inode);
2181 }
2182 }
2183 truncate_inode_pages_final(&inode->i_data);
2184 clear_inode(inode);
2185
2186 /* Cheaper version of write_inode. All changes are concealed in
2187 * aliases, which are moved back. No write to the medium happens.
2188 */
2189 /* Only deleted files may be dirty at this point */
2190 BUG_ON(inode->i_state & I_DIRTY && inode->i_nlink);
2191 if (!block)
2192 return;
2193 if ((logfs_super(sb)->s_flags & LOGFS_SB_FLAG_SHUTDOWN)) {
2194 block->ops->free_block(inode->i_sb, block);
2195 return;
2196 }
2197
2198 page = inode_to_page(inode);
2199 BUG_ON(!page); /* FIXME: Use emergency page */
2200 logfs_put_write_page(page);
2201}
2202
2203void btree_write_block(struct logfs_block *block)
2204{
2205 struct inode *inode;
2206 struct page *page;
2207 int err, cookie;
2208
2209 inode = logfs_safe_iget(block->sb, block->ino, &cookie);
2210 page = logfs_get_write_page(inode, block->bix, block->level);
2211
2212 err = logfs_readpage_nolock(page);
2213 BUG_ON(err);
2214 BUG_ON(!PagePrivate(page));
2215 BUG_ON(logfs_block(page) != block);
2216 err = __logfs_write_buf(inode, page, 0);
2217 BUG_ON(err);
2218 BUG_ON(PagePrivate(page) || page->private);
2219
2220 logfs_put_write_page(page);
2221 logfs_safe_iput(inode, cookie);
2222}
2223
2224/**
2225 * logfs_inode_write - write inode or dentry objects
2226 *
2227 * @inode: parent inode (ifile or directory)
2228 * @buf: object to write (inode or dentry)
2229 * @count: object size
2230 * @bix: block index
2231 * @flags: write flags
2232 * @shadow_tree: shadow below this inode
2233 *
2234 * FIXME: All callers of this put a 200-300 byte variable on the stack,
2235 * only to call here and do a memcpy from that stack variable. A good
2236 * example of wasted performance and stack space.
2237 */
2238int logfs_inode_write(struct inode *inode, const void *buf, size_t count,
2239 loff_t bix, long flags, struct shadow_tree *shadow_tree)
2240{
2241 loff_t pos = bix << inode->i_sb->s_blocksize_bits;
2242 int err;
2243 struct page *page;
2244 void *pagebuf;
2245
2246 BUG_ON(pos & (LOGFS_BLOCKSIZE-1));
2247 BUG_ON(count > LOGFS_BLOCKSIZE);
2248 page = logfs_get_write_page(inode, bix, 0);
2249 if (!page)
2250 return -ENOMEM;
2251
2252 pagebuf = kmap_atomic(page);
2253 memcpy(pagebuf, buf, count);
2254 flush_dcache_page(page);
2255 kunmap_atomic(pagebuf);
2256
2257 if (i_size_read(inode) < pos + LOGFS_BLOCKSIZE)
2258 i_size_write(inode, pos + LOGFS_BLOCKSIZE);
2259
2260 err = logfs_write_buf(inode, page, flags);
2261 logfs_put_write_page(page);
2262 return err;
2263}
2264
2265int logfs_open_segfile(struct super_block *sb)
2266{
2267 struct logfs_super *super = logfs_super(sb);
2268 struct inode *inode;
2269
2270 inode = logfs_read_meta_inode(sb, LOGFS_INO_SEGFILE);
2271 if (IS_ERR(inode))
2272 return PTR_ERR(inode);
2273 super->s_segfile_inode = inode;
2274 return 0;
2275}
2276
2277int logfs_init_rw(struct super_block *sb)
2278{
2279 struct logfs_super *super = logfs_super(sb);
2280 int min_fill = 3 * super->s_no_blocks;
2281
2282 INIT_LIST_HEAD(&super->s_object_alias);
2283 INIT_LIST_HEAD(&super->s_writeback_list);
2284 mutex_init(&super->s_write_mutex);
2285 super->s_block_pool = mempool_create_kmalloc_pool(min_fill,
2286 sizeof(struct logfs_block));
2287 super->s_shadow_pool = mempool_create_kmalloc_pool(min_fill,
2288 sizeof(struct logfs_shadow));
2289 return 0;
2290}
2291
2292void logfs_cleanup_rw(struct super_block *sb)
2293{
2294 struct logfs_super *super = logfs_super(sb);
2295
2296 logfs_mempool_destroy(super->s_block_pool);
2297 logfs_mempool_destroy(super->s_shadow_pool);
2298}
diff --git a/fs/logfs/segment.c b/fs/logfs/segment.c
deleted file mode 100644
index 1efd6055f4b0..000000000000
--- a/fs/logfs/segment.c
+++ /dev/null
@@ -1,961 +0,0 @@
1/*
2 * fs/logfs/segment.c - Handling the Object Store
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Object store or ostore makes up the complete device with the exception of
9 * the superblock and journal areas. Apart from its own metadata it stores
10 * three kinds of objects: inodes, dentries and blocks, both data and indirect.
11 */
12#include "logfs.h"
13#include <linux/slab.h>
14
15static int logfs_mark_segment_bad(struct super_block *sb, u32 segno)
16{
17 struct logfs_super *super = logfs_super(sb);
18 struct btree_head32 *head = &super->s_reserved_segments;
19 int err;
20
21 err = btree_insert32(head, segno, (void *)1, GFP_NOFS);
22 if (err)
23 return err;
24 logfs_super(sb)->s_bad_segments++;
25 /* FIXME: write to journal */
26 return 0;
27}
28
29int logfs_erase_segment(struct super_block *sb, u32 segno, int ensure_erase)
30{
31 struct logfs_super *super = logfs_super(sb);
32
33 super->s_gec++;
34
35 return super->s_devops->erase(sb, (u64)segno << super->s_segshift,
36 super->s_segsize, ensure_erase);
37}
38
39static s64 logfs_get_free_bytes(struct logfs_area *area, size_t bytes)
40{
41 s32 ofs;
42
43 logfs_open_area(area, bytes);
44
45 ofs = area->a_used_bytes;
46 area->a_used_bytes += bytes;
47 BUG_ON(area->a_used_bytes >= logfs_super(area->a_sb)->s_segsize);
48
49 return dev_ofs(area->a_sb, area->a_segno, ofs);
50}
51
52static struct page *get_mapping_page(struct super_block *sb, pgoff_t index,
53 int use_filler)
54{
55 struct logfs_super *super = logfs_super(sb);
56 struct address_space *mapping = super->s_mapping_inode->i_mapping;
57 filler_t *filler = super->s_devops->readpage;
58 struct page *page;
59
60 BUG_ON(mapping_gfp_constraint(mapping, __GFP_FS));
61 if (use_filler)
62 page = read_cache_page(mapping, index, filler, sb);
63 else {
64 page = find_or_create_page(mapping, index, GFP_NOFS);
65 if (page)
66 unlock_page(page);
67 }
68 return page;
69}
70
71int __logfs_buf_write(struct logfs_area *area, u64 ofs, void *buf, size_t len,
72 int use_filler)
73{
74 pgoff_t index = ofs >> PAGE_SHIFT;
75 struct page *page;
76 long offset = ofs & (PAGE_SIZE-1);
77 long copylen;
78
79 /* Only logfs_wbuf_recover may use len==0 */
80 BUG_ON(!len && !use_filler);
81 do {
82 copylen = min((ulong)len, PAGE_SIZE - offset);
83
84 page = get_mapping_page(area->a_sb, index, use_filler);
85 if (IS_ERR(page))
86 return PTR_ERR(page);
87 BUG_ON(!page); /* FIXME: reserve a pool */
88 SetPageUptodate(page);
89 memcpy(page_address(page) + offset, buf, copylen);
90
91 if (!PagePrivate(page)) {
92 SetPagePrivate(page);
93 get_page(page);
94 }
95 put_page(page);
96
97 buf += copylen;
98 len -= copylen;
99 offset = 0;
100 index++;
101 } while (len);
102 return 0;
103}
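
The copy loop above is the standard walk for writing a byte range across fixed-size pages: only the first chunk starts at a nonzero offset, every later chunk starts at the beginning of its page. The same walk as a freestanding sketch, with a flat buffer standing in for the mapping's page cache:

    #include <assert.h>
    #include <string.h>
    #include <stdint.h>

    #define PGSIZE 4096ul

    /* Copy len bytes from buf to 'dev' starting at ofs, page by page. */
    static void buf_write(uint8_t *dev, uint64_t ofs, const void *buf,
    		      size_t len)
    {
    	uint64_t index = ofs / PGSIZE;
    	size_t offset = ofs % PGSIZE;

    	while (len) {
    		size_t copylen = len < PGSIZE - offset ?
    				 len : PGSIZE - offset;

    		/* the kernel looks up or creates the cache page here */
    		memcpy(dev + index * PGSIZE + offset, buf, copylen);

    		buf = (const uint8_t *)buf + copylen;
    		len -= copylen;
    		offset = 0;	/* later pages start from the beginning */
    		index++;
    	}
    }

    int main(void)
    {
    	static uint8_t dev[3 * PGSIZE];
    	uint8_t src[PGSIZE + 100];

    	memset(src, 0xab, sizeof(src));
    	buf_write(dev, PGSIZE - 50, src, sizeof(src));	/* spans 3 pages */
    	assert(dev[PGSIZE - 50] == 0xab && dev[2 * PGSIZE + 49] == 0xab);
    	return 0;
    }
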
104
105static void pad_partial_page(struct logfs_area *area)
106{
107 struct super_block *sb = area->a_sb;
108 struct page *page;
109 u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
110 pgoff_t index = ofs >> PAGE_SHIFT;
111 long offset = ofs & (PAGE_SIZE-1);
112 u32 len = PAGE_SIZE - offset;
113
114 if (len % PAGE_SIZE) {
115 page = get_mapping_page(sb, index, 0);
116 BUG_ON(!page); /* FIXME: reserve a pool */
117 memset(page_address(page) + offset, 0xff, len);
118 if (!PagePrivate(page)) {
119 SetPagePrivate(page);
120 get_page(page);
121 }
122 put_page(page);
123 }
124}
125
126static void pad_full_pages(struct logfs_area *area)
127{
128 struct super_block *sb = area->a_sb;
129 struct logfs_super *super = logfs_super(sb);
130 u64 ofs = dev_ofs(sb, area->a_segno, area->a_used_bytes);
131 u32 len = super->s_segsize - area->a_used_bytes;
132 pgoff_t index = PAGE_ALIGN(ofs) >> PAGE_SHIFT;
133 pgoff_t no_indizes = len >> PAGE_SHIFT;
134 struct page *page;
135
136 while (no_indizes) {
137 page = get_mapping_page(sb, index, 0);
138 BUG_ON(!page); /* FIXME: reserve a pool */
139 SetPageUptodate(page);
140 memset(page_address(page), 0xff, PAGE_SIZE);
141 if (!PagePrivate(page)) {
142 SetPagePrivate(page);
143 get_page(page);
144 }
145 put_page(page);
146 index++;
147 no_indizes--;
148 }
149}
150
151/*
152 * bdev_writeseg will write full pages. Memset the tail to prevent data leaks.
153 * Also make sure we allocate (and memset) all pages for final writeout.
154 */
155static void pad_wbuf(struct logfs_area *area, int final)
156{
157 pad_partial_page(area);
158 if (final)
159 pad_full_pages(area);
160}
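
The tail is padded with 0xff rather than zeroes because 0xff matches the erased state of flash, so the filler is indistinguishable from never-written space; the point of the memset is to avoid leaking stale page contents to the medium. A sketch of the partial-page step over a flat buffer:

    #include <assert.h>
    #include <string.h>
    #include <stdint.h>

    #define PGSIZE 4096ul

    /* Fill the unused tail of the page containing 'used' bytes with 0xff. */
    static void pad_partial(uint8_t *seg, size_t used)
    {
    	size_t offset = used % PGSIZE;

    	if (offset)	/* nothing to do on a page boundary */
    		memset(seg + used, 0xff, PGSIZE - offset);
    }

    int main(void)
    {
    	static uint8_t seg[2 * PGSIZE];

    	pad_partial(seg, PGSIZE + 10);
    	assert(seg[PGSIZE + 9] == 0 && seg[PGSIZE + 10] == 0xff);
    	assert(seg[2 * PGSIZE - 1] == 0xff);
    	return 0;
    }
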
161
162/*
163 * We have to be careful with the alias tree. Since lookups are done by
164 * bix, the bix needs to be normalized so that 14, 15, 16, etc. all match
165 * when dealing with indirect blocks. So always use the tree through these accessor functions.
166 */
167static void *alias_tree_lookup(struct super_block *sb, u64 ino, u64 bix,
168 level_t level)
169{
170 struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree;
171 pgoff_t index = logfs_pack_index(bix, level);
172
173 return btree_lookup128(head, ino, index);
174}
175
176static int alias_tree_insert(struct super_block *sb, u64 ino, u64 bix,
177 level_t level, void *val)
178{
179 struct btree_head128 *head = &logfs_super(sb)->s_object_alias_tree;
180 pgoff_t index = logfs_pack_index(bix, level);
181
182 return btree_insert128(head, ino, index, val, GFP_NOFS);
183}
184
185static int btree_write_alias(struct super_block *sb, struct logfs_block *block,
186 write_alias_t *write_one_alias)
187{
188 struct object_alias_item *item;
189 int err;
190
191 list_for_each_entry(item, &block->item_list, list) {
192 err = write_alias_journal(sb, block->ino, block->bix,
193 block->level, item->child_no, item->val);
194 if (err)
195 return err;
196 }
197 return 0;
198}
199
200static const struct logfs_block_ops btree_block_ops = {
201 .write_block = btree_write_block,
202 .free_block = __free_block,
203 .write_alias = btree_write_alias,
204};
205
206int logfs_load_object_aliases(struct super_block *sb,
207 struct logfs_obj_alias *oa, int count)
208{
209 struct logfs_super *super = logfs_super(sb);
210 struct logfs_block *block;
211 struct object_alias_item *item;
212 u64 ino, bix;
213 level_t level;
214 int i, err;
215
216 super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS;
217 count /= sizeof(*oa);
218 for (i = 0; i < count; i++) {
219 item = mempool_alloc(super->s_alias_pool, GFP_NOFS);
220 if (!item)
221 return -ENOMEM;
222 memset(item, 0, sizeof(*item));
223
224 super->s_no_object_aliases++;
225 item->val = oa[i].val;
226 item->child_no = be16_to_cpu(oa[i].child_no);
227
228 ino = be64_to_cpu(oa[i].ino);
229 bix = be64_to_cpu(oa[i].bix);
230 level = LEVEL(oa[i].level);
231
232 log_aliases("logfs_load_object_aliases(%llx, %llx, %x, %x) %llx\n",
233 ino, bix, level, item->child_no,
234 be64_to_cpu(item->val));
235 block = alias_tree_lookup(sb, ino, bix, level);
236 if (!block) {
237 block = __alloc_block(sb, ino, bix, level);
238 block->ops = &btree_block_ops;
239 err = alias_tree_insert(sb, ino, bix, level, block);
240 BUG_ON(err); /* mempool empty */
241 }
242 if (test_and_set_bit(item->child_no, block->alias_map)) {
243 printk(KERN_ERR"LogFS: Alias collision detected\n");
244 return -EIO;
245 }
246 list_move_tail(&block->alias_list, &super->s_object_alias);
247 list_add(&item->list, &block->item_list);
248 }
249 return 0;
250}
251
252static void kill_alias(void *_block, unsigned long ignore0,
253 u64 ignore1, u64 ignore2, size_t ignore3)
254{
255 struct logfs_block *block = _block;
256 struct super_block *sb = block->sb;
257 struct logfs_super *super = logfs_super(sb);
258 struct object_alias_item *item;
259
260 while (!list_empty(&block->item_list)) {
261 item = list_entry(block->item_list.next, typeof(*item), list);
262 list_del(&item->list);
263 mempool_free(item, super->s_alias_pool);
264 }
265 block->ops->free_block(sb, block);
266}
267
268static int obj_type(struct inode *inode, level_t level)
269{
270 if (level == 0) {
271 if (S_ISDIR(inode->i_mode))
272 return OBJ_DENTRY;
273 if (inode->i_ino == LOGFS_INO_MASTER)
274 return OBJ_INODE;
275 }
276 return OBJ_BLOCK;
277}
278
279static int obj_len(struct super_block *sb, int obj_type)
280{
281 switch (obj_type) {
282 case OBJ_DENTRY:
283 return sizeof(struct logfs_disk_dentry);
284 case OBJ_INODE:
285 return sizeof(struct logfs_disk_inode);
286 case OBJ_BLOCK:
287 return sb->s_blocksize;
288 default:
289 BUG();
290 }
291}
292
293static int __logfs_segment_write(struct inode *inode, void *buf,
294 struct logfs_shadow *shadow, int type, int len, int compr)
295{
296 struct logfs_area *area;
297 struct super_block *sb = inode->i_sb;
298 s64 ofs;
299 struct logfs_object_header h;
300 int acc_len;
301
302 if (shadow->gc_level == 0)
303 acc_len = len;
304 else
305 acc_len = obj_len(sb, type);
306
307 area = get_area(sb, shadow->gc_level);
308 ofs = logfs_get_free_bytes(area, len + LOGFS_OBJECT_HEADERSIZE);
309 LOGFS_BUG_ON(ofs <= 0, sb);
310 /*
311 * Order is important. logfs_get_free_bytes(), by modifying the
312 * segment file, may modify the content of the very page we're about
313 * to write now. That is fine, as long as the calculated crc and the
314 * written data still match. So do the modifications _before_
315 * calculating the crc.
316 */
317
318 h.len = cpu_to_be16(len);
319 h.type = type;
320 h.compr = compr;
321 h.ino = cpu_to_be64(inode->i_ino);
322 h.bix = cpu_to_be64(shadow->bix);
323 h.crc = logfs_crc32(&h, sizeof(h) - 4, 4);
324 h.data_crc = logfs_crc32(buf, len, 0);
325
326 logfs_buf_write(area, ofs, &h, sizeof(h));
327 logfs_buf_write(area, ofs + LOGFS_OBJECT_HEADERSIZE, buf, len);
328
329 shadow->new_ofs = ofs;
330 shadow->new_len = acc_len + LOGFS_OBJECT_HEADERSIZE;
331
332 return 0;
333}
334
335static s64 logfs_segment_write_compress(struct inode *inode, void *buf,
336 struct logfs_shadow *shadow, int type, int len)
337{
338 struct super_block *sb = inode->i_sb;
339 void *compressor_buf = logfs_super(sb)->s_compressed_je;
340 ssize_t compr_len;
341 int ret;
342
343 mutex_lock(&logfs_super(sb)->s_journal_mutex);
344 compr_len = logfs_compress(buf, compressor_buf, len, len);
345
346 if (compr_len >= 0) {
347 ret = __logfs_segment_write(inode, compressor_buf, shadow,
348 type, compr_len, COMPR_ZLIB);
349 } else {
350 ret = __logfs_segment_write(inode, buf, shadow, type, len,
351 COMPR_NONE);
352 }
353 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
354 return ret;
355}
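
logfs_compress() is handed an output budget equal to the input length, so it fails on data that would not shrink, and the object then goes out raw as COMPR_NONE. That compress-or-fall-back shape in a self-contained sketch; toy_compress() is a stand-in, not the kernel's zlib wrapper:

    #include <stdio.h>
    #include <string.h>

    /* Toy "compressor": only wins on all-equal input, else reports no fit. */
    static long toy_compress(const void *in, void *out, size_t len,
    			 size_t budget)
    {
    	const unsigned char *p = in;
    	size_t i;

    	for (i = 1; i < len; i++)
    		if (p[i] != p[0])
    			return -1;	/* would not fit the budget */
    	if (budget < 2)
    		return -1;
    	((unsigned char *)out)[0] = p[0];
    	((unsigned char *)out)[1] = 0;	/* toy header byte */
    	return 2;
    }

    int main(void)
    {
    	char buf[64], out[64];
    	long n;

    	memset(buf, 'x', sizeof(buf));
    	n = toy_compress(buf, out, sizeof(buf), sizeof(buf));
    	if (n >= 0)
    		printf("stored compressed, %ld bytes\n", n);
    	else
    		printf("stored raw, %zu bytes\n", sizeof(buf));
    	return 0;
    }
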
356
357/**
358 * logfs_segment_write - write data block to object store
359 * @inode: inode containing data
 * @page: page holding the block contents
 * @shadow: shadow entry recording the old and new object locations
360 *
361 * Returns an errno or zero.
362 */
363int logfs_segment_write(struct inode *inode, struct page *page,
364 struct logfs_shadow *shadow)
365{
366 struct super_block *sb = inode->i_sb;
367 struct logfs_super *super = logfs_super(sb);
368 int do_compress, type, len;
369 int ret;
370 void *buf;
371
372 super->s_flags |= LOGFS_SB_FLAG_DIRTY;
373 BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
374 do_compress = logfs_inode(inode)->li_flags & LOGFS_IF_COMPRESSED;
375 if (shadow->gc_level != 0) {
376 /* temporarily disable compression for indirect blocks */
377 do_compress = 0;
378 }
379
380 type = obj_type(inode, shrink_level(shadow->gc_level));
381 len = obj_len(sb, type);
382 buf = kmap(page);
383 if (do_compress)
384 ret = logfs_segment_write_compress(inode, buf, shadow, type,
385 len);
386 else
387 ret = __logfs_segment_write(inode, buf, shadow, type, len,
388 COMPR_NONE);
389 kunmap(page);
390
391 log_segment("logfs_segment_write(%llx, %llx, %x) %llx->%llx %x->%x\n",
392 shadow->ino, shadow->bix, shadow->gc_level,
393 shadow->old_ofs, shadow->new_ofs,
394 shadow->old_len, shadow->new_len);
395 /* This BUG_ON did catch a locking bug once; it remains useful. */
396 BUG_ON(!(shadow->new_ofs & (super->s_segsize - 1)));
397 return ret;
398}
399
400int wbuf_read(struct super_block *sb, u64 ofs, size_t len, void *buf)
401{
402 pgoff_t index = ofs >> PAGE_SHIFT;
403 struct page *page;
404 long offset = ofs & (PAGE_SIZE-1);
405 long copylen;
406
407 while (len) {
408 copylen = min((ulong)len, PAGE_SIZE - offset);
409
410 page = get_mapping_page(sb, index, 1);
411 if (IS_ERR(page))
412 return PTR_ERR(page);
413 memcpy(buf, page_address(page) + offset, copylen);
414 put_page(page);
415
416 buf += copylen;
417 len -= copylen;
418 offset = 0;
419 index++;
420 }
421 return 0;
422}
423
424/*
425 * The "position" of an indirect block is ambiguous: it can be the position
426 * of any data block somewhere beneath that indirect block. So we need to
427 * normalize the positions through logfs_block_mask() before comparing.
428 */
429static int check_pos(struct super_block *sb, u64 pos1, u64 pos2, level_t level)
430{
431 return (pos1 & logfs_block_mask(sb, level)) !=
432 (pos2 & logfs_block_mask(sb, level));
433}
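
This echoes the alias-tree comment earlier: an indirect block at level N covers a whole aligned range of block indices, so any bix inside that range names the same block and both sides must be masked before comparing. A hedged sketch with an assumed fan-out of 512 pointers (9 bits) per level; the real logfs_block_mask() derives this from the filesystem geometry:

    #include <assert.h>
    #include <stdint.h>

    #define INDEX_BITS 9	/* assumed: 512 pointers per indirect block */

    /* Mask off the index bits resolved below 'level'. */
    static uint64_t block_mask(int level)
    {
    	return ~(((uint64_t)1 << (INDEX_BITS * level)) - 1);
    }

    static int same_block(uint64_t bix1, uint64_t bix2, int level)
    {
    	return (bix1 & block_mask(level)) == (bix2 & block_mask(level));
    }

    int main(void)
    {
    	/* 14, 15, 16 all live under the same level-1 indirect block... */
    	assert(same_block(14, 15, 1));
    	assert(same_block(14, 16, 1));
    	/* ...but not under the same level-0 (data) block */
    	assert(!same_block(14, 15, 0));
    	return 0;
    }
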
434
435#if 0
436static int read_seg_header(struct super_block *sb, u64 ofs,
437 struct logfs_segment_header *sh)
438{
439 __be32 crc;
440 int err;
441
442 err = wbuf_read(sb, ofs, sizeof(*sh), sh);
443 if (err)
444 return err;
445 crc = logfs_crc32(sh, sizeof(*sh), 4);
446 if (crc != sh->crc) {
447 printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, "
448 "got %x\n", ofs, be32_to_cpu(sh->crc),
449 be32_to_cpu(crc));
450 return -EIO;
451 }
452 return 0;
453}
454#endif
455
456static int read_obj_header(struct super_block *sb, u64 ofs,
457 struct logfs_object_header *oh)
458{
459 __be32 crc;
460 int err;
461
462 err = wbuf_read(sb, ofs, sizeof(*oh), oh);
463 if (err)
464 return err;
465 crc = logfs_crc32(oh, sizeof(*oh) - 4, 4);
466 if (crc != oh->crc) {
467 printk(KERN_ERR"LOGFS: header crc error at %llx: expected %x, "
468 "got %x\n", ofs, be32_to_cpu(oh->crc),
469 be32_to_cpu(crc));
470 return -EIO;
471 }
472 return 0;
473}
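
Both header readers validate a checksum stored inside the structure it protects; judging by its uses here, logfs_crc32(p, len, skip) hashes len - skip bytes starting at p + skip, so the leading 4-byte crc field is excluded from its own coverage. A sketch of that self-checking layout, with a toy checksum standing in for crc32:

    #include <assert.h>
    #include <stdint.h>
    #include <string.h>

    struct hdr {
    	uint32_t crc;	/* covers everything after this field */
    	uint16_t len;
    	uint8_t type;
    	uint8_t pad;
    	uint64_t ino;
    };

    /* Stand-in checksum; the kernel uses crc32 here. */
    static uint32_t toy_sum(const void *p, size_t len, size_t skip)
    {
    	const uint8_t *b = (const uint8_t *)p + skip;
    	uint32_t sum = 0;
    	size_t i;

    	for (i = 0; i < len - skip; i++)
    		sum = sum * 31 + b[i];
    	return sum;
    }

    int main(void)
    {
    	struct hdr h = { .len = 42, .type = 1, .ino = 7 };

    	h.crc = toy_sum(&h, sizeof(h), 4);	/* skip the crc itself */
    	assert(h.crc == toy_sum(&h, sizeof(h), 4));

    	h.len = 43;				/* corrupt a field */
    	assert(h.crc != toy_sum(&h, sizeof(h), 4));
    	return 0;
    }
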
474
475static void move_btree_to_page(struct inode *inode, struct page *page,
476 __be64 *data)
477{
478 struct super_block *sb = inode->i_sb;
479 struct logfs_super *super = logfs_super(sb);
480 struct btree_head128 *head = &super->s_object_alias_tree;
481 struct logfs_block *block;
482 struct object_alias_item *item, *next;
483
484 if (!(super->s_flags & LOGFS_SB_FLAG_OBJ_ALIAS))
485 return;
486
487 block = btree_remove128(head, inode->i_ino, page->index);
488 if (!block)
489 return;
490
491 log_blockmove("move_btree_to_page(%llx, %llx, %x)\n",
492 block->ino, block->bix, block->level);
493 list_for_each_entry_safe(item, next, &block->item_list, list) {
494 data[item->child_no] = item->val;
495 list_del(&item->list);
496 mempool_free(item, super->s_alias_pool);
497 }
498 block->page = page;
499
500 if (!PagePrivate(page)) {
501 SetPagePrivate(page);
502 get_page(page);
503 set_page_private(page, (unsigned long) block);
504 }
505 block->ops = &indirect_block_ops;
506 initialize_block_counters(page, block, data, 0);
507}
508
509/*
510 * This silences a false, yet annoying gcc warning. I hate it when my editor
511 * jumps into bitops.h each time I recompile this file.
512 * TODO: Complain to gcc folks about this and upgrade compiler.
513 */
514static unsigned long fnb(const unsigned long *addr,
515 unsigned long size, unsigned long offset)
516{
517 return find_next_bit(addr, size, offset);
518}
519
520void move_page_to_btree(struct page *page)
521{
522 struct logfs_block *block = logfs_block(page);
523 struct super_block *sb = block->sb;
524 struct logfs_super *super = logfs_super(sb);
525 struct object_alias_item *item;
526 unsigned long pos;
527 __be64 *child;
528 int err;
529
530 if (super->s_flags & LOGFS_SB_FLAG_SHUTDOWN) {
531 block->ops->free_block(sb, block);
532 return;
533 }
534 log_blockmove("move_page_to_btree(%llx, %llx, %x)\n",
535 block->ino, block->bix, block->level);
536 super->s_flags |= LOGFS_SB_FLAG_OBJ_ALIAS;
537
538 for (pos = 0; ; pos++) {
539 pos = fnb(block->alias_map, LOGFS_BLOCK_FACTOR, pos);
540 if (pos >= LOGFS_BLOCK_FACTOR)
541 break;
542
543 item = mempool_alloc(super->s_alias_pool, GFP_NOFS);
544 BUG_ON(!item); /* mempool empty */
545 memset(item, 0, sizeof(*item));
546
547 child = kmap_atomic(page);
548 item->val = child[pos];
549 kunmap_atomic(child);
550 item->child_no = pos;
551 list_add(&item->list, &block->item_list);
552 }
553 block->page = NULL;
554
555 if (PagePrivate(page)) {
556 ClearPagePrivate(page);
557 put_page(page);
558 set_page_private(page, 0);
559 }
560 block->ops = &btree_block_ops;
561 err = alias_tree_insert(block->sb, block->ino, block->bix, block->level,
562 block);
563 BUG_ON(err); /* mempool empty */
564 ClearPageUptodate(page);
565}
566
567static int __logfs_segment_read(struct inode *inode, void *buf,
568 u64 ofs, u64 bix, level_t level)
569{
570 struct super_block *sb = inode->i_sb;
571 void *compressor_buf = logfs_super(sb)->s_compressed_je;
572 struct logfs_object_header oh;
573 __be32 crc;
574 u16 len;
575 int err, block_len;
576
577 block_len = obj_len(sb, obj_type(inode, level));
578 err = read_obj_header(sb, ofs, &oh);
579 if (err)
580 goto out_err;
581
582 err = -EIO;
583 if (be64_to_cpu(oh.ino) != inode->i_ino
584 || check_pos(sb, be64_to_cpu(oh.bix), bix, level)) {
585 printk(KERN_ERR"LOGFS: (ino, bix) don't match at %llx: "
586 "expected (%lx, %llx), got (%llx, %llx)\n",
587 ofs, inode->i_ino, bix,
588 be64_to_cpu(oh.ino), be64_to_cpu(oh.bix));
589 goto out_err;
590 }
591
592 len = be16_to_cpu(oh.len);
593
594 switch (oh.compr) {
595 case COMPR_NONE:
596 err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len, buf);
597 if (err)
598 goto out_err;
599 crc = logfs_crc32(buf, len, 0);
600 if (crc != oh.data_crc) {
601 printk(KERN_ERR"LOGFS: uncompressed data crc error at "
602 "%llx: expected %x, got %x\n", ofs,
603 be32_to_cpu(oh.data_crc),
604 be32_to_cpu(crc));
605 goto out_err;
606 }
607 break;
608 case COMPR_ZLIB:
609 mutex_lock(&logfs_super(sb)->s_journal_mutex);
610 err = wbuf_read(sb, ofs + LOGFS_OBJECT_HEADERSIZE, len,
611 compressor_buf);
612 if (err) {
613 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
614 goto out_err;
615 }
616 crc = logfs_crc32(compressor_buf, len, 0);
617 if (crc != oh.data_crc) {
618 printk(KERN_ERR"LOGFS: compressed data crc error at "
619 "%llx: expected %x, got %x\n", ofs,
620 be32_to_cpu(oh.data_crc),
621 be32_to_cpu(crc));
622 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
623 goto out_err;
624 }
625 err = logfs_uncompress(compressor_buf, buf, len, block_len);
626 mutex_unlock(&logfs_super(sb)->s_journal_mutex);
627 if (err) {
628 printk(KERN_ERR"LOGFS: uncompress error at %llx\n", ofs);
629 goto out_err;
630 }
631 break;
632 default:
633 LOGFS_BUG(sb);
634 err = -EIO;
635 goto out_err;
636 }
637 return 0;
638
639out_err:
640 logfs_set_ro(sb);
641 printk(KERN_ERR"LOGFS: device is read-only now\n");
642 LOGFS_BUG(sb);
643 return err;
644}
645
646/**
647 * logfs_segment_read - read data block from object store
648 * @inode: inode containing data
649 * @buf: data buffer
650 * @ofs: physical data offset
651 * @bix: block index
652 * @level: block level
653 *
654 * Returns 0 on success or a negative errno.
655 */
656int logfs_segment_read(struct inode *inode, struct page *page,
657 u64 ofs, u64 bix, level_t level)
658{
659 int err;
660 void *buf;
661
662 if (PageUptodate(page))
663 return 0;
664
665 ofs &= ~LOGFS_FULLY_POPULATED;
666
667 buf = kmap(page);
668 err = __logfs_segment_read(inode, buf, ofs, bix, level);
669 if (!err) {
670 move_btree_to_page(inode, page, buf);
671 SetPageUptodate(page);
672 }
673 kunmap(page);
674 log_segment("logfs_segment_read(%lx, %llx, %x) %llx (%d)\n",
675 inode->i_ino, bix, level, ofs, err);
676 return err;
677}
678
679int logfs_segment_delete(struct inode *inode, struct logfs_shadow *shadow)
680{
681 struct super_block *sb = inode->i_sb;
682 struct logfs_super *super = logfs_super(sb);
683 struct logfs_object_header h;
684 u16 len;
685 int err;
686
687 super->s_flags |= LOGFS_SB_FLAG_DIRTY;
688 BUG_ON(super->s_flags & LOGFS_SB_FLAG_SHUTDOWN);
689 BUG_ON(shadow->old_ofs & LOGFS_FULLY_POPULATED);
690 if (!shadow->old_ofs)
691 return 0;
692
693 log_segment("logfs_segment_delete(%llx, %llx, %x) %llx->%llx %x->%x\n",
694 shadow->ino, shadow->bix, shadow->gc_level,
695 shadow->old_ofs, shadow->new_ofs,
696 shadow->old_len, shadow->new_len);
697 err = read_obj_header(sb, shadow->old_ofs, &h);
698 LOGFS_BUG_ON(err, sb);
699 LOGFS_BUG_ON(be64_to_cpu(h.ino) != inode->i_ino, sb);
700 LOGFS_BUG_ON(check_pos(sb, shadow->bix, be64_to_cpu(h.bix),
701 shrink_level(shadow->gc_level)), sb);
702
703 if (shadow->gc_level == 0)
704 len = be16_to_cpu(h.len);
705 else
706 len = obj_len(sb, h.type);
707 shadow->old_len = len + sizeof(h);
708 return 0;
709}
710
711void freeseg(struct super_block *sb, u32 segno)
712{
713 struct logfs_super *super = logfs_super(sb);
714 struct address_space *mapping = super->s_mapping_inode->i_mapping;
715 struct page *page;
716 u64 ofs, start, end;
717
718 start = dev_ofs(sb, segno, 0);
719 end = dev_ofs(sb, segno + 1, 0);
720 for (ofs = start; ofs < end; ofs += PAGE_SIZE) {
721 page = find_get_page(mapping, ofs >> PAGE_SHIFT);
722 if (!page)
723 continue;
724 if (PagePrivate(page)) {
725 ClearPagePrivate(page);
726 put_page(page);
727 }
728 put_page(page);
729 }
730}
731
732int logfs_open_area(struct logfs_area *area, size_t bytes)
733{
734 struct super_block *sb = area->a_sb;
735 struct logfs_super *super = logfs_super(sb);
736 int err, closed = 0;
737
738 if (area->a_is_open && area->a_used_bytes + bytes <= super->s_segsize)
739 return 0;
740
741 if (area->a_is_open) {
742 u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
743 u32 len = super->s_segsize - area->a_written_bytes;
744
745 log_gc("logfs_close_area(%x)\n", area->a_segno);
746 pad_wbuf(area, 1);
747 super->s_devops->writeseg(area->a_sb, ofs, len);
748 freeseg(sb, area->a_segno);
749 closed = 1;
750 }
751
752 area->a_used_bytes = 0;
753 area->a_written_bytes = 0;
754again:
755 area->a_ops->get_free_segment(area);
756 area->a_ops->get_erase_count(area);
757
758 log_gc("logfs_open_area(%x, %x)\n", area->a_segno, area->a_level);
759 err = area->a_ops->erase_segment(area);
760 if (err) {
761 printk(KERN_WARNING "LogFS: Error erasing segment %x\n",
762 area->a_segno);
763 logfs_mark_segment_bad(sb, area->a_segno);
764 goto again;
765 }
766 area->a_is_open = 1;
767 return closed;
768}
769
770void logfs_sync_area(struct logfs_area *area)
771{
772 struct super_block *sb = area->a_sb;
773 struct logfs_super *super = logfs_super(sb);
774 u64 ofs = dev_ofs(sb, area->a_segno, area->a_written_bytes);
775 u32 len = (area->a_used_bytes - area->a_written_bytes);
776
777 if (super->s_writesize)
778 len &= ~(super->s_writesize - 1);
779 if (len == 0)
780 return;
781 pad_wbuf(area, 0);
782 super->s_devops->writeseg(sb, ofs, len);
783 area->a_written_bytes += len;
784}
785
786void logfs_sync_segments(struct super_block *sb)
787{
788 struct logfs_super *super = logfs_super(sb);
789 int i;
790
791 for_each_area(i)
792 logfs_sync_area(super->s_area[i]);
793}
794
795/*
796 * Pick a free segment to be used for this area. Effectively takes a
797 * candidate from the free list (once picked, it is no longer a candidate).
798 */
799static void ostore_get_free_segment(struct logfs_area *area)
800{
801 struct super_block *sb = area->a_sb;
802 struct logfs_super *super = logfs_super(sb);
803
804 if (super->s_free_list.count == 0) {
805 printk(KERN_ERR"LOGFS: ran out of free segments\n");
806 LOGFS_BUG(sb);
807 }
808
809 area->a_segno = get_best_cand(sb, &super->s_free_list, NULL);
810}
811
812static void ostore_get_erase_count(struct logfs_area *area)
813{
814 struct logfs_segment_entry se;
815 u32 ec_level;
816
817 logfs_get_segment_entry(area->a_sb, area->a_segno, &se);
818 BUG_ON(se.ec_level == cpu_to_be32(BADSEG) ||
819 se.valid == cpu_to_be32(RESERVED));
820
821 ec_level = be32_to_cpu(se.ec_level);
822 area->a_erase_count = (ec_level >> 4) + 1;
823}
824
825static int ostore_erase_segment(struct logfs_area *area)
826{
827 struct super_block *sb = area->a_sb;
828 struct logfs_segment_header sh;
829 u64 ofs;
830 int err;
831
832 err = logfs_erase_segment(sb, area->a_segno, 0);
833 if (err)
834 return err;
835
836 sh.pad = 0;
837 sh.type = SEG_OSTORE;
838 sh.level = (__force u8)area->a_level;
839 sh.segno = cpu_to_be32(area->a_segno);
840 sh.ec = cpu_to_be32(area->a_erase_count);
841 sh.gec = cpu_to_be64(logfs_super(sb)->s_gec);
842 sh.crc = logfs_crc32(&sh, sizeof(sh), 4);
843
844 logfs_set_segment_erased(sb, area->a_segno, area->a_erase_count,
845 area->a_level);
846
847 ofs = dev_ofs(sb, area->a_segno, 0);
848 area->a_used_bytes = sizeof(sh);
849 logfs_buf_write(area, ofs, &sh, sizeof(sh));
850 return 0;
851}
852
853static const struct logfs_area_ops ostore_area_ops = {
854 .get_free_segment = ostore_get_free_segment,
855 .get_erase_count = ostore_get_erase_count,
856 .erase_segment = ostore_erase_segment,
857};
858
859static void free_area(struct logfs_area *area)
860{
861 if (area)
862 freeseg(area->a_sb, area->a_segno);
863 kfree(area);
864}
865
866void free_areas(struct super_block *sb)
867{
868 struct logfs_super *super = logfs_super(sb);
869 int i;
870
871 for_each_area(i)
872 free_area(super->s_area[i]);
873 free_area(super->s_journal_area);
874}
875
876static struct logfs_area *alloc_area(struct super_block *sb)
877{
878 struct logfs_area *area;
879
880 area = kzalloc(sizeof(*area), GFP_KERNEL);
881 if (!area)
882 return NULL;
883
884 area->a_sb = sb;
885 return area;
886}
887
888static void map_invalidatepage(struct page *page, unsigned int o,
889 unsigned int l)
890{
891 return;
892}
893
894static int map_releasepage(struct page *page, gfp_t g)
895{
896 /* Don't release these pages */
897 return 0;
898}
899
900static const struct address_space_operations mapping_aops = {
901 .invalidatepage = map_invalidatepage,
902 .releasepage = map_releasepage,
903 .set_page_dirty = __set_page_dirty_nobuffers,
904};
905
906int logfs_init_mapping(struct super_block *sb)
907{
908 struct logfs_super *super = logfs_super(sb);
909 struct address_space *mapping;
910 struct inode *inode;
911
912 inode = logfs_new_meta_inode(sb, LOGFS_INO_MAPPING);
913 if (IS_ERR(inode))
914 return PTR_ERR(inode);
915 super->s_mapping_inode = inode;
916 mapping = inode->i_mapping;
917 mapping->a_ops = &mapping_aops;
918 /* Would it be possible to use __GFP_HIGHMEM as well? */
919 mapping_set_gfp_mask(mapping, GFP_NOFS);
920 return 0;
921}
922
923int logfs_init_areas(struct super_block *sb)
924{
925 struct logfs_super *super = logfs_super(sb);
926 int i = -1;
927
928 super->s_alias_pool = mempool_create_kmalloc_pool(600,
929 sizeof(struct object_alias_item));
930 if (!super->s_alias_pool)
931 return -ENOMEM;
932
933 super->s_journal_area = alloc_area(sb);
934 if (!super->s_journal_area)
935 goto err;
936
937 for_each_area(i) {
938 super->s_area[i] = alloc_area(sb);
939 if (!super->s_area[i])
940 goto err;
941 super->s_area[i]->a_level = GC_LEVEL(i);
942 super->s_area[i]->a_ops = &ostore_area_ops;
943 }
944 btree_init_mempool128(&super->s_object_alias_tree,
945 super->s_btree_pool);
946 return 0;
947
948err:
949 for (i--; i >= 0; i--)
950 free_area(super->s_area[i]);
951 free_area(super->s_journal_area);
952 logfs_mempool_destroy(super->s_alias_pool);
953 return -ENOMEM;
954}
955
956void logfs_cleanup_areas(struct super_block *sb)
957{
958 struct logfs_super *super = logfs_super(sb);
959
960 btree_grim_visitor128(&super->s_object_alias_tree, 0, kill_alias);
961}
diff --git a/fs/logfs/super.c b/fs/logfs/super.c
deleted file mode 100644
index 5751082dba52..000000000000
--- a/fs/logfs/super.c
+++ /dev/null
@@ -1,653 +0,0 @@
1/*
2 * fs/logfs/super.c
3 *
4 * As should be obvious for Linux kernel code, license is GPLv2
5 *
6 * Copyright (c) 2005-2008 Joern Engel <joern@logfs.org>
7 *
8 * Generally contains mount/umount code and also serves as a dump area for
9 * any functions that don't fit elsewhere and don't justify a file of their
10 * own.
11 */
12#include "logfs.h"
13#include <linux/bio.h>
14#include <linux/slab.h>
15#include <linux/blkdev.h>
16#include <linux/module.h>
17#include <linux/mtd/mtd.h>
18#include <linux/statfs.h>
19#include <linux/buffer_head.h>
20
21static DEFINE_MUTEX(emergency_mutex);
22static struct page *emergency_page;
23
24struct page *emergency_read_begin(struct address_space *mapping, pgoff_t index)
25{
26 filler_t *filler = (filler_t *)mapping->a_ops->readpage;
27 struct page *page;
28 int err;
29
30 page = read_cache_page(mapping, index, filler, NULL);
31 if (page)
32 return page;
33
34 /* No more pages available, switch to emergency page */
35 printk(KERN_INFO"Logfs: Using emergency page\n");
36 mutex_lock(&emergency_mutex);
37 err = filler(NULL, emergency_page);
38 if (err) {
39 mutex_unlock(&emergency_mutex);
40 printk(KERN_EMERG"Logfs: Error reading emergency page\n");
41 return ERR_PTR(err);
42 }
43 return emergency_page;
44}
45
46void emergency_read_end(struct page *page)
47{
48 if (page == emergency_page)
49 mutex_unlock(&emergency_mutex);
50 else
51 put_page(page);
52}
53
54static void dump_segfile(struct super_block *sb)
55{
56 struct logfs_super *super = logfs_super(sb);
57 struct logfs_segment_entry se;
58 u32 segno;
59
60 for (segno = 0; segno < super->s_no_segs; segno++) {
61 logfs_get_segment_entry(sb, segno, &se);
62 printk("%3x: %6x %8x", segno, be32_to_cpu(se.ec_level),
63 be32_to_cpu(se.valid));
64 if (++segno < super->s_no_segs) {
65 logfs_get_segment_entry(sb, segno, &se);
66 printk(" %6x %8x", be32_to_cpu(se.ec_level),
67 be32_to_cpu(se.valid));
68 }
69 if (++segno < super->s_no_segs) {
70 logfs_get_segment_entry(sb, segno, &se);
71 printk(" %6x %8x", be32_to_cpu(se.ec_level),
72 be32_to_cpu(se.valid));
73 }
74 if (++segno < super->s_no_segs) {
75 logfs_get_segment_entry(sb, segno, &se);
76 printk(" %6x %8x", be32_to_cpu(se.ec_level),
77 be32_to_cpu(se.valid));
78 }
79 printk("\n");
80 }
81}
82
83/*
84 * logfs_crash_dump - dump debug information to device
85 *
86 * The LogFS superblock only occupies part of a segment. This function will
87 * write as much debug information as it can gather into the spare space.
88 */
89void logfs_crash_dump(struct super_block *sb)
90{
91 dump_segfile(sb);
92}
93
94/*
95 * FIXME: There should be a reserve for root, similar to ext2.
96 */
97int logfs_statfs(struct dentry *dentry, struct kstatfs *stats)
98{
99 struct super_block *sb = dentry->d_sb;
100 struct logfs_super *super = logfs_super(sb);
101
102 stats->f_type = LOGFS_MAGIC_U32;
103 stats->f_bsize = sb->s_blocksize;
104 stats->f_blocks = super->s_size >> LOGFS_BLOCK_BITS >> 3;
105 stats->f_bfree = super->s_free_bytes >> sb->s_blocksize_bits;
106 stats->f_bavail = super->s_free_bytes >> sb->s_blocksize_bits;
107 stats->f_files = 0;
108 stats->f_ffree = 0;
109 stats->f_namelen = LOGFS_MAX_NAMELEN;
110 return 0;
111}
112
113static int logfs_sb_set(struct super_block *sb, void *_super)
114{
115 struct logfs_super *super = _super;
116
117 sb->s_fs_info = super;
118 sb->s_mtd = super->s_mtd;
119 sb->s_bdev = super->s_bdev;
120#ifdef CONFIG_BLOCK
121 if (sb->s_bdev)
122 sb->s_bdi = &bdev_get_queue(sb->s_bdev)->backing_dev_info;
123#endif
124#ifdef CONFIG_MTD
125 if (sb->s_mtd)
126 sb->s_bdi = sb->s_mtd->backing_dev_info;
127#endif
128 return 0;
129}
130
131static int logfs_sb_test(struct super_block *sb, void *_super)
132{
133 struct logfs_super *super = _super;
134 struct mtd_info *mtd = super->s_mtd;
135
136 if (mtd && sb->s_mtd == mtd)
137 return 1;
138 if (super->s_bdev && sb->s_bdev == super->s_bdev)
139 return 1;
140 return 0;
141}
142
143static void set_segment_header(struct logfs_segment_header *sh, u8 type,
144 u8 level, u32 segno, u32 ec)
145{
146 sh->pad = 0;
147 sh->type = type;
148 sh->level = level;
149 sh->segno = cpu_to_be32(segno);
150 sh->ec = cpu_to_be32(ec);
151 sh->gec = cpu_to_be64(segno);
152 sh->crc = logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4);
153}
154
155static void logfs_write_ds(struct super_block *sb, struct logfs_disk_super *ds,
156 u32 segno, u32 ec)
157{
158 struct logfs_super *super = logfs_super(sb);
159 struct logfs_segment_header *sh = &ds->ds_sh;
160 int i;
161
162 memset(ds, 0, sizeof(*ds));
163 set_segment_header(sh, SEG_SUPER, 0, segno, ec);
164
165 ds->ds_ifile_levels = super->s_ifile_levels;
166 ds->ds_iblock_levels = super->s_iblock_levels;
167 ds->ds_data_levels = super->s_data_levels; /* XXX: Remove */
168 ds->ds_segment_shift = super->s_segshift;
169 ds->ds_block_shift = sb->s_blocksize_bits;
170 ds->ds_write_shift = super->s_writeshift;
171 ds->ds_filesystem_size = cpu_to_be64(super->s_size);
172 ds->ds_segment_size = cpu_to_be32(super->s_segsize);
173 ds->ds_bad_seg_reserve = cpu_to_be32(super->s_bad_seg_reserve);
174 ds->ds_feature_incompat = cpu_to_be64(super->s_feature_incompat);
175 ds->ds_feature_ro_compat= cpu_to_be64(super->s_feature_ro_compat);
176 ds->ds_feature_compat = cpu_to_be64(super->s_feature_compat);
177 ds->ds_feature_flags = cpu_to_be64(super->s_feature_flags);
178 ds->ds_root_reserve = cpu_to_be64(super->s_root_reserve);
179 ds->ds_speed_reserve = cpu_to_be64(super->s_speed_reserve);
180 journal_for_each(i)
181 ds->ds_journal_seg[i] = cpu_to_be32(super->s_journal_seg[i]);
182 ds->ds_magic = cpu_to_be64(LOGFS_MAGIC);
183 ds->ds_crc = logfs_crc32(ds, sizeof(*ds),
184 LOGFS_SEGMENT_HEADERSIZE + 12);
185}
186
187static int write_one_sb(struct super_block *sb,
188 struct page *(*find_sb)(struct super_block *sb, u64 *ofs))
189{
190 struct logfs_super *super = logfs_super(sb);
191 struct logfs_disk_super *ds;
192 struct logfs_segment_entry se;
193 struct page *page;
194 u64 ofs;
195 u32 ec, segno;
196 int err;
197
198 page = find_sb(sb, &ofs);
199 if (!page)
200 return -EIO;
201 ds = page_address(page);
202 segno = seg_no(sb, ofs);
203 logfs_get_segment_entry(sb, segno, &se);
204 ec = be32_to_cpu(se.ec_level) >> 4;
205 ec++;
206 logfs_set_segment_erased(sb, segno, ec, 0);
207 logfs_write_ds(sb, ds, segno, ec);
208 err = super->s_devops->write_sb(sb, page);
209 put_page(page);
210 return err;
211}
212
213int logfs_write_sb(struct super_block *sb)
214{
215 struct logfs_super *super = logfs_super(sb);
216 int err;
217
218 /* First superblock */
219 err = write_one_sb(sb, super->s_devops->find_first_sb);
220 if (err)
221 return err;
222
223 /* Last superblock */
224 err = write_one_sb(sb, super->s_devops->find_last_sb);
225 if (err)
226 return err;
227 return 0;
228}
229
230static int ds_cmp(const void *ds0, const void *ds1)
231{
232 size_t len = sizeof(struct logfs_disk_super);
233
234 /* We know the segment headers differ, so ignore them */
235 len -= LOGFS_SEGMENT_HEADERSIZE;
236 ds0 += LOGFS_SEGMENT_HEADERSIZE;
237 ds1 += LOGFS_SEGMENT_HEADERSIZE;
238 return memcmp(ds0, ds1, len);
239}
240
241static int logfs_recover_sb(struct super_block *sb)
242{
243 struct logfs_super *super = logfs_super(sb);
244 struct logfs_disk_super _ds0, *ds0 = &_ds0;
245 struct logfs_disk_super _ds1, *ds1 = &_ds1;
246 int err, valid0, valid1;
247
248 /* read first superblock */
249 err = wbuf_read(sb, super->s_sb_ofs[0], sizeof(*ds0), ds0);
250 if (err)
251 return err;
252 /* read last superblock */
253 err = wbuf_read(sb, super->s_sb_ofs[1], sizeof(*ds1), ds1);
254 if (err)
255 return err;
256 valid0 = logfs_check_ds(ds0) == 0;
257 valid1 = logfs_check_ds(ds1) == 0;
258
259 if (!valid0 && valid1) {
260 printk(KERN_INFO"First superblock is invalid - fixing.\n");
261 return write_one_sb(sb, super->s_devops->find_first_sb);
262 }
263 if (valid0 && !valid1) {
264 printk(KERN_INFO"Last superblock is invalid - fixing.\n");
265 return write_one_sb(sb, super->s_devops->find_last_sb);
266 }
267 if (valid0 && valid1 && ds_cmp(ds0, ds1)) {
268 printk(KERN_INFO"Superblocks don't match - fixing.\n");
269 return logfs_write_sb(sb);
270 }
271 /* If neither is valid now, something's wrong. Didn't we properly
272 * check them before?!? */
273 BUG_ON(!valid0 && !valid1);
274 return 0;
275}
276
277static int logfs_make_writeable(struct super_block *sb)
278{
279 int err;
280
281 err = logfs_open_segfile(sb);
282 if (err)
283 return err;
284
285 /* Repair any broken superblock copies */
286 err = logfs_recover_sb(sb);
287 if (err)
288 return err;
289
290 /* Check areas for trailing unaccounted data */
291 err = logfs_check_areas(sb);
292 if (err)
293 return err;
294
295 /* Do one GC pass before any data gets dirtied */
296 logfs_gc_pass(sb);
297
298 /* after all initializations are done, replay the journal
299 * for rw-mounts, if necessary */
300 err = logfs_replay_journal(sb);
301 if (err)
302 return err;
303
304 return 0;
305}
306
307static int logfs_get_sb_final(struct super_block *sb)
308{
309 struct logfs_super *super = logfs_super(sb);
310 struct inode *rootdir;
311 int err;
312
313 /* root dir */
314 rootdir = logfs_iget(sb, LOGFS_INO_ROOT);
315 if (IS_ERR(rootdir))
316 goto fail;
317
318 sb->s_root = d_make_root(rootdir);
319 if (!sb->s_root)
320 goto fail;
321
322 /* at that point we know that ->put_super() will be called */
323 super->s_erase_page = alloc_pages(GFP_KERNEL, 0);
324 if (!super->s_erase_page)
325 return -ENOMEM;
326 memset(page_address(super->s_erase_page), 0xFF, PAGE_SIZE);
327
328 /* FIXME: check for read-only mounts */
329 err = logfs_make_writeable(sb);
330 if (err) {
331 __free_page(super->s_erase_page);
332 return err;
333 }
334
335 log_super("LogFS: Finished mounting\n");
336 return 0;
337
338fail:
339 iput(super->s_master_inode);
340 iput(super->s_segfile_inode);
341 iput(super->s_mapping_inode);
342 return -EIO;
343}
344
345int logfs_check_ds(struct logfs_disk_super *ds)
346{
347 struct logfs_segment_header *sh = &ds->ds_sh;
348
349 if (ds->ds_magic != cpu_to_be64(LOGFS_MAGIC))
350 return -EINVAL;
351 if (sh->crc != logfs_crc32(sh, LOGFS_SEGMENT_HEADERSIZE, 4))
352 return -EINVAL;
353 if (ds->ds_crc != logfs_crc32(ds, sizeof(*ds),
354 LOGFS_SEGMENT_HEADERSIZE + 12))
355 return -EINVAL;
356 return 0;
357}
358
359static struct page *find_super_block(struct super_block *sb)
360{
361 struct logfs_super *super = logfs_super(sb);
362 struct page *first, *last;
363
364 first = super->s_devops->find_first_sb(sb, &super->s_sb_ofs[0]);
365 if (!first || IS_ERR(first))
366 return NULL;
367 last = super->s_devops->find_last_sb(sb, &super->s_sb_ofs[1]);
368 if (!last || IS_ERR(last)) {
369 put_page(first);
370 return NULL;
371 }
372
373 if (!logfs_check_ds(page_address(first))) {
374 put_page(last);
375 return first;
376 }
377
378 /* First one didn't work, try the second superblock */
379 if (!logfs_check_ds(page_address(last))) {
380 put_page(first);
381 return last;
382 }
383
384 /* Neither worked, sorry folks */
385 put_page(first);
386 put_page(last);
387 return NULL;
388}
389
390static int __logfs_read_sb(struct super_block *sb)
391{
392 struct logfs_super *super = logfs_super(sb);
393 struct page *page;
394 struct logfs_disk_super *ds;
395 int i;
396
397 page = find_super_block(sb);
398 if (!page)
399 return -EINVAL;
400
401 ds = page_address(page);
402 super->s_size = be64_to_cpu(ds->ds_filesystem_size);
403 super->s_root_reserve = be64_to_cpu(ds->ds_root_reserve);
404 super->s_speed_reserve = be64_to_cpu(ds->ds_speed_reserve);
405 super->s_bad_seg_reserve = be32_to_cpu(ds->ds_bad_seg_reserve);
406 super->s_segsize = 1 << ds->ds_segment_shift;
407 super->s_segmask = (1 << ds->ds_segment_shift) - 1;
408 super->s_segshift = ds->ds_segment_shift;
409 sb->s_blocksize = 1 << ds->ds_block_shift;
410 sb->s_blocksize_bits = ds->ds_block_shift;
411 super->s_writesize = 1 << ds->ds_write_shift;
412 super->s_writeshift = ds->ds_write_shift;
413 super->s_no_segs = super->s_size >> super->s_segshift;
414 super->s_no_blocks = super->s_segsize >> sb->s_blocksize_bits;
415 super->s_feature_incompat = be64_to_cpu(ds->ds_feature_incompat);
416 super->s_feature_ro_compat = be64_to_cpu(ds->ds_feature_ro_compat);
417 super->s_feature_compat = be64_to_cpu(ds->ds_feature_compat);
418 super->s_feature_flags = be64_to_cpu(ds->ds_feature_flags);
419
420 journal_for_each(i)
421 super->s_journal_seg[i] = be32_to_cpu(ds->ds_journal_seg[i]);
422
423 super->s_ifile_levels = ds->ds_ifile_levels;
424 super->s_iblock_levels = ds->ds_iblock_levels;
425 super->s_data_levels = ds->ds_data_levels;
426 super->s_total_levels = super->s_ifile_levels + super->s_iblock_levels
427 + super->s_data_levels;
428 put_page(page);
429 return 0;
430}
431
432static int logfs_read_sb(struct super_block *sb, int read_only)
433{
434 struct logfs_super *super = logfs_super(sb);
435 int ret;
436
437 super->s_btree_pool = mempool_create(32, btree_alloc, btree_free, NULL);
438 if (!super->s_btree_pool)
439 return -ENOMEM;
440
441 btree_init_mempool64(&super->s_shadow_tree.new, super->s_btree_pool);
442 btree_init_mempool64(&super->s_shadow_tree.old, super->s_btree_pool);
443 btree_init_mempool32(&super->s_shadow_tree.segment_map,
444 super->s_btree_pool);
445
446 ret = logfs_init_mapping(sb);
447 if (ret)
448 return ret;
449
450 ret = __logfs_read_sb(sb);
451 if (ret)
452 return ret;
453
454 if (super->s_feature_incompat & ~LOGFS_FEATURES_INCOMPAT)
455 return -EIO;
456 if ((super->s_feature_ro_compat & ~LOGFS_FEATURES_RO_COMPAT) &&
457 !read_only)
458 return -EIO;
459
460 ret = logfs_init_rw(sb);
461 if (ret)
462 return ret;
463
464 ret = logfs_init_areas(sb);
465 if (ret)
466 return ret;
467
468 ret = logfs_init_gc(sb);
469 if (ret)
470 return ret;
471
472 ret = logfs_init_journal(sb);
473 if (ret)
474 return ret;
475
476 return 0;
477}
478
479static void logfs_kill_sb(struct super_block *sb)
480{
481 struct logfs_super *super = logfs_super(sb);
482
483 log_super("LogFS: Start unmounting\n");
484 /* Alias entries slow down mount, so evict as many as possible */
485 sync_filesystem(sb);
486 logfs_write_anchor(sb);
487 free_areas(sb);
488
489 /*
490 * From this point on alias entries are simply dropped - and any
491 * writes to the object store are considered bugs.
492 */
493 log_super("LogFS: Now in shutdown\n");
494 generic_shutdown_super(sb);
495 super->s_flags |= LOGFS_SB_FLAG_SHUTDOWN;
496
497 BUG_ON(super->s_dirty_used_bytes || super->s_dirty_free_bytes);
498
499 logfs_cleanup_gc(sb);
500 logfs_cleanup_journal(sb);
501 logfs_cleanup_areas(sb);
502 logfs_cleanup_rw(sb);
503 if (super->s_erase_page)
504 __free_page(super->s_erase_page);
505 super->s_devops->put_device(super);
506 logfs_mempool_destroy(super->s_btree_pool);
507 logfs_mempool_destroy(super->s_alias_pool);
508 kfree(super);
509 log_super("LogFS: Finished unmounting\n");
510}
511
512static struct dentry *logfs_get_sb_device(struct logfs_super *super,
513 struct file_system_type *type, int flags)
514{
515 struct super_block *sb;
516 int err = -ENOMEM;
517 static int mount_count;
518
519 log_super("LogFS: Start mount %x\n", mount_count++);
520
521 err = -EINVAL;
522 sb = sget(type, logfs_sb_test, logfs_sb_set, flags | MS_NOATIME, super);
523 if (IS_ERR(sb)) {
524 super->s_devops->put_device(super);
525 kfree(super);
526 return ERR_CAST(sb);
527 }
528
529 if (sb->s_root) {
530 /* Device is already in use */
531 super->s_devops->put_device(super);
532 kfree(super);
533 return dget(sb->s_root);
534 }
535
536 /*
537 * sb->s_maxbytes is limited to 8TB (2^43 bytes). On 32bit systems the
538 * page cache only covers 16TB (a 32bit page index times 4KiB pages) and
539 * the upper 8TB are used for indirect blocks. On 64bit systems we could
540 * bump up the limit, but that would make the filesystem incompatible with 32bit systems.
541 */
542 sb->s_maxbytes = (1ull << 43) - 1;
543 sb->s_max_links = LOGFS_LINK_MAX;
544 sb->s_op = &logfs_super_operations;
545
546 err = logfs_read_sb(sb, sb->s_flags & MS_RDONLY);
547 if (err)
548 goto err1;
549
550 sb->s_flags |= MS_ACTIVE;
551 err = logfs_get_sb_final(sb);
552 if (err) {
553 deactivate_locked_super(sb);
554 return ERR_PTR(err);
555 }
556 return dget(sb->s_root);
557
558err1:
559 /* no ->s_root, no ->put_super() */
560 iput(super->s_master_inode);
561 iput(super->s_segfile_inode);
562 iput(super->s_mapping_inode);
563 deactivate_locked_super(sb);
564 return ERR_PTR(err);
565}
566
567static struct dentry *logfs_mount(struct file_system_type *type, int flags,
568 const char *devname, void *data)
569{
570 ulong mtdnr;
571 struct logfs_super *super;
572 int err;
573
574 super = kzalloc(sizeof(*super), GFP_KERNEL);
575 if (!super)
576 return ERR_PTR(-ENOMEM);
577
578 mutex_init(&super->s_dirop_mutex);
579 mutex_init(&super->s_object_alias_mutex);
580 INIT_LIST_HEAD(&super->s_freeing_list);
581
582 if (!devname)
583 err = logfs_get_sb_bdev(super, type, devname);
584 else if (strncmp(devname, "mtd", 3))
585 err = logfs_get_sb_bdev(super, type, devname);
586 else {
587 char *garbage;
588 mtdnr = simple_strtoul(devname+3, &garbage, 0);
589 if (*garbage)
590 err = -EINVAL;
591 else
592 err = logfs_get_sb_mtd(super, mtdnr);
593 }
594
595 if (err) {
596 kfree(super);
597 return ERR_PTR(err);
598 }
599
600 return logfs_get_sb_device(super, type, flags);
601}
602
603static struct file_system_type logfs_fs_type = {
604 .owner = THIS_MODULE,
605 .name = "logfs",
606 .mount = logfs_mount,
607 .kill_sb = logfs_kill_sb,
608 .fs_flags = FS_REQUIRES_DEV,
609
610};
611MODULE_ALIAS_FS("logfs");
612
613static int __init logfs_init(void)
614{
615 int ret;
616
617 emergency_page = alloc_pages(GFP_KERNEL, 0);
618 if (!emergency_page)
619 return -ENOMEM;
620
621 ret = logfs_compr_init();
622 if (ret)
623 goto out1;
624
625 ret = logfs_init_inode_cache();
626 if (ret)
627 goto out2;
628
629 ret = register_filesystem(&logfs_fs_type);
630 if (!ret)
631 return 0;
632 logfs_destroy_inode_cache();
633out2:
634 logfs_compr_exit();
635out1:
636 __free_pages(emergency_page, 0);
637 return ret;
638}
639
640static void __exit logfs_exit(void)
641{
642 unregister_filesystem(&logfs_fs_type);
643 logfs_destroy_inode_cache();
644 logfs_compr_exit();
645 __free_pages(emergency_page, 0);
646}
647
648module_init(logfs_init);
649module_exit(logfs_exit);
650
651MODULE_LICENSE("GPL v2");
652MODULE_AUTHOR("Joern Engel <joern@logfs.org>");
653MODULE_DESCRIPTION("scalable flash filesystem");