Diffstat (limited to 'fs/erofs')
-rw-r--r--  fs/erofs/Kconfig            98
-rw-r--r--  fs/erofs/Makefile           11
-rw-r--r--  fs/erofs/compress.h         60
-rw-r--r--  fs/erofs/data.c            423
-rw-r--r--  fs/erofs/decompressor.c    358
-rw-r--r--  fs/erofs/dir.c             139
-rw-r--r--  fs/erofs/erofs_fs.h        307
-rw-r--r--  fs/erofs/inode.c           332
-rw-r--r--  fs/erofs/internal.h        553
-rw-r--r--  fs/erofs/namei.c           251
-rw-r--r--  fs/erofs/super.c           669
-rw-r--r--  fs/erofs/tagptr.h          110
-rw-r--r--  fs/erofs/utils.c           333
-rw-r--r--  fs/erofs/xattr.c           703
-rw-r--r--  fs/erofs/xattr.h            92
-rw-r--r--  fs/erofs/zdata.c          1432
-rw-r--r--  fs/erofs/zdata.h           193
-rw-r--r--  fs/erofs/zmap.c            466
-rw-r--r--  fs/erofs/zpvec.h           157
19 files changed, 6687 insertions, 0 deletions
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
new file mode 100644
index 000000000000..16316d1adca3
--- /dev/null
+++ b/fs/erofs/Kconfig
@@ -0,0 +1,98 @@
1# SPDX-License-Identifier: GPL-2.0-only
2
3config EROFS_FS
4 tristate "EROFS filesystem support"
5 depends on BLOCK
6 help
7	  EROFS (Enhanced Read-Only File System) is a lightweight
8	  read-only file system with modern designs (e.g. page-sized
9	  blocks, inline xattrs/data, etc.) for scenarios which need
10	  high-performance read-only solutions, e.g. Android OS
11	  for mobile phones and LIVECDs.
12
13	  It also provides fixed-sized output compression support,
14	  which improves storage density and keeps relatively higher
15	  compression ratios, and is especially useful for achieving
16	  high performance on embedded devices with limited memory.
17
18 If unsure, say N.
19
20config EROFS_FS_DEBUG
21 bool "EROFS debugging feature"
22 depends on EROFS_FS
23 help
24 Print debugging messages and enable more BUG_ONs which check
25 filesystem consistency and find potential issues aggressively,
26	  which is useful for Android eng builds, for example.
27
28 For daily use, say N.
29
30config EROFS_FAULT_INJECTION
31 bool "EROFS fault injection facility"
32 depends on EROFS_FS
33 help
34 Test EROFS to inject faults such as ENOMEM, EIO, and so on.
35 If unsure, say N.
36
37config EROFS_FS_XATTR
38 bool "EROFS extended attributes"
39 depends on EROFS_FS
40 default y
41 help
42 Extended attributes are name:value pairs associated with inodes by
43 the kernel or by users (see the attr(5) manual page, or visit
44 <http://acl.bestbits.at/> for details).
45
46 If unsure, say N.
47
48config EROFS_FS_POSIX_ACL
49 bool "EROFS Access Control Lists"
50 depends on EROFS_FS_XATTR
51 select FS_POSIX_ACL
52 default y
53 help
54 Posix Access Control Lists (ACLs) support permissions for users and
55 groups beyond the owner/group/world scheme.
56
57 To learn more about Access Control Lists, visit the POSIX ACLs for
58 Linux website <http://acl.bestbits.at/>.
59
60 If you don't know what Access Control Lists are, say N.
61
62config EROFS_FS_SECURITY
63 bool "EROFS Security Labels"
64 depends on EROFS_FS_XATTR
65 default y
66 help
67 Security labels provide an access control facility to support Linux
68 Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO
69 Linux. This option enables an extended attribute handler for file
70	  security labels in the erofs filesystem, which requires enabling
71	  extended attribute support in advance.
72
73 If you are not using a security module, say N.
74
75config EROFS_FS_ZIP
76 bool "EROFS Data Compression Support"
77 depends on EROFS_FS
78 select LZ4_DECOMPRESS
79 default y
80 help
81 Enable fixed-sized output compression for EROFS.
82
83 If you don't want to enable compression feature, say N.
84
85config EROFS_FS_CLUSTER_PAGE_LIMIT
86 int "EROFS Cluster Pages Hard Limit"
87 depends on EROFS_FS_ZIP
88 range 1 256
89 default "1"
90 help
91	  Indicates the maximum # of pages of a compressed
92	  physical cluster.
93
94	  For example, if files in an image were compressed
95	  in 8k units, the hard limit should not be configured
96	  to less than 2. Otherwise, the image will refuse
97	  to mount on this kernel.
98
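A quick check of the arithmetic behind the EROFS_FS_CLUSTER_PAGE_LIMIT help text above: the limit is compared against the page count of a compressed physical cluster, which is simply the compression unit rounded up to pages. A minimal sketch, where check_pcluster_limit() is a hypothetical helper name and only DIV_ROUND_UP, PAGE_SIZE and the Kconfig symbol come from the kernel/patch:

#include <linux/kernel.h>	/* DIV_ROUND_UP */
#include <linux/mm.h>		/* PAGE_SIZE */

/* hypothetical helper: does a compressed physical cluster of
 * 'pclustersize' bytes fit within the configured hard limit?
 */
static bool check_pcluster_limit(unsigned int pclustersize)
{
	/* e.g. an 8k-unit image needs DIV_ROUND_UP(8192, 4096) = 2 pages,
	 * so CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT must be at least 2
	 */
	return DIV_ROUND_UP(pclustersize, PAGE_SIZE) <=
	       CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT;
}
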
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
new file mode 100644
index 000000000000..46f2aa4ba46c
--- /dev/null
+++ b/fs/erofs/Makefile
@@ -0,0 +1,11 @@
1# SPDX-License-Identifier: GPL-2.0-only
2
3EROFS_VERSION = "1.0"
4
5ccflags-y += -DEROFS_VERSION=\"$(EROFS_VERSION)\"
6
7obj-$(CONFIG_EROFS_FS) += erofs.o
8erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
9erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
10erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o
11
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
new file mode 100644
index 000000000000..07d279fd5d67
--- /dev/null
+++ b/fs/erofs/compress.h
@@ -0,0 +1,60 @@
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * Copyright (C) 2019 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#ifndef __EROFS_FS_COMPRESS_H
8#define __EROFS_FS_COMPRESS_H
9
10#include "internal.h"
11
12enum {
13 Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX,
14 Z_EROFS_COMPRESSION_RUNTIME_MAX
15};
16
17struct z_erofs_decompress_req {
18 struct super_block *sb;
19 struct page **in, **out;
20
21 unsigned short pageofs_out;
22 unsigned int inputsize, outputsize;
23
24 /* indicate the algorithm will be used for decompression */
25 unsigned int alg;
26 bool inplace_io, partial_decoding;
27};
28
29/*
30 * - 0x5A110C8D ('sallocated', Z_EROFS_MAPPING_STAGING) -
31 *   used to mark temporarily allocated pages, distinguishing them
32 *   from other file/cached pages and NULL-mapping pages.
33 */
34#define Z_EROFS_MAPPING_STAGING ((void *)0x5A110C8D)
35
36/* check if a page is marked as staging */
37static inline bool z_erofs_page_is_staging(struct page *page)
38{
39 return page->mapping == Z_EROFS_MAPPING_STAGING;
40}
41
42static inline bool z_erofs_put_stagingpage(struct list_head *pagepool,
43 struct page *page)
44{
45 if (!z_erofs_page_is_staging(page))
46 return false;
47
48 /* staging pages should not be used by others at the same time */
49 if (page_ref_count(page) > 1)
50 put_page(page);
51 else
52 list_add(&page->lru, pagepool);
53 return true;
54}
55
56int z_erofs_decompress(struct z_erofs_decompress_req *rq,
57 struct list_head *pagepool);
58
59#endif
60
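To show how the staging marker above is meant to be used, here is a minimal sketch of the allocation side, modelled on lz4_prepare_destpages() later in this patch; grab_staging_page() is a hypothetical name, while erofs_allocpage() is the page-pool helper this patch adds in utils.c:

#include "compress.h"

/* sketch: grab one bounce page from the pool and mark it as staging */
static struct page *grab_staging_page(struct list_head *pagepool)
{
	struct page *page = erofs_allocpage(pagepool, GFP_KERNEL, false);

	if (!page)
		return NULL;
	/* let z_erofs_page_is_staging() recognize it later */
	page->mapping = Z_EROFS_MAPPING_STAGING;
	return page;
}

After decompression, every output page can be handed to z_erofs_put_stagingpage(): a staging page is either released (if still referenced elsewhere) or recycled into the page pool, while a regular file/cached page makes the helper return false so the caller releases it through the normal path.
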
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
new file mode 100644
index 000000000000..fda16ec8863e
--- /dev/null
+++ b/fs/erofs/data.c
@@ -0,0 +1,423 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2017-2018 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#include "internal.h"
8#include <linux/prefetch.h>
9
10#include <trace/events/erofs.h>
11
12static inline void read_endio(struct bio *bio)
13{
14 struct super_block *const sb = bio->bi_private;
15 struct bio_vec *bvec;
16 blk_status_t err = bio->bi_status;
17 struct bvec_iter_all iter_all;
18
19 if (time_to_inject(EROFS_SB(sb), FAULT_READ_IO)) {
20 erofs_show_injection_info(FAULT_READ_IO);
21 err = BLK_STS_IOERR;
22 }
23
24 bio_for_each_segment_all(bvec, bio, iter_all) {
25 struct page *page = bvec->bv_page;
26
27 /* page is already locked */
28 DBG_BUGON(PageUptodate(page));
29
30 if (unlikely(err))
31 SetPageError(page);
32 else
33 SetPageUptodate(page);
34
35 unlock_page(page);
36 /* page could be reclaimed now */
37 }
38 bio_put(bio);
39}
40
41/* prio -- true is used for dir */
42struct page *__erofs_get_meta_page(struct super_block *sb,
43 erofs_blk_t blkaddr, bool prio, bool nofail)
44{
45 struct inode *const bd_inode = sb->s_bdev->bd_inode;
46 struct address_space *const mapping = bd_inode->i_mapping;
47 /* prefer retrying in the allocator to blindly looping below */
48 const gfp_t gfp = mapping_gfp_constraint(mapping, ~__GFP_FS) |
49 (nofail ? __GFP_NOFAIL : 0);
50 unsigned int io_retries = nofail ? EROFS_IO_MAX_RETRIES_NOFAIL : 0;
51 struct page *page;
52 int err;
53
54repeat:
55 page = find_or_create_page(mapping, blkaddr, gfp);
56 if (unlikely(!page)) {
57 DBG_BUGON(nofail);
58 return ERR_PTR(-ENOMEM);
59 }
60 DBG_BUGON(!PageLocked(page));
61
62 if (!PageUptodate(page)) {
63 struct bio *bio;
64
65 bio = erofs_grab_bio(sb, blkaddr, 1, sb, read_endio, nofail);
66 if (IS_ERR(bio)) {
67 DBG_BUGON(nofail);
68 err = PTR_ERR(bio);
69 goto err_out;
70 }
71
72 err = bio_add_page(bio, page, PAGE_SIZE, 0);
73 if (unlikely(err != PAGE_SIZE)) {
74 err = -EFAULT;
75 goto err_out;
76 }
77
78 __submit_bio(bio, REQ_OP_READ,
79 REQ_META | (prio ? REQ_PRIO : 0));
80
81 lock_page(page);
82
83 /* this page has been truncated by others */
84 if (unlikely(page->mapping != mapping)) {
85unlock_repeat:
86 unlock_page(page);
87 put_page(page);
88 goto repeat;
89 }
90
91 /* more likely a read error */
92 if (unlikely(!PageUptodate(page))) {
93 if (io_retries) {
94 --io_retries;
95 goto unlock_repeat;
96 }
97 err = -EIO;
98 goto err_out;
99 }
100 }
101 return page;
102
103err_out:
104 unlock_page(page);
105 put_page(page);
106 return ERR_PTR(err);
107}
108
109static int erofs_map_blocks_flatmode(struct inode *inode,
110 struct erofs_map_blocks *map,
111 int flags)
112{
113 int err = 0;
114 erofs_blk_t nblocks, lastblk;
115 u64 offset = map->m_la;
116 struct erofs_vnode *vi = EROFS_V(inode);
117
118 trace_erofs_map_blocks_flatmode_enter(inode, map, flags);
119
120 nblocks = DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
121 lastblk = nblocks - is_inode_flat_inline(inode);
122
123 if (unlikely(offset >= inode->i_size)) {
124 /* leave out-of-bound access unmapped */
125 map->m_flags = 0;
126 map->m_plen = 0;
127 goto out;
128 }
129
130 /* there is no hole in flatmode */
131 map->m_flags = EROFS_MAP_MAPPED;
132
133 if (offset < blknr_to_addr(lastblk)) {
134 map->m_pa = blknr_to_addr(vi->raw_blkaddr) + map->m_la;
135 map->m_plen = blknr_to_addr(lastblk) - offset;
136 } else if (is_inode_flat_inline(inode)) {
137 /* 2 - inode inline B: inode, [xattrs], inline last blk... */
138 struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
139
140 map->m_pa = iloc(sbi, vi->nid) + vi->inode_isize +
141 vi->xattr_isize + erofs_blkoff(map->m_la);
142 map->m_plen = inode->i_size - offset;
143
144 /* inline data should be located in one meta block */
145 if (erofs_blkoff(map->m_pa) + map->m_plen > PAGE_SIZE) {
146 errln("inline data cross block boundary @ nid %llu",
147 vi->nid);
148 DBG_BUGON(1);
149 err = -EFSCORRUPTED;
150 goto err_out;
151 }
152
153 map->m_flags |= EROFS_MAP_META;
154 } else {
155 errln("internal error @ nid: %llu (size %llu), m_la 0x%llx",
156 vi->nid, inode->i_size, map->m_la);
157 DBG_BUGON(1);
158 err = -EIO;
159 goto err_out;
160 }
161
162out:
163 map->m_llen = map->m_plen;
164
165err_out:
166 trace_erofs_map_blocks_flatmode_exit(inode, map, flags, 0);
167 return err;
168}
169
170int erofs_map_blocks(struct inode *inode,
171 struct erofs_map_blocks *map, int flags)
172{
173 if (unlikely(is_inode_layout_compression(inode))) {
174 int err = z_erofs_map_blocks_iter(inode, map, flags);
175
176 if (map->mpage) {
177 put_page(map->mpage);
178 map->mpage = NULL;
179 }
180 return err;
181 }
182 return erofs_map_blocks_flatmode(inode, map, flags);
183}
184
185static inline struct bio *erofs_read_raw_page(struct bio *bio,
186 struct address_space *mapping,
187 struct page *page,
188 erofs_off_t *last_block,
189 unsigned int nblocks,
190 bool ra)
191{
192 struct inode *const inode = mapping->host;
193 struct super_block *const sb = inode->i_sb;
194 erofs_off_t current_block = (erofs_off_t)page->index;
195 int err;
196
197 DBG_BUGON(!nblocks);
198
199 if (PageUptodate(page)) {
200 err = 0;
201 goto has_updated;
202 }
203
204	/* note that for the readpage case, bio also equals NULL */
205 if (bio &&
206 /* not continuous */
207 *last_block + 1 != current_block) {
208submit_bio_retry:
209 __submit_bio(bio, REQ_OP_READ, 0);
210 bio = NULL;
211 }
212
213 if (!bio) {
214 struct erofs_map_blocks map = {
215 .m_la = blknr_to_addr(current_block),
216 };
217 erofs_blk_t blknr;
218 unsigned int blkoff;
219
220 err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
221 if (unlikely(err))
222 goto err_out;
223
224 /* zero out the holed page */
225 if (unlikely(!(map.m_flags & EROFS_MAP_MAPPED))) {
226 zero_user_segment(page, 0, PAGE_SIZE);
227 SetPageUptodate(page);
228
229 /* imply err = 0, see erofs_map_blocks */
230 goto has_updated;
231 }
232
233 /* for RAW access mode, m_plen must be equal to m_llen */
234 DBG_BUGON(map.m_plen != map.m_llen);
235
236 blknr = erofs_blknr(map.m_pa);
237 blkoff = erofs_blkoff(map.m_pa);
238
239 /* deal with inline page */
240 if (map.m_flags & EROFS_MAP_META) {
241 void *vsrc, *vto;
242 struct page *ipage;
243
244 DBG_BUGON(map.m_plen > PAGE_SIZE);
245
246 ipage = erofs_get_meta_page(inode->i_sb, blknr, 0);
247
248 if (IS_ERR(ipage)) {
249 err = PTR_ERR(ipage);
250 goto err_out;
251 }
252
253 vsrc = kmap_atomic(ipage);
254 vto = kmap_atomic(page);
255 memcpy(vto, vsrc + blkoff, map.m_plen);
256 memset(vto + map.m_plen, 0, PAGE_SIZE - map.m_plen);
257 kunmap_atomic(vto);
258 kunmap_atomic(vsrc);
259 flush_dcache_page(page);
260
261 SetPageUptodate(page);
262 /* TODO: could we unlock the page earlier? */
263 unlock_page(ipage);
264 put_page(ipage);
265
266 /* imply err = 0, see erofs_map_blocks */
267 goto has_updated;
268 }
269
270 /* pa must be block-aligned for raw reading */
271 DBG_BUGON(erofs_blkoff(map.m_pa));
272
273 /* max # of continuous pages */
274 if (nblocks > DIV_ROUND_UP(map.m_plen, PAGE_SIZE))
275 nblocks = DIV_ROUND_UP(map.m_plen, PAGE_SIZE);
276 if (nblocks > BIO_MAX_PAGES)
277 nblocks = BIO_MAX_PAGES;
278
279 bio = erofs_grab_bio(sb, blknr, nblocks, sb,
280 read_endio, false);
281 if (IS_ERR(bio)) {
282 err = PTR_ERR(bio);
283 bio = NULL;
284 goto err_out;
285 }
286 }
287
288 err = bio_add_page(bio, page, PAGE_SIZE, 0);
289 /* out of the extent or bio is full */
290 if (err < PAGE_SIZE)
291 goto submit_bio_retry;
292
293 *last_block = current_block;
294
295	/* submit the bio in advance in case it is followed by too many gaps */
296 if (bio->bi_iter.bi_size >= bio->bi_max_vecs * PAGE_SIZE) {
297 /* err should reassign to 0 after submitting */
298 err = 0;
299 goto submit_bio_out;
300 }
301
302 return bio;
303
304err_out:
305 /* for sync reading, set page error immediately */
306 if (!ra) {
307 SetPageError(page);
308 ClearPageUptodate(page);
309 }
310has_updated:
311 unlock_page(page);
312
313	/* if updated manually, continuous pages have a gap */
314 if (bio)
315submit_bio_out:
316 __submit_bio(bio, REQ_OP_READ, 0);
317
318 return unlikely(err) ? ERR_PTR(err) : NULL;
319}
320
321/*
322 * since we don't have write or truncate flows, no inode
323 * locking needs to be held at the moment.
324 */
325static int erofs_raw_access_readpage(struct file *file, struct page *page)
326{
327 erofs_off_t last_block;
328 struct bio *bio;
329
330 trace_erofs_readpage(page, true);
331
332 bio = erofs_read_raw_page(NULL, page->mapping,
333 page, &last_block, 1, false);
334
335 if (IS_ERR(bio))
336 return PTR_ERR(bio);
337
338 DBG_BUGON(bio); /* since we have only one bio -- must be NULL */
339 return 0;
340}
341
342static int erofs_raw_access_readpages(struct file *filp,
343 struct address_space *mapping,
344 struct list_head *pages,
345 unsigned int nr_pages)
346{
347 erofs_off_t last_block;
348 struct bio *bio = NULL;
349 gfp_t gfp = readahead_gfp_mask(mapping);
350 struct page *page = list_last_entry(pages, struct page, lru);
351
352 trace_erofs_readpages(mapping->host, page, nr_pages, true);
353
354 for (; nr_pages; --nr_pages) {
355 page = list_entry(pages->prev, struct page, lru);
356
357 prefetchw(&page->flags);
358 list_del(&page->lru);
359
360 if (!add_to_page_cache_lru(page, mapping, page->index, gfp)) {
361 bio = erofs_read_raw_page(bio, mapping, page,
362 &last_block, nr_pages, true);
363
364 /* all the page errors are ignored when readahead */
365 if (IS_ERR(bio)) {
366 pr_err("%s, readahead error at page %lu of nid %llu\n",
367 __func__, page->index,
368 EROFS_V(mapping->host)->nid);
369
370 bio = NULL;
371 }
372 }
373
374 /* pages could still be locked */
375 put_page(page);
376 }
377 DBG_BUGON(!list_empty(pages));
378
379 /* the rare case (end in gaps) */
380 if (unlikely(bio))
381 __submit_bio(bio, REQ_OP_READ, 0);
382 return 0;
383}
384
385static int erofs_get_block(struct inode *inode, sector_t iblock,
386 struct buffer_head *bh, int create)
387{
388 struct erofs_map_blocks map = {
389 .m_la = iblock << 9,
390 };
391 int err;
392
393 err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
394 if (err)
395 return err;
396
397 if (map.m_flags & EROFS_MAP_MAPPED)
398 bh->b_blocknr = erofs_blknr(map.m_pa);
399
400 return err;
401}
402
403static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
404{
405 struct inode *inode = mapping->host;
406
407 if (is_inode_flat_inline(inode)) {
408 erofs_blk_t blks = i_size_read(inode) >> LOG_BLOCK_SIZE;
409
410 if (block >> LOG_SECTORS_PER_BLOCK >= blks)
411 return 0;
412 }
413
414 return generic_block_bmap(mapping, block, erofs_get_block);
415}
416
417/* for uncompressed (aligned) files and raw access for other files */
418const struct address_space_operations erofs_raw_access_aops = {
419 .readpage = erofs_raw_access_readpage,
420 .readpages = erofs_raw_access_readpages,
421 .bmap = erofs_bmap,
422};
423
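As a side note on the flat-inline branch of erofs_map_blocks_flatmode() above: the physical address of the inline tail data is a pure offset computation over the on-disk inode. A standalone sketch under the definitions from internal.h, where flatinline_pa() is a hypothetical name and not part of the patch:

#include "internal.h"

/* sketch: physical address of the inline (tail) data of a flat-inline inode */
static erofs_off_t flatinline_pa(struct erofs_sb_info *sbi,
				 struct erofs_vnode *vi, erofs_off_t la)
{
	/* the tail block sits right after the on-disk inode and its xattrs */
	return iloc(sbi, vi->nid) + vi->inode_isize + vi->xattr_isize +
	       erofs_blkoff(la);
}

This is also why the code above insists that erofs_blkoff(m_pa) + m_plen stays within one block: the inline tail must not cross a meta block boundary.
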
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
new file mode 100644
index 000000000000..5f4b7f302863
--- /dev/null
+++ b/fs/erofs/decompressor.c
@@ -0,0 +1,358 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2019 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#include "compress.h"
8#include <linux/module.h>
9#include <linux/lz4.h>
10
11#ifndef LZ4_DISTANCE_MAX /* history window size */
12#define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */
13#endif
14
15#define LZ4_MAX_DISTANCE_PAGES (DIV_ROUND_UP(LZ4_DISTANCE_MAX, PAGE_SIZE) + 1)
16#ifndef LZ4_DECOMPRESS_INPLACE_MARGIN
17#define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32)
18#endif
19
20struct z_erofs_decompressor {
21 /*
22	 * if destpages contain sparse holes, fill them with bounce pages.
23	 * it also checks whether destpages indicate continuous physical memory.
24 */
25 int (*prepare_destpages)(struct z_erofs_decompress_req *rq,
26 struct list_head *pagepool);
27 int (*decompress)(struct z_erofs_decompress_req *rq, u8 *out);
28 char *name;
29};
30
31static bool use_vmap;
32module_param(use_vmap, bool, 0444);
33MODULE_PARM_DESC(use_vmap, "Use vmap() instead of vm_map_ram() (default 0)");
34
35static int lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
36 struct list_head *pagepool)
37{
38 const unsigned int nr =
39 PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
40 struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL };
41 unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES,
42 BITS_PER_LONG)] = { 0 };
43 void *kaddr = NULL;
44 unsigned int i, j, top;
45
46 top = 0;
47 for (i = j = 0; i < nr; ++i, ++j) {
48 struct page *const page = rq->out[i];
49 struct page *victim;
50
51 if (j >= LZ4_MAX_DISTANCE_PAGES)
52 j = 0;
53
54 /* 'valid' bounced can only be tested after a complete round */
55 if (test_bit(j, bounced)) {
56 DBG_BUGON(i < LZ4_MAX_DISTANCE_PAGES);
57 DBG_BUGON(top >= LZ4_MAX_DISTANCE_PAGES);
58 availables[top++] = rq->out[i - LZ4_MAX_DISTANCE_PAGES];
59 }
60
61 if (page) {
62 __clear_bit(j, bounced);
63 if (kaddr) {
64 if (kaddr + PAGE_SIZE == page_address(page))
65 kaddr += PAGE_SIZE;
66 else
67 kaddr = NULL;
68 } else if (!i) {
69 kaddr = page_address(page);
70 }
71 continue;
72 }
73 kaddr = NULL;
74 __set_bit(j, bounced);
75
76 if (top) {
77 victim = availables[--top];
78 get_page(victim);
79 } else {
80 victim = erofs_allocpage(pagepool, GFP_KERNEL, false);
81 if (unlikely(!victim))
82 return -ENOMEM;
83 victim->mapping = Z_EROFS_MAPPING_STAGING;
84 }
85 rq->out[i] = victim;
86 }
87 return kaddr ? 1 : 0;
88}
89
90static void *generic_copy_inplace_data(struct z_erofs_decompress_req *rq,
91 u8 *src, unsigned int pageofs_in)
92{
93 /*
94	 * if in-place decompression is ongoing, the compressed data should be
95	 * copied out first so that it won't be overwritten by the output.
96 */
97 struct page **in = rq->in;
98 u8 *const tmp = erofs_get_pcpubuf(0);
99 u8 *tmpp = tmp;
100 unsigned int inlen = rq->inputsize - pageofs_in;
101 unsigned int count = min_t(uint, inlen, PAGE_SIZE - pageofs_in);
102
103 while (tmpp < tmp + inlen) {
104 if (!src)
105 src = kmap_atomic(*in);
106 memcpy(tmpp, src + pageofs_in, count);
107 kunmap_atomic(src);
108 src = NULL;
109 tmpp += count;
110 pageofs_in = 0;
111 count = PAGE_SIZE;
112 ++in;
113 }
114 return tmp;
115}
116
117static int lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
118{
119 unsigned int inputmargin, inlen;
120 u8 *src;
121 bool copied, support_0padding;
122 int ret;
123
124 if (rq->inputsize > PAGE_SIZE)
125 return -EOPNOTSUPP;
126
127 src = kmap_atomic(*rq->in);
128 inputmargin = 0;
129 support_0padding = false;
130
131 /* decompression inplace is only safe when 0padding is enabled */
132 if (EROFS_SB(rq->sb)->requirements & EROFS_REQUIREMENT_LZ4_0PADDING) {
133 support_0padding = true;
134
135 while (!src[inputmargin & ~PAGE_MASK])
136 if (!(++inputmargin & ~PAGE_MASK))
137 break;
138
139 if (inputmargin >= rq->inputsize) {
140 kunmap_atomic(src);
141 return -EIO;
142 }
143 }
144
145 copied = false;
146 inlen = rq->inputsize - inputmargin;
147 if (rq->inplace_io) {
148 const uint oend = (rq->pageofs_out +
149 rq->outputsize) & ~PAGE_MASK;
150 const uint nr = PAGE_ALIGN(rq->pageofs_out +
151 rq->outputsize) >> PAGE_SHIFT;
152
153 if (rq->partial_decoding || !support_0padding ||
154 rq->out[nr - 1] != rq->in[0] ||
155 rq->inputsize - oend <
156 LZ4_DECOMPRESS_INPLACE_MARGIN(inlen)) {
157 src = generic_copy_inplace_data(rq, src, inputmargin);
158 inputmargin = 0;
159 copied = true;
160 }
161 }
162
163 ret = LZ4_decompress_safe_partial(src + inputmargin, out,
164 inlen, rq->outputsize,
165 rq->outputsize);
166 if (ret < 0) {
167 errln("%s, failed to decompress, in[%p, %u, %u] out[%p, %u]",
168 __func__, src + inputmargin, inlen, inputmargin,
169 out, rq->outputsize);
170 WARN_ON(1);
171 print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET,
172 16, 1, src + inputmargin, inlen, true);
173 print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET,
174 16, 1, out, rq->outputsize, true);
175 ret = -EIO;
176 }
177
178 if (copied)
179 erofs_put_pcpubuf(src);
180 else
181 kunmap_atomic(src);
182 return ret;
183}
184
185static struct z_erofs_decompressor decompressors[] = {
186 [Z_EROFS_COMPRESSION_SHIFTED] = {
187 .name = "shifted"
188 },
189 [Z_EROFS_COMPRESSION_LZ4] = {
190 .prepare_destpages = lz4_prepare_destpages,
191 .decompress = lz4_decompress,
192 .name = "lz4"
193 },
194};
195
196static void copy_from_pcpubuf(struct page **out, const char *dst,
197 unsigned short pageofs_out,
198 unsigned int outputsize)
199{
200 const char *end = dst + outputsize;
201 const unsigned int righthalf = PAGE_SIZE - pageofs_out;
202 const char *cur = dst - pageofs_out;
203
204 while (cur < end) {
205 struct page *const page = *out++;
206
207 if (page) {
208 char *buf = kmap_atomic(page);
209
210 if (cur >= dst) {
211 memcpy(buf, cur, min_t(uint, PAGE_SIZE,
212 end - cur));
213 } else {
214 memcpy(buf + pageofs_out, cur + pageofs_out,
215 min_t(uint, righthalf, end - cur));
216 }
217 kunmap_atomic(buf);
218 }
219 cur += PAGE_SIZE;
220 }
221}
222
223static void *erofs_vmap(struct page **pages, unsigned int count)
224{
225 int i = 0;
226
227 if (use_vmap)
228 return vmap(pages, count, VM_MAP, PAGE_KERNEL);
229
230 while (1) {
231 void *addr = vm_map_ram(pages, count, -1, PAGE_KERNEL);
232
233		/* retry two more times (3 times in total) */
234 if (addr || ++i >= 3)
235 return addr;
236 vm_unmap_aliases();
237 }
238 return NULL;
239}
240
241static void erofs_vunmap(const void *mem, unsigned int count)
242{
243 if (!use_vmap)
244 vm_unmap_ram(mem, count);
245 else
246 vunmap(mem);
247}
248
249static int decompress_generic(struct z_erofs_decompress_req *rq,
250 struct list_head *pagepool)
251{
252 const unsigned int nrpages_out =
253 PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
254 const struct z_erofs_decompressor *alg = decompressors + rq->alg;
255 unsigned int dst_maptype;
256 void *dst;
257 int ret;
258
259 if (nrpages_out == 1 && !rq->inplace_io) {
260 DBG_BUGON(!*rq->out);
261 dst = kmap_atomic(*rq->out);
262 dst_maptype = 0;
263 goto dstmap_out;
264 }
265
266 /*
267 * For the case of small output size (especially much less
268	 * than PAGE_SIZE), it is preferable to memcpy the decompressed
269	 * data rather than the compressed data.
270 */
271 if (rq->outputsize <= PAGE_SIZE * 7 / 8) {
272 dst = erofs_get_pcpubuf(0);
273 if (IS_ERR(dst))
274 return PTR_ERR(dst);
275
276 rq->inplace_io = false;
277 ret = alg->decompress(rq, dst);
278 if (!ret)
279 copy_from_pcpubuf(rq->out, dst, rq->pageofs_out,
280 rq->outputsize);
281
282 erofs_put_pcpubuf(dst);
283 return ret;
284 }
285
286 ret = alg->prepare_destpages(rq, pagepool);
287 if (ret < 0) {
288 return ret;
289 } else if (ret) {
290 dst = page_address(*rq->out);
291 dst_maptype = 1;
292 goto dstmap_out;
293 }
294
295 dst = erofs_vmap(rq->out, nrpages_out);
296 if (!dst)
297 return -ENOMEM;
298 dst_maptype = 2;
299
300dstmap_out:
301 ret = alg->decompress(rq, dst + rq->pageofs_out);
302
303 if (!dst_maptype)
304 kunmap_atomic(dst);
305 else if (dst_maptype == 2)
306 erofs_vunmap(dst, nrpages_out);
307 return ret;
308}
309
310static int shifted_decompress(const struct z_erofs_decompress_req *rq,
311 struct list_head *pagepool)
312{
313 const unsigned int nrpages_out =
314 PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
315 const unsigned int righthalf = PAGE_SIZE - rq->pageofs_out;
316 unsigned char *src, *dst;
317
318 if (nrpages_out > 2) {
319 DBG_BUGON(1);
320 return -EIO;
321 }
322
323 if (rq->out[0] == *rq->in) {
324 DBG_BUGON(nrpages_out != 1);
325 return 0;
326 }
327
328 src = kmap_atomic(*rq->in);
329 if (!rq->out[0]) {
330 dst = NULL;
331 } else {
332 dst = kmap_atomic(rq->out[0]);
333 memcpy(dst + rq->pageofs_out, src, righthalf);
334 }
335
336 if (rq->out[1] == *rq->in) {
337 memmove(src, src + righthalf, rq->pageofs_out);
338 } else if (nrpages_out == 2) {
339 if (dst)
340 kunmap_atomic(dst);
341 DBG_BUGON(!rq->out[1]);
342 dst = kmap_atomic(rq->out[1]);
343 memcpy(dst, src + righthalf, rq->pageofs_out);
344 }
345 if (dst)
346 kunmap_atomic(dst);
347 kunmap_atomic(src);
348 return 0;
349}
350
351int z_erofs_decompress(struct z_erofs_decompress_req *rq,
352 struct list_head *pagepool)
353{
354 if (rq->alg == Z_EROFS_COMPRESSION_SHIFTED)
355 return shifted_decompress(rq, pagepool);
356 return decompress_generic(rq, pagepool);
357}
358
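The in-place decompression test in lz4_decompress() above is easier to follow when the safety condition is pulled out on its own. A hedged sketch, where inplace_is_safe() is a hypothetical helper and the real check additionally requires 0padding support, no partial decoding, and that the last output page is the input page:

/* sketch: is it safe to decompress in place, reusing the input page? */
static bool inplace_is_safe(unsigned int inputsize, unsigned int oend,
			    unsigned int inlen)
{
	/*
	 * mirrors the inequality used above: with LZ4's usual in-place
	 * margin, e.g. (4096 >> 8) + 32 = 48 bytes for a 4096-byte input,
	 * the compressed data may be reused as output only if enough
	 * slack is left past the decompressed end.
	 */
	return inputsize - oend >= LZ4_DECOMPRESS_INPLACE_MARGIN(inlen);
}
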
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
new file mode 100644
index 000000000000..1976e60e5174
--- /dev/null
+++ b/fs/erofs/dir.c
@@ -0,0 +1,139 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2017-2018 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#include "internal.h"
8
9static void debug_one_dentry(unsigned char d_type, const char *de_name,
10 unsigned int de_namelen)
11{
12#ifdef CONFIG_EROFS_FS_DEBUG
13	/* since the on-disk name doesn't carry a trailing '\0' */
14 unsigned char dbg_namebuf[EROFS_NAME_LEN + 1];
15
16 memcpy(dbg_namebuf, de_name, de_namelen);
17 dbg_namebuf[de_namelen] = '\0';
18
19 debugln("found dirent %s de_len %u d_type %d", dbg_namebuf,
20 de_namelen, d_type);
21#endif
22}
23
24static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx,
25 void *dentry_blk, unsigned int *ofs,
26 unsigned int nameoff, unsigned int maxsize)
27{
28 struct erofs_dirent *de = dentry_blk + *ofs;
29 const struct erofs_dirent *end = dentry_blk + nameoff;
30
31 while (de < end) {
32 const char *de_name;
33 unsigned int de_namelen;
34 unsigned char d_type;
35
36 d_type = fs_ftype_to_dtype(de->file_type);
37
38 nameoff = le16_to_cpu(de->nameoff);
39 de_name = (char *)dentry_blk + nameoff;
40
41 /* the last dirent in the block? */
42 if (de + 1 >= end)
43 de_namelen = strnlen(de_name, maxsize - nameoff);
44 else
45 de_namelen = le16_to_cpu(de[1].nameoff) - nameoff;
46
47 /* a corrupted entry is found */
48 if (unlikely(nameoff + de_namelen > maxsize ||
49 de_namelen > EROFS_NAME_LEN)) {
50 errln("bogus dirent @ nid %llu", EROFS_V(dir)->nid);
51 DBG_BUGON(1);
52 return -EFSCORRUPTED;
53 }
54
55 debug_one_dentry(d_type, de_name, de_namelen);
56 if (!dir_emit(ctx, de_name, de_namelen,
57 le64_to_cpu(de->nid), d_type))
58 /* stopped by some reason */
59 return 1;
60 ++de;
61 *ofs += sizeof(struct erofs_dirent);
62 }
63 *ofs = maxsize;
64 return 0;
65}
66
67static int erofs_readdir(struct file *f, struct dir_context *ctx)
68{
69 struct inode *dir = file_inode(f);
70 struct address_space *mapping = dir->i_mapping;
71 const size_t dirsize = i_size_read(dir);
72 unsigned int i = ctx->pos / EROFS_BLKSIZ;
73 unsigned int ofs = ctx->pos % EROFS_BLKSIZ;
74 int err = 0;
75 bool initial = true;
76
77 while (ctx->pos < dirsize) {
78 struct page *dentry_page;
79 struct erofs_dirent *de;
80 unsigned int nameoff, maxsize;
81
82 dentry_page = read_mapping_page(mapping, i, NULL);
83 if (dentry_page == ERR_PTR(-ENOMEM)) {
84 err = -ENOMEM;
85 break;
86 } else if (IS_ERR(dentry_page)) {
87 errln("fail to readdir of logical block %u of nid %llu",
88 i, EROFS_V(dir)->nid);
89 err = -EFSCORRUPTED;
90 break;
91 }
92
93 de = (struct erofs_dirent *)kmap(dentry_page);
94
95 nameoff = le16_to_cpu(de->nameoff);
96
97 if (unlikely(nameoff < sizeof(struct erofs_dirent) ||
98 nameoff >= PAGE_SIZE)) {
99 errln("%s, invalid de[0].nameoff %u @ nid %llu",
100 __func__, nameoff, EROFS_V(dir)->nid);
101 err = -EFSCORRUPTED;
102 goto skip_this;
103 }
104
105 maxsize = min_t(unsigned int,
106 dirsize - ctx->pos + ofs, PAGE_SIZE);
107
108		/* search dirents starting at an arbitrary position */
109 if (unlikely(initial)) {
110 initial = false;
111
112 ofs = roundup(ofs, sizeof(struct erofs_dirent));
113 if (unlikely(ofs >= nameoff))
114 goto skip_this;
115 }
116
117 err = erofs_fill_dentries(dir, ctx, de, &ofs,
118 nameoff, maxsize);
119skip_this:
120 kunmap(dentry_page);
121
122 put_page(dentry_page);
123
124 ctx->pos = blknr_to_addr(i) + ofs;
125
126 if (unlikely(err))
127 break;
128 ++i;
129 ofs = 0;
130 }
131 return err < 0 ? err : 0;
132}
133
134const struct file_operations erofs_dir_fops = {
135 .llseek = generic_file_llseek,
136 .read = generic_read_dir,
137 .iterate_shared = erofs_readdir,
138};
139
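The dirent layout that erofs_fill_dentries() above relies on (a dirent array at the start of the block, names packed back to back after it, so the first nameoff also marks the end of the array) can be summed up in a short sketch; dirent_name() is a hypothetical helper and skips the bounds checks the real code performs:

/* sketch: name and length of the k-th dirent inside one dentry block */
static const char *dirent_name(void *dentry_blk, unsigned int k,
			       unsigned int maxsize, unsigned int *namelen)
{
	struct erofs_dirent *de = dentry_blk;
	/* nameoff of dirent 0 marks the end of the dirent array */
	const unsigned int ndirents = le16_to_cpu(de->nameoff) /
				      sizeof(struct erofs_dirent);
	const unsigned int nameoff = le16_to_cpu(de[k].nameoff);

	if (k + 1 < ndirents)	/* names are packed back to back */
		*namelen = le16_to_cpu(de[k + 1].nameoff) - nameoff;
	else			/* the last name is bounded by the block end */
		*namelen = strnlen((char *)dentry_blk + nameoff,
				   maxsize - nameoff);
	return (char *)dentry_blk + nameoff;
}
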
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
new file mode 100644
index 000000000000..afa7d45ca958
--- /dev/null
+++ b/fs/erofs/erofs_fs.h
@@ -0,0 +1,307 @@
1/* SPDX-License-Identifier: GPL-2.0-only OR Apache-2.0 */
2/*
3 * Copyright (C) 2017-2018 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#ifndef __EROFS_FS_H
8#define __EROFS_FS_H
9
10/* Enhanced(Extended) ROM File System */
11#define EROFS_SUPER_OFFSET 1024
12
13/*
14 * Any bits that aren't in EROFS_ALL_REQUIREMENTS should be
15 * incompatible with this kernel version.
16 */
17#define EROFS_REQUIREMENT_LZ4_0PADDING 0x00000001
18#define EROFS_ALL_REQUIREMENTS EROFS_REQUIREMENT_LZ4_0PADDING
19
20struct erofs_super_block {
21/* 0 */__le32 magic; /* in the little endian */
22/* 4 */__le32 checksum; /* crc32c(super_block) */
23/* 8 */__le32 features; /* (aka. feature_compat) */
24/* 12 */__u8 blkszbits; /* support block_size == PAGE_SIZE only */
25/* 13 */__u8 reserved;
26
27/* 14 */__le16 root_nid;
28/* 16 */__le64 inos; /* total valid ino # (== f_files - f_favail) */
29
30/* 24 */__le64 build_time; /* inode v1 time derivation */
31/* 32 */__le32 build_time_nsec;
32/* 36 */__le32 blocks; /* used for statfs */
33/* 40 */__le32 meta_blkaddr;
34/* 44 */__le32 xattr_blkaddr;
35/* 48 */__u8 uuid[16]; /* 128-bit uuid for volume */
36/* 64 */__u8 volume_name[16]; /* volume name */
37/* 80 */__le32 requirements; /* (aka. feature_incompat) */
38
39/* 84 */__u8 reserved2[44];
40} __packed; /* 128 bytes */
41
42/*
43 * erofs inode data mapping:
44 * 0 - inode plain without inline data A:
45 * inode, [xattrs], ... | ... | no-holed data
46 * 1 - inode VLE compression B (legacy):
47 * inode, [xattrs], extents ... | ...
48 * 2 - inode plain with inline data C:
49 * inode, [xattrs], last_inline_data, ... | ... | no-holed data
50 * 3 - inode compression D:
51 * inode, [xattrs], map_header, extents ... | ...
52 * 4~7 - reserved
53 */
54enum {
55 EROFS_INODE_FLAT_PLAIN,
56 EROFS_INODE_FLAT_COMPRESSION_LEGACY,
57 EROFS_INODE_FLAT_INLINE,
58 EROFS_INODE_FLAT_COMPRESSION,
59 EROFS_INODE_LAYOUT_MAX
60};
61
62static inline bool erofs_inode_is_data_compressed(unsigned int datamode)
63{
64 if (datamode == EROFS_INODE_FLAT_COMPRESSION)
65 return true;
66 return datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY;
67}
68
69/* bit definitions of inode i_advise */
70#define EROFS_I_VERSION_BITS 1
71#define EROFS_I_DATA_MAPPING_BITS 3
72
73#define EROFS_I_VERSION_BIT 0
74#define EROFS_I_DATA_MAPPING_BIT 1
75
76struct erofs_inode_v1 {
77/* 0 */__le16 i_advise;
78
79/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */
80/* 2 */__le16 i_xattr_icount;
81/* 4 */__le16 i_mode;
82/* 6 */__le16 i_nlink;
83/* 8 */__le32 i_size;
84/* 12 */__le32 i_reserved;
85/* 16 */union {
86 /* file total compressed blocks for data mapping 1 */
87 __le32 compressed_blocks;
88 __le32 raw_blkaddr;
89
90 /* for device files, used to indicate old/new device # */
91 __le32 rdev;
92 } i_u __packed;
93/* 20 */__le32 i_ino; /* only used for 32-bit stat compatibility */
94/* 24 */__le16 i_uid;
95/* 26 */__le16 i_gid;
96/* 28 */__le32 i_reserved2;
97} __packed;
98
99/* 32 bytes on-disk inode */
100#define EROFS_INODE_LAYOUT_V1 0
101/* 64 bytes on-disk inode */
102#define EROFS_INODE_LAYOUT_V2 1
103
104struct erofs_inode_v2 {
105/* 0 */__le16 i_advise;
106
107/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */
108/* 2 */__le16 i_xattr_icount;
109/* 4 */__le16 i_mode;
110/* 6 */__le16 i_reserved;
111/* 8 */__le64 i_size;
112/* 16 */union {
113 /* file total compressed blocks for data mapping 1 */
114 __le32 compressed_blocks;
115 __le32 raw_blkaddr;
116
117 /* for device files, used to indicate old/new device # */
118 __le32 rdev;
119 } i_u __packed;
120
121 /* only used for 32-bit stat compatibility */
122/* 20 */__le32 i_ino;
123
124/* 24 */__le32 i_uid;
125/* 28 */__le32 i_gid;
126/* 32 */__le64 i_ctime;
127/* 40 */__le32 i_ctime_nsec;
128/* 44 */__le32 i_nlink;
129/* 48 */__u8 i_reserved2[16];
130} __packed; /* 64 bytes */
131
132#define EROFS_MAX_SHARED_XATTRS (128)
133/* h_shared_count between 129 ... 255 are special # */
134#define EROFS_SHARED_XATTR_EXTENT (255)
135
136/*
137 * inline xattrs (n == i_xattr_icount):
138 * erofs_xattr_ibody_header(1) + (n - 1) * 4 bytes
139 * 12 bytes / \
140 * / \
141 * /-----------------------\
142 * | erofs_xattr_entries+ |
143 * +-----------------------+
144 * inline xattrs must start with erofs_xattr_ibody_header;
145 * for a read-only fs, there is no need to introduce h_refcount
146 */
147struct erofs_xattr_ibody_header {
148 __le32 h_reserved;
149 __u8 h_shared_count;
150 __u8 h_reserved2[7];
151 __le32 h_shared_xattrs[0]; /* shared xattr id array */
152} __packed;
153
154/* Name indexes */
155#define EROFS_XATTR_INDEX_USER 1
156#define EROFS_XATTR_INDEX_POSIX_ACL_ACCESS 2
157#define EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT 3
158#define EROFS_XATTR_INDEX_TRUSTED 4
159#define EROFS_XATTR_INDEX_LUSTRE 5
160#define EROFS_XATTR_INDEX_SECURITY 6
161
162/* xattr entry (for both inline & shared xattrs) */
163struct erofs_xattr_entry {
164 __u8 e_name_len; /* length of name */
165 __u8 e_name_index; /* attribute name index */
166 __le16 e_value_size; /* size of attribute value */
167 /* followed by e_name and e_value */
168 char e_name[0]; /* attribute name */
169} __packed;
170
171#define ondisk_xattr_ibody_size(count) ({\
172 u32 __count = le16_to_cpu(count); \
173 ((__count) == 0) ? 0 : \
174 sizeof(struct erofs_xattr_ibody_header) + \
175 sizeof(__u32) * ((__count) - 1); })
176
177#define EROFS_XATTR_ALIGN(size) round_up(size, sizeof(struct erofs_xattr_entry))
178#define EROFS_XATTR_ENTRY_SIZE(entry) EROFS_XATTR_ALIGN( \
179 sizeof(struct erofs_xattr_entry) + \
180 (entry)->e_name_len + le16_to_cpu((entry)->e_value_size))
181
182/* available compression algorithm types */
183enum {
184 Z_EROFS_COMPRESSION_LZ4,
185 Z_EROFS_COMPRESSION_MAX
186};
187
188/*
189 * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on)
190 * e.g. for 4k logical cluster size, 4B if compacted 2B is off;
191 * (4B) + 2B + (4B) if compacted 2B is on.
192 */
193#define Z_EROFS_ADVISE_COMPACTED_2B_BIT 0
194
195#define Z_EROFS_ADVISE_COMPACTED_2B (1 << Z_EROFS_ADVISE_COMPACTED_2B_BIT)
196
197struct z_erofs_map_header {
198 __le32 h_reserved1;
199 __le16 h_advise;
200 /*
201 * bit 0-3 : algorithm type of head 1 (logical cluster type 01);
202 * bit 4-7 : algorithm type of head 2 (logical cluster type 11).
203 */
204 __u8 h_algorithmtype;
205 /*
206 * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096;
207 * bit 3-4 : (physical - logical) cluster bits of head 1:
208 * For example, if logical clustersize = 4096, 1 for 8192.
209 * bit 5-7 : (physical - logical) cluster bits of head 2.
210 */
211 __u8 h_clusterbits;
212};
213
214#define Z_EROFS_VLE_LEGACY_HEADER_PADDING 8
215
216/*
217 * Z_EROFS Variable-sized Logical Extent cluster type:
218 * 0 - literal (uncompressed) cluster
219 * 1 - compressed cluster (for the head logical cluster)
220 * 2 - compressed cluster (for the other logical clusters)
221 *
222 * In detail,
223 * 0 - literal (uncompressed) cluster,
224 * di_advise = 0
225 * di_clusterofs = the literal data offset of the cluster
226 * di_blkaddr = the blkaddr of the literal cluster
227 *
228 * 1 - compressed cluster (for the head logical cluster)
229 * di_advise = 1
230 * di_clusterofs = the decompressed data offset of the cluster
231 * di_blkaddr = the blkaddr of the compressed cluster
232 *
233 * 2 - compressed cluster (for the other logical clusters)
234 * di_advise = 2
235 * di_clusterofs =
236 * the decompressed data offset in its own head cluster
237 * di_u.delta[0] = distance to its corresponding head cluster
238 * di_u.delta[1] = distance to its corresponding tail cluster
239 * (di_advise could be 0, 1 or 2)
240 */
241enum {
242 Z_EROFS_VLE_CLUSTER_TYPE_PLAIN,
243 Z_EROFS_VLE_CLUSTER_TYPE_HEAD,
244 Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD,
245 Z_EROFS_VLE_CLUSTER_TYPE_RESERVED,
246 Z_EROFS_VLE_CLUSTER_TYPE_MAX
247};
248
249#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS 2
250#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT 0
251
252struct z_erofs_vle_decompressed_index {
253 __le16 di_advise;
254 /* where to decompress in the head cluster */
255 __le16 di_clusterofs;
256
257 union {
258 /* for the head cluster */
259 __le32 blkaddr;
260 /*
261		 * for the other clusters,
262		 * e.g. for a 4k page-sized cluster, maximum 4K*64k = 256M
263 * [0] - pointing to the head cluster
264 * [1] - pointing to the tail cluster
265 */
266 __le16 delta[2];
267 } di_u __packed; /* 8 bytes */
268} __packed;
269
270#define Z_EROFS_VLE_LEGACY_INDEX_ALIGN(size) \
271 (round_up(size, sizeof(struct z_erofs_vle_decompressed_index)) + \
272 sizeof(struct z_erofs_map_header) + Z_EROFS_VLE_LEGACY_HEADER_PADDING)
273
274/* dirents are sorted in alphabetical order, thus we can do binary search */
275struct erofs_dirent {
276 __le64 nid; /* 0, node number */
277 __le16 nameoff; /* 8, start offset of file name */
278 __u8 file_type; /* 10, file type */
279 __u8 reserved; /* 11, reserved */
280} __packed;
281
282/*
283 * EROFS file types should match generic FT_* types and
284 * there seems to be no need to add BUILD_BUG_ONs since a potential
285 * mismatch would break other fses as well...
286 */
287
288#define EROFS_NAME_LEN 255
289
290/* check the EROFS on-disk layout strictly at compile time */
291static inline void erofs_check_ondisk_layout_definitions(void)
292{
293 BUILD_BUG_ON(sizeof(struct erofs_super_block) != 128);
294 BUILD_BUG_ON(sizeof(struct erofs_inode_v1) != 32);
295 BUILD_BUG_ON(sizeof(struct erofs_inode_v2) != 64);
296 BUILD_BUG_ON(sizeof(struct erofs_xattr_ibody_header) != 12);
297 BUILD_BUG_ON(sizeof(struct erofs_xattr_entry) != 4);
298 BUILD_BUG_ON(sizeof(struct z_erofs_map_header) != 8);
299 BUILD_BUG_ON(sizeof(struct z_erofs_vle_decompressed_index) != 8);
300 BUILD_BUG_ON(sizeof(struct erofs_dirent) != 12);
301
302 BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) <
303 Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1);
304}
305
306#endif
307
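For reference, the Z_EROFS_VLE_DI_CLUSTER_TYPE_* bits declared above are intended to be decoded roughly as follows; vle_cluster_type() is a hypothetical name, and the actual consumers live in zmap.c/zdata.c, which belong to this commit but are not shown in this section:

/* sketch: extract the logical cluster type encoded in di_advise */
static unsigned int
vle_cluster_type(const struct z_erofs_vle_decompressed_index *di)
{
	return (le16_to_cpu(di->di_advise) >> Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT) &
	       ((1 << Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) - 1);
}

A result of Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD then tells the mapper to follow di_u.delta[0]/[1] back to the corresponding head and tail clusters, as described in the comment block above.
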
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
new file mode 100644
index 000000000000..80f4fe919ee7
--- /dev/null
+++ b/fs/erofs/inode.c
@@ -0,0 +1,332 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2017-2018 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#include "xattr.h"
8
9#include <trace/events/erofs.h>
10
11/* no locking */
12static int read_inode(struct inode *inode, void *data)
13{
14 struct erofs_vnode *vi = EROFS_V(inode);
15 struct erofs_inode_v1 *v1 = data;
16 const unsigned int advise = le16_to_cpu(v1->i_advise);
17 erofs_blk_t nblks = 0;
18
19 vi->datamode = __inode_data_mapping(advise);
20
21 if (unlikely(vi->datamode >= EROFS_INODE_LAYOUT_MAX)) {
22 errln("unsupported data mapping %u of nid %llu",
23 vi->datamode, vi->nid);
24 DBG_BUGON(1);
25 return -EOPNOTSUPP;
26 }
27
28 if (__inode_version(advise) == EROFS_INODE_LAYOUT_V2) {
29 struct erofs_inode_v2 *v2 = data;
30
31 vi->inode_isize = sizeof(struct erofs_inode_v2);
32 vi->xattr_isize = ondisk_xattr_ibody_size(v2->i_xattr_icount);
33
34 inode->i_mode = le16_to_cpu(v2->i_mode);
35 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
36 S_ISLNK(inode->i_mode))
37 vi->raw_blkaddr = le32_to_cpu(v2->i_u.raw_blkaddr);
38 else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
39 inode->i_rdev =
40 new_decode_dev(le32_to_cpu(v2->i_u.rdev));
41 else if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode))
42 inode->i_rdev = 0;
43 else
44 goto bogusimode;
45
46 i_uid_write(inode, le32_to_cpu(v2->i_uid));
47 i_gid_write(inode, le32_to_cpu(v2->i_gid));
48 set_nlink(inode, le32_to_cpu(v2->i_nlink));
49
50 /* ns timestamp */
51 inode->i_mtime.tv_sec = inode->i_ctime.tv_sec =
52 le64_to_cpu(v2->i_ctime);
53 inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec =
54 le32_to_cpu(v2->i_ctime_nsec);
55
56 inode->i_size = le64_to_cpu(v2->i_size);
57
58 /* total blocks for compressed files */
59 if (is_inode_layout_compression(inode))
60 nblks = le32_to_cpu(v2->i_u.compressed_blocks);
61 } else if (__inode_version(advise) == EROFS_INODE_LAYOUT_V1) {
62 struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
63
64 vi->inode_isize = sizeof(struct erofs_inode_v1);
65 vi->xattr_isize = ondisk_xattr_ibody_size(v1->i_xattr_icount);
66
67 inode->i_mode = le16_to_cpu(v1->i_mode);
68 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
69 S_ISLNK(inode->i_mode))
70 vi->raw_blkaddr = le32_to_cpu(v1->i_u.raw_blkaddr);
71 else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
72 inode->i_rdev =
73 new_decode_dev(le32_to_cpu(v1->i_u.rdev));
74 else if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode))
75 inode->i_rdev = 0;
76 else
77 goto bogusimode;
78
79 i_uid_write(inode, le16_to_cpu(v1->i_uid));
80 i_gid_write(inode, le16_to_cpu(v1->i_gid));
81 set_nlink(inode, le16_to_cpu(v1->i_nlink));
82
83 /* use build time to derive all file time */
84 inode->i_mtime.tv_sec = inode->i_ctime.tv_sec =
85 sbi->build_time;
86 inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec =
87 sbi->build_time_nsec;
88
89 inode->i_size = le32_to_cpu(v1->i_size);
90 if (is_inode_layout_compression(inode))
91 nblks = le32_to_cpu(v1->i_u.compressed_blocks);
92 } else {
93 errln("unsupported on-disk inode version %u of nid %llu",
94 __inode_version(advise), vi->nid);
95 DBG_BUGON(1);
96 return -EOPNOTSUPP;
97 }
98
99 if (!nblks)
100		/* measure inode.i_blocks as generic filesystems do */
101 inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9;
102 else
103 inode->i_blocks = nblks << LOG_SECTORS_PER_BLOCK;
104 return 0;
105
106bogusimode:
107 errln("bogus i_mode (%o) @ nid %llu", inode->i_mode, vi->nid);
108 DBG_BUGON(1);
109 return -EFSCORRUPTED;
110}
111
112/*
113 * try_lock can be required since locking order is:
114 * file data(fs_inode)
115 * meta(bd_inode)
116 * but the majority of the callers are "iget";
117 * in that case we are pretty sure there is no deadlock since
118 * no data operations exist. However, I tend to
119 * use try_lock since it adds little overhead and
120 * will succeed immediately.
121 */
122static int fill_inline_data(struct inode *inode, void *data,
123 unsigned int m_pofs)
124{
125 struct erofs_vnode *vi = EROFS_V(inode);
126 struct erofs_sb_info *sbi = EROFS_I_SB(inode);
127
128 /* should be inode inline C */
129 if (!is_inode_flat_inline(inode))
130 return 0;
131
132 /* fast symlink (following ext4) */
133 if (S_ISLNK(inode->i_mode) && inode->i_size < PAGE_SIZE) {
134 char *lnk = erofs_kmalloc(sbi, inode->i_size + 1, GFP_KERNEL);
135
136 if (unlikely(!lnk))
137 return -ENOMEM;
138
139 m_pofs += vi->inode_isize + vi->xattr_isize;
140
141		/* inline symlink data shouldn't cross the page boundary either */
142 if (unlikely(m_pofs + inode->i_size > PAGE_SIZE)) {
143 kfree(lnk);
144 errln("inline data cross block boundary @ nid %llu",
145 vi->nid);
146 DBG_BUGON(1);
147 return -EFSCORRUPTED;
148 }
149
150 /* get in-page inline data */
151 memcpy(lnk, data + m_pofs, inode->i_size);
152 lnk[inode->i_size] = '\0';
153
154 inode->i_link = lnk;
155 set_inode_fast_symlink(inode);
156 }
157 return 0;
158}
159
160static int fill_inode(struct inode *inode, int isdir)
161{
162 struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
163 struct erofs_vnode *vi = EROFS_V(inode);
164 struct page *page;
165 void *data;
166 int err;
167 erofs_blk_t blkaddr;
168 unsigned int ofs;
169 erofs_off_t inode_loc;
170
171 trace_erofs_fill_inode(inode, isdir);
172 inode_loc = iloc(sbi, vi->nid);
173 blkaddr = erofs_blknr(inode_loc);
174 ofs = erofs_blkoff(inode_loc);
175
176 debugln("%s, reading inode nid %llu at %u of blkaddr %u",
177 __func__, vi->nid, ofs, blkaddr);
178
179 page = erofs_get_meta_page(inode->i_sb, blkaddr, isdir);
180
181 if (IS_ERR(page)) {
182 errln("failed to get inode (nid: %llu) page, err %ld",
183 vi->nid, PTR_ERR(page));
184 return PTR_ERR(page);
185 }
186
187 DBG_BUGON(!PageUptodate(page));
188 data = page_address(page);
189
190 err = read_inode(inode, data + ofs);
191 if (!err) {
192 /* setup the new inode */
193 if (S_ISREG(inode->i_mode)) {
194 inode->i_op = &erofs_generic_iops;
195 inode->i_fop = &generic_ro_fops;
196 } else if (S_ISDIR(inode->i_mode)) {
197 inode->i_op = &erofs_dir_iops;
198 inode->i_fop = &erofs_dir_fops;
199 } else if (S_ISLNK(inode->i_mode)) {
200 /* by default, page_get_link is used for symlink */
201 inode->i_op = &erofs_symlink_iops;
202 inode_nohighmem(inode);
203 } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
204 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
205 inode->i_op = &erofs_generic_iops;
206 init_special_inode(inode, inode->i_mode, inode->i_rdev);
207 goto out_unlock;
208 } else {
209 err = -EFSCORRUPTED;
210 goto out_unlock;
211 }
212
213 if (is_inode_layout_compression(inode)) {
214 err = z_erofs_fill_inode(inode);
215 goto out_unlock;
216 }
217
218 inode->i_mapping->a_ops = &erofs_raw_access_aops;
219
220 /* fill last page if inline data is available */
221 err = fill_inline_data(inode, data, ofs);
222 }
223
224out_unlock:
225 unlock_page(page);
226 put_page(page);
227 return err;
228}
229
230/*
231 * erofs nid is 64 bits, but i_ino is 'unsigned long', therefore
232 * we should do more for 32-bit platforms to find the right inode.
233 */
234#if BITS_PER_LONG == 32
235static int erofs_ilookup_test_actor(struct inode *inode, void *opaque)
236{
237 const erofs_nid_t nid = *(erofs_nid_t *)opaque;
238
239 return EROFS_V(inode)->nid == nid;
240}
241
242static int erofs_iget_set_actor(struct inode *inode, void *opaque)
243{
244 const erofs_nid_t nid = *(erofs_nid_t *)opaque;
245
246 inode->i_ino = erofs_inode_hash(nid);
247 return 0;
248}
249#endif
250
251static inline struct inode *erofs_iget_locked(struct super_block *sb,
252 erofs_nid_t nid)
253{
254 const unsigned long hashval = erofs_inode_hash(nid);
255
256#if BITS_PER_LONG >= 64
257 /* it is safe to use iget_locked for >= 64-bit platform */
258 return iget_locked(sb, hashval);
259#else
260 return iget5_locked(sb, hashval, erofs_ilookup_test_actor,
261 erofs_iget_set_actor, &nid);
262#endif
263}
264
265struct inode *erofs_iget(struct super_block *sb,
266 erofs_nid_t nid,
267 bool isdir)
268{
269 struct inode *inode = erofs_iget_locked(sb, nid);
270
271 if (unlikely(!inode))
272 return ERR_PTR(-ENOMEM);
273
274 if (inode->i_state & I_NEW) {
275 int err;
276 struct erofs_vnode *vi = EROFS_V(inode);
277
278 vi->nid = nid;
279
280 err = fill_inode(inode, isdir);
281 if (likely(!err))
282 unlock_new_inode(inode);
283 else {
284 iget_failed(inode);
285 inode = ERR_PTR(err);
286 }
287 }
288 return inode;
289}
290
291int erofs_getattr(const struct path *path, struct kstat *stat,
292 u32 request_mask, unsigned int query_flags)
293{
294 struct inode *const inode = d_inode(path->dentry);
295
296 if (is_inode_layout_compression(inode))
297 stat->attributes |= STATX_ATTR_COMPRESSED;
298
299 stat->attributes |= STATX_ATTR_IMMUTABLE;
300 stat->attributes_mask |= (STATX_ATTR_COMPRESSED |
301 STATX_ATTR_IMMUTABLE);
302
303 generic_fillattr(inode, stat);
304 return 0;
305}
306
307const struct inode_operations erofs_generic_iops = {
308 .getattr = erofs_getattr,
309#ifdef CONFIG_EROFS_FS_XATTR
310 .listxattr = erofs_listxattr,
311#endif
312 .get_acl = erofs_get_acl,
313};
314
315const struct inode_operations erofs_symlink_iops = {
316 .get_link = page_get_link,
317 .getattr = erofs_getattr,
318#ifdef CONFIG_EROFS_FS_XATTR
319 .listxattr = erofs_listxattr,
320#endif
321 .get_acl = erofs_get_acl,
322};
323
324const struct inode_operations erofs_fast_symlink_iops = {
325 .get_link = simple_get_link,
326 .getattr = erofs_getattr,
327#ifdef CONFIG_EROFS_FS_XATTR
328 .listxattr = erofs_listxattr,
329#endif
330 .get_acl = erofs_get_acl,
331};
332
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
new file mode 100644
index 000000000000..620b73fcc416
--- /dev/null
+++ b/fs/erofs/internal.h
@@ -0,0 +1,553 @@
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * Copyright (C) 2017-2018 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#ifndef __EROFS_INTERNAL_H
8#define __EROFS_INTERNAL_H
9
10#include <linux/fs.h>
11#include <linux/dcache.h>
12#include <linux/mm.h>
13#include <linux/pagemap.h>
14#include <linux/bio.h>
15#include <linux/buffer_head.h>
16#include <linux/magic.h>
17#include <linux/slab.h>
18#include <linux/vmalloc.h>
19#include "erofs_fs.h"
20
21/* redefine pr_fmt "erofs: " */
22#undef pr_fmt
23#define pr_fmt(fmt) "erofs: " fmt
24
25#define errln(x, ...) pr_err(x "\n", ##__VA_ARGS__)
26#define infoln(x, ...) pr_info(x "\n", ##__VA_ARGS__)
27#ifdef CONFIG_EROFS_FS_DEBUG
28#define debugln(x, ...) pr_debug(x "\n", ##__VA_ARGS__)
29#define DBG_BUGON BUG_ON
30#else
31#define debugln(x, ...) ((void)0)
32#define DBG_BUGON(x) ((void)(x))
33#endif /* !CONFIG_EROFS_FS_DEBUG */
34
35enum {
36 FAULT_KMALLOC,
37 FAULT_READ_IO,
38 FAULT_MAX,
39};
40
41#ifdef CONFIG_EROFS_FAULT_INJECTION
42extern const char *erofs_fault_name[FAULT_MAX];
43#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type)))
44
45struct erofs_fault_info {
46 atomic_t inject_ops;
47 unsigned int inject_rate;
48 unsigned int inject_type;
49};
50#endif /* CONFIG_EROFS_FAULT_INJECTION */
51
52/* EROFS_SUPER_MAGIC_V1 to represent the whole file system */
53#define EROFS_SUPER_MAGIC EROFS_SUPER_MAGIC_V1
54
55typedef u64 erofs_nid_t;
56typedef u64 erofs_off_t;
57/* data type for filesystem-wide blocks number */
58typedef u32 erofs_blk_t;
59
60struct erofs_sb_info {
61#ifdef CONFIG_EROFS_FS_ZIP
62 /* list for all registered superblocks, mainly for shrinker */
63 struct list_head list;
64 struct mutex umount_mutex;
65
66 /* the dedicated workstation for compression */
67 struct radix_tree_root workstn_tree;
68
69	/* threshold for synchronous decompression */
70 unsigned int max_sync_decompress_pages;
71
72 unsigned int shrinker_run_no;
73
74 /* current strategy of how to use managed cache */
75 unsigned char cache_strategy;
76
77 /* pseudo inode to manage cached pages */
78 struct inode *managed_cache;
79#endif /* CONFIG_EROFS_FS_ZIP */
80 u32 blocks;
81 u32 meta_blkaddr;
82#ifdef CONFIG_EROFS_FS_XATTR
83 u32 xattr_blkaddr;
84#endif
85
86 /* inode slot unit size in bit shift */
87 unsigned char islotbits;
88
89 u32 build_time_nsec;
90 u64 build_time;
91
92	/* what we really care about is nid, rather than ino.. */
93 erofs_nid_t root_nid;
94 /* used for statfs, f_files - f_favail */
95 u64 inos;
96
97 u8 uuid[16]; /* 128-bit uuid for volume */
98 u8 volume_name[16]; /* volume name */
99 u32 requirements;
100
101 unsigned int mount_opt;
102
103#ifdef CONFIG_EROFS_FAULT_INJECTION
104 struct erofs_fault_info fault_info; /* For fault injection */
105#endif
106};
107
108#ifdef CONFIG_EROFS_FAULT_INJECTION
109#define erofs_show_injection_info(type) \
110 infoln("inject %s in %s of %pS", erofs_fault_name[type], \
111 __func__, __builtin_return_address(0))
112
113static inline bool time_to_inject(struct erofs_sb_info *sbi, int type)
114{
115 struct erofs_fault_info *ffi = &sbi->fault_info;
116
117 if (!ffi->inject_rate)
118 return false;
119
120 if (!IS_FAULT_SET(ffi, type))
121 return false;
122
123 atomic_inc(&ffi->inject_ops);
124 if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) {
125 atomic_set(&ffi->inject_ops, 0);
126 return true;
127 }
128 return false;
129}
130#else
131static inline bool time_to_inject(struct erofs_sb_info *sbi, int type)
132{
133 return false;
134}
135
136static inline void erofs_show_injection_info(int type)
137{
138}
139#endif /* !CONFIG_EROFS_FAULT_INJECTION */
140
141static inline void *erofs_kmalloc(struct erofs_sb_info *sbi,
142 size_t size, gfp_t flags)
143{
144 if (time_to_inject(sbi, FAULT_KMALLOC)) {
145 erofs_show_injection_info(FAULT_KMALLOC);
146 return NULL;
147 }
148 return kmalloc(size, flags);
149}
150
151#define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info)
152#define EROFS_I_SB(inode) ((struct erofs_sb_info *)(inode)->i_sb->s_fs_info)
153
154/* Mount flags set via mount options or defaults */
155#define EROFS_MOUNT_XATTR_USER 0x00000010
156#define EROFS_MOUNT_POSIX_ACL 0x00000020
157#define EROFS_MOUNT_FAULT_INJECTION 0x00000040
158
159#define clear_opt(sbi, option) ((sbi)->mount_opt &= ~EROFS_MOUNT_##option)
160#define set_opt(sbi, option) ((sbi)->mount_opt |= EROFS_MOUNT_##option)
161#define test_opt(sbi, option) ((sbi)->mount_opt & EROFS_MOUNT_##option)
162
163#ifdef CONFIG_EROFS_FS_ZIP
164enum {
165 EROFS_ZIP_CACHE_DISABLED,
166 EROFS_ZIP_CACHE_READAHEAD,
167 EROFS_ZIP_CACHE_READAROUND
168};
169
170#define EROFS_LOCKED_MAGIC (INT_MIN | 0xE0F510CCL)
171
172/* basic unit of the workstation of a super_block */
173struct erofs_workgroup {
174 /* the workgroup index in the workstation */
175 pgoff_t index;
176
177 /* overall workgroup reference count */
178 atomic_t refcount;
179};
180
181#if defined(CONFIG_SMP)
182static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp,
183 int val)
184{
185 preempt_disable();
186 if (val != atomic_cmpxchg(&grp->refcount, val, EROFS_LOCKED_MAGIC)) {
187 preempt_enable();
188 return false;
189 }
190 return true;
191}
192
193static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp,
194 int orig_val)
195{
196 /*
197 * other observers should notice all modifications
198 * in the freezing period.
199 */
200 smp_mb();
201 atomic_set(&grp->refcount, orig_val);
202 preempt_enable();
203}
204
205static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
206{
207 return atomic_cond_read_relaxed(&grp->refcount,
208 VAL != EROFS_LOCKED_MAGIC);
209}
210#else
211static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp,
212 int val)
213{
214 preempt_disable();
215 /* no need to spin on UP platforms, let's just disable preemption. */
216 if (val != atomic_read(&grp->refcount)) {
217 preempt_enable();
218 return false;
219 }
220 return true;
221}
222
223static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp,
224 int orig_val)
225{
226 preempt_enable();
227}
228
229static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
230{
231 int v = atomic_read(&grp->refcount);
232
233	/* workgroup is never frozen on uniprocessor systems */
234 DBG_BUGON(v == EROFS_LOCKED_MAGIC);
235 return v;
236}
237#endif /* !CONFIG_SMP */
238
239/* hard limit of pages per compressed cluster */
240#define Z_EROFS_CLUSTER_MAX_PAGES (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
241#define EROFS_PCPUBUF_NR_PAGES Z_EROFS_CLUSTER_MAX_PAGES
242#else
243#define EROFS_PCPUBUF_NR_PAGES 0
244#endif /* !CONFIG_EROFS_FS_ZIP */
245
246/* we strictly follow PAGE_SIZE and no buffer head yet */
247#define LOG_BLOCK_SIZE PAGE_SHIFT
248
249#undef LOG_SECTORS_PER_BLOCK
250#define LOG_SECTORS_PER_BLOCK (PAGE_SHIFT - 9)
251
252#undef SECTORS_PER_BLOCK
253#define SECTORS_PER_BLOCK	(1 << LOG_SECTORS_PER_BLOCK)
254
255#define EROFS_BLKSIZ (1 << LOG_BLOCK_SIZE)
256
257#if (EROFS_BLKSIZ % 4096 || !EROFS_BLKSIZ)
258#error erofs cannot be used on this platform
259#endif
260
261#define EROFS_IO_MAX_RETRIES_NOFAIL 5
262
263#define ROOT_NID(sb) ((sb)->root_nid)
264
265#define erofs_blknr(addr) ((addr) / EROFS_BLKSIZ)
266#define erofs_blkoff(addr) ((addr) % EROFS_BLKSIZ)
267#define blknr_to_addr(nr) ((erofs_off_t)(nr) * EROFS_BLKSIZ)
268
269static inline erofs_off_t iloc(struct erofs_sb_info *sbi, erofs_nid_t nid)
270{
271 return blknr_to_addr(sbi->meta_blkaddr) + (nid << sbi->islotbits);
272}
273
274/* atomic flag definitions */
275#define EROFS_V_EA_INITED_BIT 0
276#define EROFS_V_Z_INITED_BIT 1
277
278/* bitlock definitions (arranged in reverse order) */
279#define EROFS_V_BL_XATTR_BIT (BITS_PER_LONG - 1)
280#define EROFS_V_BL_Z_BIT (BITS_PER_LONG - 2)
281
282struct erofs_vnode {
283 erofs_nid_t nid;
284
285 /* atomic flags (including bitlocks) */
286 unsigned long flags;
287
288 unsigned char datamode;
289 unsigned char inode_isize;
290 unsigned short xattr_isize;
291
292 unsigned int xattr_shared_count;
293 unsigned int *xattr_shared_xattrs;
294
295 union {
296 erofs_blk_t raw_blkaddr;
297#ifdef CONFIG_EROFS_FS_ZIP
298 struct {
299 unsigned short z_advise;
300 unsigned char z_algorithmtype[2];
301 unsigned char z_logical_clusterbits;
302 unsigned char z_physical_clusterbits[2];
303 };
304#endif /* CONFIG_EROFS_FS_ZIP */
305 };
306 /* the corresponding vfs inode */
307 struct inode vfs_inode;
308};
309
310#define EROFS_V(ptr) \
311 container_of(ptr, struct erofs_vnode, vfs_inode)
312
313#define __inode_advise(x, bit, bits) \
314 (((x) >> (bit)) & ((1 << (bits)) - 1))
315
316#define __inode_version(advise) \
317 __inode_advise(advise, EROFS_I_VERSION_BIT, \
318 EROFS_I_VERSION_BITS)
319
320#define __inode_data_mapping(advise) \
321 __inode_advise(advise, EROFS_I_DATA_MAPPING_BIT,\
322 EROFS_I_DATA_MAPPING_BITS)
323
324static inline unsigned long inode_datablocks(struct inode *inode)
325{
326 /* since i_size cannot be changed */
327 return DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ);
328}
329
330static inline bool is_inode_layout_compression(struct inode *inode)
331{
332 return erofs_inode_is_data_compressed(EROFS_V(inode)->datamode);
333}
334
335static inline bool is_inode_flat_inline(struct inode *inode)
336{
337 return EROFS_V(inode)->datamode == EROFS_INODE_FLAT_INLINE;
338}
339
340extern const struct super_operations erofs_sops;
341
342extern const struct address_space_operations erofs_raw_access_aops;
343#ifdef CONFIG_EROFS_FS_ZIP
344extern const struct address_space_operations z_erofs_vle_normalaccess_aops;
345#endif
346
347/*
348 * Logical to physical block mapping, used by erofs_map_blocks()
349 *
350 * Different from other file systems, it is used for two access modes:
351 *
352 * 1) RAW access mode:
353 *
354 * Users pass a valid (m_lblk, m_lofs -- usually 0) pair,
355 * and get the valid m_pblk, m_pofs and the longest m_len(in bytes).
356 *
357 * Note that m_lblk in the RAW access mode refers to the number of
358 * the compressed ondisk block rather than the uncompressed
359 * in-memory block for the compressed file.
360 *
361 * m_pofs equals m_lofs except for the inline data page.
362 *
363 * 2) Normal access mode:
364 *
365 * If the inode is not compressed, there is no difference from
366 * the RAW access mode. However, if the inode is compressed,
367 * users should pass a valid (m_lblk, m_lofs) pair, and get
368 * the needed m_pblk, m_pofs, m_len to get the compressed data
369 * and the updated m_lblk, m_lofs which indicate the start
370 * of the corresponding uncompressed data in the file.
371 */
372enum {
373 BH_Zipped = BH_PrivateStart,
374 BH_FullMapped,
375};
376
377/* Has a disk mapping */
378#define EROFS_MAP_MAPPED (1 << BH_Mapped)
379/* Located in metadata (could be copied from bd_inode) */
380#define EROFS_MAP_META (1 << BH_Meta)
381/* The extent has been compressed */
382#define EROFS_MAP_ZIPPED (1 << BH_Zipped)
383/* The length of extent is full */
384#define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped)
385
386struct erofs_map_blocks {
387 erofs_off_t m_pa, m_la;
388 u64 m_plen, m_llen;
389
390 unsigned int m_flags;
391
392 struct page *mpage;
393};
394
395/* Flags used by erofs_map_blocks() */
396#define EROFS_GET_BLOCKS_RAW 0x0001
397
398/* zmap.c */
399#ifdef CONFIG_EROFS_FS_ZIP
400int z_erofs_fill_inode(struct inode *inode);
401int z_erofs_map_blocks_iter(struct inode *inode,
402 struct erofs_map_blocks *map,
403 int flags);
404#else
405static inline int z_erofs_fill_inode(struct inode *inode) { return -EOPNOTSUPP; }
406static inline int z_erofs_map_blocks_iter(struct inode *inode,
407 struct erofs_map_blocks *map,
408 int flags)
409{
410 return -EOPNOTSUPP;
411}
412#endif /* !CONFIG_EROFS_FS_ZIP */
413
414/* data.c */
415static inline struct bio *erofs_grab_bio(struct super_block *sb,
416 erofs_blk_t blkaddr,
417 unsigned int nr_pages,
418 void *bi_private, bio_end_io_t endio,
419 bool nofail)
420{
421 const gfp_t gfp = GFP_NOIO;
422 struct bio *bio;
423
424 do {
425 if (nr_pages == 1) {
426 bio = bio_alloc(gfp | (nofail ? __GFP_NOFAIL : 0), 1);
427 if (unlikely(!bio)) {
428 DBG_BUGON(nofail);
429 return ERR_PTR(-ENOMEM);
430 }
431 break;
432 }
433 bio = bio_alloc(gfp, nr_pages);
434 nr_pages /= 2;
435 } while (unlikely(!bio));
436
437 bio->bi_end_io = endio;
438 bio_set_dev(bio, sb->s_bdev);
439 bio->bi_iter.bi_sector = (sector_t)blkaddr << LOG_SECTORS_PER_BLOCK;
440 bio->bi_private = bi_private;
441 return bio;
442}
443
444static inline void __submit_bio(struct bio *bio, unsigned int op,
445 unsigned int op_flags)
446{
447 bio_set_op_attrs(bio, op, op_flags);
448 submit_bio(bio);
449}
450
451struct page *__erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr,
452 bool prio, bool nofail);
453
454static inline struct page *erofs_get_meta_page(struct super_block *sb,
455 erofs_blk_t blkaddr, bool prio)
456{
457 return __erofs_get_meta_page(sb, blkaddr, prio, false);
458}
459
460int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int);
461
462static inline struct page *erofs_get_inline_page(struct inode *inode,
463 erofs_blk_t blkaddr)
464{
465 return erofs_get_meta_page(inode->i_sb, blkaddr,
466 S_ISDIR(inode->i_mode));
467}
468
469/* inode.c */
470static inline unsigned long erofs_inode_hash(erofs_nid_t nid)
471{
472#if BITS_PER_LONG == 32
473 return (nid >> 32) ^ (nid & 0xffffffff);
474#else
475 return nid;
476#endif
477}
478
479extern const struct inode_operations erofs_generic_iops;
480extern const struct inode_operations erofs_symlink_iops;
481extern const struct inode_operations erofs_fast_symlink_iops;
482
483static inline void set_inode_fast_symlink(struct inode *inode)
484{
485 inode->i_op = &erofs_fast_symlink_iops;
486}
487
488static inline bool is_inode_fast_symlink(struct inode *inode)
489{
490 return inode->i_op == &erofs_fast_symlink_iops;
491}
492
493struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid, bool dir);
494int erofs_getattr(const struct path *path, struct kstat *stat,
495 u32 request_mask, unsigned int query_flags);
496
497/* namei.c */
498extern const struct inode_operations erofs_dir_iops;
499
500int erofs_namei(struct inode *dir, struct qstr *name,
501 erofs_nid_t *nid, unsigned int *d_type);
502
503/* dir.c */
504extern const struct file_operations erofs_dir_fops;
505
506/* utils.c / zdata.c */
507struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp, bool nofail);
508
509#if (EROFS_PCPUBUF_NR_PAGES > 0)
510void *erofs_get_pcpubuf(unsigned int pagenr);
511#define erofs_put_pcpubuf(buf) do { \
512 (void)&(buf); \
513 preempt_enable(); \
514} while (0)
515#else
516static inline void *erofs_get_pcpubuf(unsigned int pagenr)
517{
518 return ERR_PTR(-EOPNOTSUPP);
519}
520
521#define erofs_put_pcpubuf(buf) do {} while (0)
522#endif
523
524#ifdef CONFIG_EROFS_FS_ZIP
525int erofs_workgroup_put(struct erofs_workgroup *grp);
526struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
527 pgoff_t index, bool *tag);
528int erofs_register_workgroup(struct super_block *sb,
529 struct erofs_workgroup *grp, bool tag);
530void erofs_workgroup_free_rcu(struct erofs_workgroup *grp);
531void erofs_shrinker_register(struct super_block *sb);
532void erofs_shrinker_unregister(struct super_block *sb);
533int __init erofs_init_shrinker(void);
534void erofs_exit_shrinker(void);
535int __init z_erofs_init_zip_subsystem(void);
536void z_erofs_exit_zip_subsystem(void);
537int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
538 struct erofs_workgroup *egrp);
539int erofs_try_to_free_cached_page(struct address_space *mapping,
540 struct page *page);
541#else
542static inline void erofs_shrinker_register(struct super_block *sb) {}
543static inline void erofs_shrinker_unregister(struct super_block *sb) {}
544static inline int erofs_init_shrinker(void) { return 0; }
545static inline void erofs_exit_shrinker(void) {}
546static inline int z_erofs_init_zip_subsystem(void) { return 0; }
547static inline void z_erofs_exit_zip_subsystem(void) {}
548#endif /* !CONFIG_EROFS_FS_ZIP */
549
550#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
551
552#endif /* __EROFS_INTERNAL_H */
553
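
A note on the addressing helpers in this header: erofs_blknr()/erofs_blkoff() split a byte address into a block number and an in-block offset, and iloc() locates an on-disk inode by shifting its nid by islotbits into the metadata area. The stand-alone sketch below replays the same arithmetic in user space; the 4KiB block size and the 32-byte inode slot (islotbits == 5) are assumptions chosen for illustration, not values taken from this header.

/* illustrative user-space replay of the block/inode addressing math */
#include <stdint.h>
#include <stdio.h>

#define EROFS_BLKSIZ      4096u                 /* assumes PAGE_SIZE == 4096 */
#define erofs_blknr(a)    ((a) / EROFS_BLKSIZ)
#define erofs_blkoff(a)   ((a) % EROFS_BLKSIZ)
#define blknr_to_addr(nr) ((uint64_t)(nr) * EROFS_BLKSIZ)

int main(void)
{
	uint32_t meta_blkaddr = 2;    /* hypothetical superblock field */
	unsigned int islotbits = 5;   /* assumed 32-byte inode slots */
	uint64_t nid = 36;            /* hypothetical inode number */

	/* iloc(): byte address of inode 'nid' inside the metadata area */
	uint64_t addr = blknr_to_addr(meta_blkaddr) + (nid << islotbits);

	printf("nid %llu -> blk %llu, off %llu\n",
	       (unsigned long long)nid,
	       (unsigned long long)erofs_blknr(addr),
	       (unsigned long long)erofs_blkoff(addr));
	return 0;	/* prints: nid 36 -> blk 2, off 1152 */
}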
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
new file mode 100644
index 000000000000..8832b5d95d91
--- /dev/null
+++ b/fs/erofs/namei.c
@@ -0,0 +1,251 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2017-2018 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#include "xattr.h"
8
9#include <trace/events/erofs.h>
10
11struct erofs_qstr {
12 const unsigned char *name;
13 const unsigned char *end;
14};
15
16/* this relies on qn->end being accurate and qn having a trailing '\0' */
17static inline int dirnamecmp(const struct erofs_qstr *qn,
18 const struct erofs_qstr *qd,
19 unsigned int *matched)
20{
21 unsigned int i = *matched;
22
23 /*
24	 * on-disk error: only BUG_ON in debugging mode; otherwise,
25	 * return 1 to simply skip the invalid name and move on
26	 * (in consideration of lookup performance).
27 */
28 DBG_BUGON(qd->name > qd->end);
29
30	/* qd may not have a trailing '\0' */
31	/* however, accesses below qd->end are always safe */
32 while (qd->name + i < qd->end && qd->name[i] != '\0') {
33 if (qn->name[i] != qd->name[i]) {
34 *matched = i;
35 return qn->name[i] > qd->name[i] ? 1 : -1;
36 }
37 ++i;
38 }
39 *matched = i;
40 /* See comments in __d_alloc on the terminating NUL character */
41 return qn->name[i] == '\0' ? 0 : 1;
42}
43
44#define nameoff_from_disk(off, sz) (le16_to_cpu(off) & ((sz) - 1))
45
46static struct erofs_dirent *find_target_dirent(struct erofs_qstr *name,
47 u8 *data,
48 unsigned int dirblksize,
49 const int ndirents)
50{
51 int head, back;
52 unsigned int startprfx, endprfx;
53 struct erofs_dirent *const de = (struct erofs_dirent *)data;
54
55 /* since the 1st dirent has been evaluated previously */
56 head = 1;
57 back = ndirents - 1;
58 startprfx = endprfx = 0;
59
60 while (head <= back) {
61 const int mid = head + (back - head) / 2;
62 const int nameoff = nameoff_from_disk(de[mid].nameoff,
63 dirblksize);
64 unsigned int matched = min(startprfx, endprfx);
65 struct erofs_qstr dname = {
66 .name = data + nameoff,
67 .end = unlikely(mid >= ndirents - 1) ?
68 data + dirblksize :
69 data + nameoff_from_disk(de[mid + 1].nameoff,
70 dirblksize)
71 };
72
73 /* string comparison without already matched prefix */
74 int ret = dirnamecmp(name, &dname, &matched);
75
76 if (unlikely(!ret)) {
77 return de + mid;
78 } else if (ret > 0) {
79 head = mid + 1;
80 startprfx = matched;
81 } else {
82 back = mid - 1;
83 endprfx = matched;
84 }
85 }
86
87 return ERR_PTR(-ENOENT);
88}
89
90static struct page *find_target_block_classic(struct inode *dir,
91 struct erofs_qstr *name,
92 int *_ndirents)
93{
94 unsigned int startprfx, endprfx;
95 int head, back;
96 struct address_space *const mapping = dir->i_mapping;
97 struct page *candidate = ERR_PTR(-ENOENT);
98
99 startprfx = endprfx = 0;
100 head = 0;
101 back = inode_datablocks(dir) - 1;
102
103 while (head <= back) {
104 const int mid = head + (back - head) / 2;
105 struct page *page = read_mapping_page(mapping, mid, NULL);
106
107 if (!IS_ERR(page)) {
108 struct erofs_dirent *de = kmap_atomic(page);
109 const int nameoff = nameoff_from_disk(de->nameoff,
110 EROFS_BLKSIZ);
111 const int ndirents = nameoff / sizeof(*de);
112 int diff;
113 unsigned int matched;
114 struct erofs_qstr dname;
115
116 if (unlikely(!ndirents)) {
117 kunmap_atomic(de);
118 put_page(page);
119 errln("corrupted dir block %d @ nid %llu",
120 mid, EROFS_V(dir)->nid);
121 DBG_BUGON(1);
122 page = ERR_PTR(-EFSCORRUPTED);
123 goto out;
124 }
125
126 matched = min(startprfx, endprfx);
127
128 dname.name = (u8 *)de + nameoff;
129 if (ndirents == 1)
130 dname.end = (u8 *)de + EROFS_BLKSIZ;
131 else
132 dname.end = (u8 *)de +
133 nameoff_from_disk(de[1].nameoff,
134 EROFS_BLKSIZ);
135
136 /* string comparison without already matched prefix */
137 diff = dirnamecmp(name, &dname, &matched);
138 kunmap_atomic(de);
139
140 if (unlikely(!diff)) {
141 *_ndirents = 0;
142 goto out;
143 } else if (diff > 0) {
144 head = mid + 1;
145 startprfx = matched;
146
147 if (!IS_ERR(candidate))
148 put_page(candidate);
149 candidate = page;
150 *_ndirents = ndirents;
151 } else {
152 put_page(page);
153
154 back = mid - 1;
155 endprfx = matched;
156 }
157 continue;
158 }
159out: /* free if the candidate is valid */
160 if (!IS_ERR(candidate))
161 put_page(candidate);
162 return page;
163 }
164 return candidate;
165}
166
167int erofs_namei(struct inode *dir,
168 struct qstr *name,
169 erofs_nid_t *nid, unsigned int *d_type)
170{
171 int ndirents;
172 struct page *page;
173 void *data;
174 struct erofs_dirent *de;
175 struct erofs_qstr qn;
176
177 if (unlikely(!dir->i_size))
178 return -ENOENT;
179
180 qn.name = name->name;
181 qn.end = name->name + name->len;
182
183 ndirents = 0;
184 page = find_target_block_classic(dir, &qn, &ndirents);
185
186 if (IS_ERR(page))
187 return PTR_ERR(page);
188
189 data = kmap_atomic(page);
190 /* the target page has been mapped */
191 if (ndirents)
192 de = find_target_dirent(&qn, data, EROFS_BLKSIZ, ndirents);
193 else
194 de = (struct erofs_dirent *)data;
195
196 if (!IS_ERR(de)) {
197 *nid = le64_to_cpu(de->nid);
198 *d_type = de->file_type;
199 }
200
201 kunmap_atomic(data);
202 put_page(page);
203
204 return PTR_ERR_OR_ZERO(de);
205}
206
207/* NOTE: i_mutex is already held by vfs */
208static struct dentry *erofs_lookup(struct inode *dir,
209 struct dentry *dentry,
210 unsigned int flags)
211{
212 int err;
213 erofs_nid_t nid;
214 unsigned int d_type;
215 struct inode *inode;
216
217 DBG_BUGON(!d_really_is_negative(dentry));
218	/* dentry must be unhashed in lookup, no need to worry about it */
219 DBG_BUGON(!d_unhashed(dentry));
220
221 trace_erofs_lookup(dir, dentry, flags);
222
223 /* file name exceeds fs limit */
224 if (unlikely(dentry->d_name.len > EROFS_NAME_LEN))
225 return ERR_PTR(-ENAMETOOLONG);
226
227 /* false uninitialized warnings on gcc 4.8.x */
228 err = erofs_namei(dir, &dentry->d_name, &nid, &d_type);
229
230 if (err == -ENOENT) {
231 /* negative dentry */
232 inode = NULL;
233 } else if (unlikely(err)) {
234 inode = ERR_PTR(err);
235 } else {
236 debugln("%s, %s (nid %llu) found, d_type %u", __func__,
237 dentry->d_name.name, nid, d_type);
238 inode = erofs_iget(dir->i_sb, nid, d_type == FT_DIR);
239 }
240 return d_splice_alias(inode, dentry);
241}
242
243const struct inode_operations erofs_dir_iops = {
244 .lookup = erofs_lookup,
245 .getattr = erofs_getattr,
246#ifdef CONFIG_EROFS_FS_XATTR
247 .listxattr = erofs_listxattr,
248#endif
249 .get_acl = erofs_get_acl,
250};
251
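
The lookup above is a two-level binary search: find_target_block_classic() bisects the directory blocks and find_target_dirent() bisects the dirents inside the chosen block, and both avoid re-comparing the prefix already known to match on both bounds (matched = min(startprfx, endprfx)). The stand-alone sketch below demonstrates that prefix-tracking comparison over a plain sorted string array; it illustrates the idea only and is not the on-disk dirent walk.

/* prefix-accelerated binary search over sorted strings (illustrative only) */
#include <stdio.h>
#include <string.h>

static int prefixed_cmp(const char *key, const char *s, unsigned int *matched)
{
	unsigned int i = *matched;  /* both bounds already match this many chars */

	while (s[i] != '\0') {
		if (key[i] != s[i]) {
			*matched = i;
			return key[i] > s[i] ? 1 : -1;
		}
		++i;
	}
	*matched = i;
	return key[i] == '\0' ? 0 : 1;
}

static int lookup(const char *key, const char *const names[], int n)
{
	int head = 0, back = n - 1;
	unsigned int startprfx = 0, endprfx = 0;

	while (head <= back) {
		int mid = head + (back - head) / 2;
		unsigned int matched = startprfx < endprfx ? startprfx : endprfx;
		int ret = prefixed_cmp(key, names[mid], &matched);

		if (!ret)
			return mid;
		if (ret > 0) {
			head = mid + 1;
			startprfx = matched;
		} else {
			back = mid - 1;
			endprfx = matched;
		}
	}
	return -1;	/* roughly -ENOENT in the kernel code */
}

int main(void)
{
	const char *const names[] = { "bar", "baz", "foo", "foobar", "qux" };

	printf("%d\n", lookup("foobar", names, 5));	/* prints 3 */
	return 0;
}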
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
new file mode 100644
index 000000000000..6d3a9bcb8daa
--- /dev/null
+++ b/fs/erofs/super.c
@@ -0,0 +1,669 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2017-2018 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#include <linux/module.h>
8#include <linux/buffer_head.h>
9#include <linux/statfs.h>
10#include <linux/parser.h>
11#include <linux/seq_file.h>
12#include "xattr.h"
13
14#define CREATE_TRACE_POINTS
15#include <trace/events/erofs.h>
16
17static struct kmem_cache *erofs_inode_cachep __read_mostly;
18
19static void init_once(void *ptr)
20{
21 struct erofs_vnode *vi = ptr;
22
23 inode_init_once(&vi->vfs_inode);
24}
25
26static int __init erofs_init_inode_cache(void)
27{
28 erofs_inode_cachep = kmem_cache_create("erofs_inode",
29 sizeof(struct erofs_vnode), 0,
30 SLAB_RECLAIM_ACCOUNT,
31 init_once);
32
33 return erofs_inode_cachep ? 0 : -ENOMEM;
34}
35
36static void erofs_exit_inode_cache(void)
37{
38 kmem_cache_destroy(erofs_inode_cachep);
39}
40
41static struct inode *alloc_inode(struct super_block *sb)
42{
43 struct erofs_vnode *vi =
44 kmem_cache_alloc(erofs_inode_cachep, GFP_KERNEL);
45
46 if (!vi)
47 return NULL;
48
49 /* zero out everything except vfs_inode */
50 memset(vi, 0, offsetof(struct erofs_vnode, vfs_inode));
51 return &vi->vfs_inode;
52}
53
54static void free_inode(struct inode *inode)
55{
56 struct erofs_vnode *vi = EROFS_V(inode);
57
58	/* be careful with the RCU symlink path (see ext4_inode_info->i_data)! */
59 if (is_inode_fast_symlink(inode))
60 kfree(inode->i_link);
61
62 kfree(vi->xattr_shared_xattrs);
63
64 kmem_cache_free(erofs_inode_cachep, vi);
65}
66
67static bool check_layout_compatibility(struct super_block *sb,
68 struct erofs_super_block *layout)
69{
70 const unsigned int requirements = le32_to_cpu(layout->requirements);
71
72 EROFS_SB(sb)->requirements = requirements;
73
74 /* check if current kernel meets all mandatory requirements */
75 if (requirements & (~EROFS_ALL_REQUIREMENTS)) {
76 errln("unidentified requirements %x, please upgrade kernel version",
77 requirements & ~EROFS_ALL_REQUIREMENTS);
78 return false;
79 }
80 return true;
81}
82
83static int superblock_read(struct super_block *sb)
84{
85 struct erofs_sb_info *sbi;
86 struct buffer_head *bh;
87 struct erofs_super_block *layout;
88 unsigned int blkszbits;
89 int ret;
90
91 bh = sb_bread(sb, 0);
92
93 if (!bh) {
94 errln("cannot read erofs superblock");
95 return -EIO;
96 }
97
98 sbi = EROFS_SB(sb);
99 layout = (struct erofs_super_block *)((u8 *)bh->b_data
100 + EROFS_SUPER_OFFSET);
101
102 ret = -EINVAL;
103 if (le32_to_cpu(layout->magic) != EROFS_SUPER_MAGIC_V1) {
104 errln("cannot find valid erofs superblock");
105 goto out;
106 }
107
108 blkszbits = layout->blkszbits;
109 /* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */
110 if (unlikely(blkszbits != LOG_BLOCK_SIZE)) {
111 errln("blksize %u isn't supported on this platform",
112 1 << blkszbits);
113 goto out;
114 }
115
116 if (!check_layout_compatibility(sb, layout))
117 goto out;
118
119 sbi->blocks = le32_to_cpu(layout->blocks);
120 sbi->meta_blkaddr = le32_to_cpu(layout->meta_blkaddr);
121#ifdef CONFIG_EROFS_FS_XATTR
122 sbi->xattr_blkaddr = le32_to_cpu(layout->xattr_blkaddr);
123#endif
124 sbi->islotbits = ffs(sizeof(struct erofs_inode_v1)) - 1;
125 sbi->root_nid = le16_to_cpu(layout->root_nid);
126 sbi->inos = le64_to_cpu(layout->inos);
127
128 sbi->build_time = le64_to_cpu(layout->build_time);
129 sbi->build_time_nsec = le32_to_cpu(layout->build_time_nsec);
130
131 memcpy(&sb->s_uuid, layout->uuid, sizeof(layout->uuid));
132
133 ret = strscpy(sbi->volume_name, layout->volume_name,
134 sizeof(layout->volume_name));
135 if (ret < 0) { /* -E2BIG */
136		errln("bad volume name without NUL terminator");
137 ret = -EFSCORRUPTED;
138 goto out;
139 }
140 ret = 0;
141out:
142 brelse(bh);
143 return ret;
144}
145
146#ifdef CONFIG_EROFS_FAULT_INJECTION
147const char *erofs_fault_name[FAULT_MAX] = {
148 [FAULT_KMALLOC] = "kmalloc",
149 [FAULT_READ_IO] = "read IO error",
150};
151
152static void __erofs_build_fault_attr(struct erofs_sb_info *sbi,
153 unsigned int rate)
154{
155 struct erofs_fault_info *ffi = &sbi->fault_info;
156
157 if (rate) {
158 atomic_set(&ffi->inject_ops, 0);
159 ffi->inject_rate = rate;
160 ffi->inject_type = (1 << FAULT_MAX) - 1;
161 } else {
162 memset(ffi, 0, sizeof(struct erofs_fault_info));
163 }
164
165 set_opt(sbi, FAULT_INJECTION);
166}
167
168static int erofs_build_fault_attr(struct erofs_sb_info *sbi,
169 substring_t *args)
170{
171 int rate = 0;
172
173 if (args->from && match_int(args, &rate))
174 return -EINVAL;
175
176 __erofs_build_fault_attr(sbi, rate);
177 return 0;
178}
179
180static unsigned int erofs_get_fault_rate(struct erofs_sb_info *sbi)
181{
182 return sbi->fault_info.inject_rate;
183}
184#else
185static void __erofs_build_fault_attr(struct erofs_sb_info *sbi,
186 unsigned int rate)
187{
188}
189
190static int erofs_build_fault_attr(struct erofs_sb_info *sbi,
191 substring_t *args)
192{
193 infoln("fault_injection options not supported");
194 return 0;
195}
196
197static unsigned int erofs_get_fault_rate(struct erofs_sb_info *sbi)
198{
199 return 0;
200}
201#endif
202
203#ifdef CONFIG_EROFS_FS_ZIP
204static int erofs_build_cache_strategy(struct erofs_sb_info *sbi,
205 substring_t *args)
206{
207 const char *cs = match_strdup(args);
208 int err = 0;
209
210 if (!cs) {
211 errln("Not enough memory to store cache strategy");
212 return -ENOMEM;
213 }
214
215 if (!strcmp(cs, "disabled")) {
216 sbi->cache_strategy = EROFS_ZIP_CACHE_DISABLED;
217 } else if (!strcmp(cs, "readahead")) {
218 sbi->cache_strategy = EROFS_ZIP_CACHE_READAHEAD;
219 } else if (!strcmp(cs, "readaround")) {
220 sbi->cache_strategy = EROFS_ZIP_CACHE_READAROUND;
221 } else {
222 errln("Unrecognized cache strategy \"%s\"", cs);
223 err = -EINVAL;
224 }
225 kfree(cs);
226 return err;
227}
228#else
229static int erofs_build_cache_strategy(struct erofs_sb_info *sbi,
230 substring_t *args)
231{
232 infoln("EROFS compression is disabled, so cache strategy is ignored");
233 return 0;
234}
235#endif
236
237/* set up default EROFS parameters */
238static void default_options(struct erofs_sb_info *sbi)
239{
240#ifdef CONFIG_EROFS_FS_ZIP
241 sbi->cache_strategy = EROFS_ZIP_CACHE_READAROUND;
242 sbi->max_sync_decompress_pages = 3;
243#endif
244#ifdef CONFIG_EROFS_FS_XATTR
245 set_opt(sbi, XATTR_USER);
246#endif
247#ifdef CONFIG_EROFS_FS_POSIX_ACL
248 set_opt(sbi, POSIX_ACL);
249#endif
250}
251
252enum {
253 Opt_user_xattr,
254 Opt_nouser_xattr,
255 Opt_acl,
256 Opt_noacl,
257 Opt_fault_injection,
258 Opt_cache_strategy,
259 Opt_err
260};
261
262static match_table_t erofs_tokens = {
263 {Opt_user_xattr, "user_xattr"},
264 {Opt_nouser_xattr, "nouser_xattr"},
265 {Opt_acl, "acl"},
266 {Opt_noacl, "noacl"},
267 {Opt_fault_injection, "fault_injection=%u"},
268 {Opt_cache_strategy, "cache_strategy=%s"},
269 {Opt_err, NULL}
270};
271
272static int parse_options(struct super_block *sb, char *options)
273{
274 substring_t args[MAX_OPT_ARGS];
275 char *p;
276 int err;
277
278 if (!options)
279 return 0;
280
281 while ((p = strsep(&options, ","))) {
282 int token;
283
284 if (!*p)
285 continue;
286
287 args[0].to = args[0].from = NULL;
288 token = match_token(p, erofs_tokens, args);
289
290 switch (token) {
291#ifdef CONFIG_EROFS_FS_XATTR
292 case Opt_user_xattr:
293 set_opt(EROFS_SB(sb), XATTR_USER);
294 break;
295 case Opt_nouser_xattr:
296 clear_opt(EROFS_SB(sb), XATTR_USER);
297 break;
298#else
299 case Opt_user_xattr:
300 infoln("user_xattr options not supported");
301 break;
302 case Opt_nouser_xattr:
303 infoln("nouser_xattr options not supported");
304 break;
305#endif
306#ifdef CONFIG_EROFS_FS_POSIX_ACL
307 case Opt_acl:
308 set_opt(EROFS_SB(sb), POSIX_ACL);
309 break;
310 case Opt_noacl:
311 clear_opt(EROFS_SB(sb), POSIX_ACL);
312 break;
313#else
314 case Opt_acl:
315 infoln("acl options not supported");
316 break;
317 case Opt_noacl:
318 infoln("noacl options not supported");
319 break;
320#endif
321 case Opt_fault_injection:
322 err = erofs_build_fault_attr(EROFS_SB(sb), args);
323 if (err)
324 return err;
325 break;
326 case Opt_cache_strategy:
327 err = erofs_build_cache_strategy(EROFS_SB(sb), args);
328 if (err)
329 return err;
330 break;
331 default:
332 errln("Unrecognized mount option \"%s\" or missing value", p);
333 return -EINVAL;
334 }
335 }
336 return 0;
337}
338
339#ifdef CONFIG_EROFS_FS_ZIP
340static const struct address_space_operations managed_cache_aops;
341
342static int managed_cache_releasepage(struct page *page, gfp_t gfp_mask)
343{
344 int ret = 1; /* 0 - busy */
345 struct address_space *const mapping = page->mapping;
346
347 DBG_BUGON(!PageLocked(page));
348 DBG_BUGON(mapping->a_ops != &managed_cache_aops);
349
350 if (PagePrivate(page))
351 ret = erofs_try_to_free_cached_page(mapping, page);
352
353 return ret;
354}
355
356static void managed_cache_invalidatepage(struct page *page,
357 unsigned int offset,
358 unsigned int length)
359{
360 const unsigned int stop = length + offset;
361
362 DBG_BUGON(!PageLocked(page));
363
364 /* Check for potential overflow in debug mode */
365 DBG_BUGON(stop > PAGE_SIZE || stop < length);
366
367 if (offset == 0 && stop == PAGE_SIZE)
368 while (!managed_cache_releasepage(page, GFP_NOFS))
369 cond_resched();
370}
371
372static const struct address_space_operations managed_cache_aops = {
373 .releasepage = managed_cache_releasepage,
374 .invalidatepage = managed_cache_invalidatepage,
375};
376
377static int erofs_init_managed_cache(struct super_block *sb)
378{
379 struct erofs_sb_info *const sbi = EROFS_SB(sb);
380 struct inode *const inode = new_inode(sb);
381
382 if (unlikely(!inode))
383 return -ENOMEM;
384
385 set_nlink(inode, 1);
386 inode->i_size = OFFSET_MAX;
387
388 inode->i_mapping->a_ops = &managed_cache_aops;
389 mapping_set_gfp_mask(inode->i_mapping,
390 GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE);
391 sbi->managed_cache = inode;
392 return 0;
393}
394#else
395static int erofs_init_managed_cache(struct super_block *sb) { return 0; }
396#endif
397
398static int erofs_fill_super(struct super_block *sb, void *data, int silent)
399{
400 struct inode *inode;
401 struct erofs_sb_info *sbi;
402 int err;
403
404 infoln("fill_super, device -> %s", sb->s_id);
405 infoln("options -> %s", (char *)data);
406
407 sb->s_magic = EROFS_SUPER_MAGIC;
408
409 if (unlikely(!sb_set_blocksize(sb, EROFS_BLKSIZ))) {
410 errln("failed to set erofs blksize");
411 return -EINVAL;
412 }
413
414 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
415 if (unlikely(!sbi))
416 return -ENOMEM;
417
418 sb->s_fs_info = sbi;
419 err = superblock_read(sb);
420 if (err)
421 return err;
422
423 sb->s_flags |= SB_RDONLY | SB_NOATIME;
424 sb->s_maxbytes = MAX_LFS_FILESIZE;
425 sb->s_time_gran = 1;
426
427 sb->s_op = &erofs_sops;
428
429#ifdef CONFIG_EROFS_FS_XATTR
430 sb->s_xattr = erofs_xattr_handlers;
431#endif
432 /* set erofs default mount options */
433 default_options(sbi);
434
435 err = parse_options(sb, data);
436 if (unlikely(err))
437 return err;
438
439 if (!silent)
440 infoln("root inode @ nid %llu", ROOT_NID(sbi));
441
442 if (test_opt(sbi, POSIX_ACL))
443 sb->s_flags |= SB_POSIXACL;
444 else
445 sb->s_flags &= ~SB_POSIXACL;
446
447#ifdef CONFIG_EROFS_FS_ZIP
448 INIT_RADIX_TREE(&sbi->workstn_tree, GFP_ATOMIC);
449#endif
450
451 /* get the root inode */
452 inode = erofs_iget(sb, ROOT_NID(sbi), true);
453 if (IS_ERR(inode))
454 return PTR_ERR(inode);
455
456 if (unlikely(!S_ISDIR(inode->i_mode))) {
457 errln("rootino(nid %llu) is not a directory(i_mode %o)",
458 ROOT_NID(sbi), inode->i_mode);
459 iput(inode);
460 return -EINVAL;
461 }
462
463 sb->s_root = d_make_root(inode);
464 if (unlikely(!sb->s_root))
465 return -ENOMEM;
466
467 erofs_shrinker_register(sb);
468 /* sb->s_umount is already locked, SB_ACTIVE and SB_BORN are not set */
469 err = erofs_init_managed_cache(sb);
470 if (unlikely(err))
471 return err;
472
473 if (!silent)
474 infoln("mounted on %s with opts: %s.", sb->s_id, (char *)data);
475 return 0;
476}
477
478static struct dentry *erofs_mount(struct file_system_type *fs_type, int flags,
479 const char *dev_name, void *data)
480{
481 return mount_bdev(fs_type, flags, dev_name, data, erofs_fill_super);
482}
483
484/*
485 * could be triggered after deactivate_locked_super()
486 * is called, thus covering both umount and failed initialization.
487 */
488static void erofs_kill_sb(struct super_block *sb)
489{
490 struct erofs_sb_info *sbi;
491
492 WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC);
493 infoln("unmounting for %s", sb->s_id);
494
495 kill_block_super(sb);
496
497 sbi = EROFS_SB(sb);
498 if (!sbi)
499 return;
500 kfree(sbi);
501 sb->s_fs_info = NULL;
502}
503
504/* called when ->s_root is non-NULL */
505static void erofs_put_super(struct super_block *sb)
506{
507 struct erofs_sb_info *const sbi = EROFS_SB(sb);
508
509 DBG_BUGON(!sbi);
510
511 erofs_shrinker_unregister(sb);
512#ifdef CONFIG_EROFS_FS_ZIP
513 iput(sbi->managed_cache);
514 sbi->managed_cache = NULL;
515#endif
516}
517
518static struct file_system_type erofs_fs_type = {
519 .owner = THIS_MODULE,
520 .name = "erofs",
521 .mount = erofs_mount,
522 .kill_sb = erofs_kill_sb,
523 .fs_flags = FS_REQUIRES_DEV,
524};
525MODULE_ALIAS_FS("erofs");
526
527static int __init erofs_module_init(void)
528{
529 int err;
530
531 erofs_check_ondisk_layout_definitions();
532 infoln("initializing erofs " EROFS_VERSION);
533
534 err = erofs_init_inode_cache();
535 if (err)
536 goto icache_err;
537
538 err = erofs_init_shrinker();
539 if (err)
540 goto shrinker_err;
541
542 err = z_erofs_init_zip_subsystem();
543 if (err)
544 goto zip_err;
545
546 err = register_filesystem(&erofs_fs_type);
547 if (err)
548 goto fs_err;
549
550	infoln("successfully initialized erofs");
551 return 0;
552
553fs_err:
554 z_erofs_exit_zip_subsystem();
555zip_err:
556 erofs_exit_shrinker();
557shrinker_err:
558 erofs_exit_inode_cache();
559icache_err:
560 return err;
561}
562
563static void __exit erofs_module_exit(void)
564{
565 unregister_filesystem(&erofs_fs_type);
566 z_erofs_exit_zip_subsystem();
567 erofs_exit_shrinker();
568 erofs_exit_inode_cache();
569	infoln("successfully finalized erofs");
570}
571
572/* get filesystem statistics */
573static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
574{
575 struct super_block *sb = dentry->d_sb;
576 struct erofs_sb_info *sbi = EROFS_SB(sb);
577 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
578
579 buf->f_type = sb->s_magic;
580 buf->f_bsize = EROFS_BLKSIZ;
581 buf->f_blocks = sbi->blocks;
582 buf->f_bfree = buf->f_bavail = 0;
583
584 buf->f_files = ULLONG_MAX;
585 buf->f_ffree = ULLONG_MAX - sbi->inos;
586
587 buf->f_namelen = EROFS_NAME_LEN;
588
589 buf->f_fsid.val[0] = (u32)id;
590 buf->f_fsid.val[1] = (u32)(id >> 32);
591 return 0;
592}
593
594static int erofs_show_options(struct seq_file *seq, struct dentry *root)
595{
596 struct erofs_sb_info *sbi __maybe_unused = EROFS_SB(root->d_sb);
597
598#ifdef CONFIG_EROFS_FS_XATTR
599 if (test_opt(sbi, XATTR_USER))
600 seq_puts(seq, ",user_xattr");
601 else
602 seq_puts(seq, ",nouser_xattr");
603#endif
604#ifdef CONFIG_EROFS_FS_POSIX_ACL
605 if (test_opt(sbi, POSIX_ACL))
606 seq_puts(seq, ",acl");
607 else
608 seq_puts(seq, ",noacl");
609#endif
610 if (test_opt(sbi, FAULT_INJECTION))
611 seq_printf(seq, ",fault_injection=%u",
612 erofs_get_fault_rate(sbi));
613#ifdef CONFIG_EROFS_FS_ZIP
614 if (sbi->cache_strategy == EROFS_ZIP_CACHE_DISABLED) {
615 seq_puts(seq, ",cache_strategy=disabled");
616 } else if (sbi->cache_strategy == EROFS_ZIP_CACHE_READAHEAD) {
617 seq_puts(seq, ",cache_strategy=readahead");
618 } else if (sbi->cache_strategy == EROFS_ZIP_CACHE_READAROUND) {
619 seq_puts(seq, ",cache_strategy=readaround");
620 } else {
621 seq_puts(seq, ",cache_strategy=(unknown)");
622 DBG_BUGON(1);
623 }
624#endif
625 return 0;
626}
627
628static int erofs_remount(struct super_block *sb, int *flags, char *data)
629{
630 struct erofs_sb_info *sbi = EROFS_SB(sb);
631 unsigned int org_mnt_opt = sbi->mount_opt;
632 unsigned int org_inject_rate = erofs_get_fault_rate(sbi);
633 int err;
634
635 DBG_BUGON(!sb_rdonly(sb));
636 err = parse_options(sb, data);
637 if (err)
638 goto out;
639
640 if (test_opt(sbi, POSIX_ACL))
641 sb->s_flags |= SB_POSIXACL;
642 else
643 sb->s_flags &= ~SB_POSIXACL;
644
645 *flags |= SB_RDONLY;
646 return 0;
647out:
648 __erofs_build_fault_attr(sbi, org_inject_rate);
649 sbi->mount_opt = org_mnt_opt;
650
651 return err;
652}
653
654const struct super_operations erofs_sops = {
655 .put_super = erofs_put_super,
656 .alloc_inode = alloc_inode,
657 .free_inode = free_inode,
658 .statfs = erofs_statfs,
659 .show_options = erofs_show_options,
660 .remount_fs = erofs_remount,
661};
662
663module_init(erofs_module_init);
664module_exit(erofs_module_exit);
665
666MODULE_DESCRIPTION("Enhanced ROM File System");
667MODULE_AUTHOR("Gao Xiang, Chao Yu, Miao Xie, CONSUMER BG, HUAWEI Inc.");
668MODULE_LICENSE("GPL");
669
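
parse_options() above splits the mount data on commas and classifies each token with match_token() against erofs_tokens, so a mount invocation such as mount -t erofs -o noacl,cache_strategy=readahead /dev/sdX /mnt ends up in the Opt_noacl and Opt_cache_strategy branches. The user-space sketch below mimics that splitting with strsep(); it is a rough stand-in for the kernel parser, not the actual implementation.

/* rough user-space model of comma-separated "opt[=value]" parsing */
#define _DEFAULT_SOURCE		/* for strsep() on glibc */
#include <stdio.h>
#include <string.h>

static void parse_options(char *options)
{
	char *p;

	while ((p = strsep(&options, ",")) != NULL) {
		char *val;

		if (!*p)
			continue;	/* skip empty tokens, as the kernel does */

		val = strchr(p, '=');
		if (val)
			*val++ = '\0';

		if (!strcmp(p, "acl") || !strcmp(p, "noacl") ||
		    !strcmp(p, "user_xattr") || !strcmp(p, "nouser_xattr"))
			printf("flag option: %s\n", p);
		else if (!strcmp(p, "cache_strategy") && val)
			printf("cache_strategy = %s\n", val);
		else if (!strcmp(p, "fault_injection") && val)
			printf("fault_injection rate = %s\n", val);
		else
			printf("unrecognized option: %s\n", p);
	}
}

int main(void)
{
	char opts[] = "noacl,cache_strategy=readahead,fault_injection=1000";

	parse_options(opts);
	return 0;
}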
diff --git a/fs/erofs/tagptr.h b/fs/erofs/tagptr.h
new file mode 100644
index 000000000000..a72897c86744
--- /dev/null
+++ b/fs/erofs/tagptr.h
@@ -0,0 +1,110 @@
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * A tagged pointer implementation
4 *
5 * Copyright (C) 2018 Gao Xiang <gaoxiang25@huawei.com>
6 */
7#ifndef __EROFS_FS_TAGPTR_H
8#define __EROFS_FS_TAGPTR_H
9
10#include <linux/types.h>
11#include <linux/build_bug.h>
12
13/*
14 * the names of the tagged pointer types are tagptr{1, 2, 3...}_t;
15 * avoid using the internal structs __tagptr{1, 2, 3...} directly
16 */
17#define __MAKE_TAGPTR(n) \
18typedef struct __tagptr##n { \
19 uintptr_t v; \
20} tagptr##n##_t;
21
22__MAKE_TAGPTR(1)
23__MAKE_TAGPTR(2)
24__MAKE_TAGPTR(3)
25__MAKE_TAGPTR(4)
26
27#undef __MAKE_TAGPTR
28
29extern void __compiletime_error("bad tagptr tags")
30 __bad_tagptr_tags(void);
31
32extern void __compiletime_error("bad tagptr type")
33 __bad_tagptr_type(void);
34
35/* fix the broken usage of "#define tagptr2_t tagptr3_t" by users */
36#define __tagptr_mask_1(ptr, n) \
37 __builtin_types_compatible_p(typeof(ptr), struct __tagptr##n) ? \
38 (1UL << (n)) - 1 :
39
40#define __tagptr_mask(ptr) (\
41 __tagptr_mask_1(ptr, 1) ( \
42 __tagptr_mask_1(ptr, 2) ( \
43 __tagptr_mask_1(ptr, 3) ( \
44 __tagptr_mask_1(ptr, 4) ( \
45 __bad_tagptr_type(), 0)))))
46
47/* generate a tagged pointer from a raw value */
48#define tagptr_init(type, val) \
49 ((typeof(type)){ .v = (uintptr_t)(val) })
50
51/*
52 * directly cast a tagged pointer to the native pointer type, which
53 * could be used for backward compatibility of existing code.
54 */
55#define tagptr_cast_ptr(tptr) ((void *)(tptr).v)
56
57/* encode tagged pointers */
58#define tagptr_fold(type, ptr, _tags) ({ \
59 const typeof(_tags) tags = (_tags); \
60 if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(type))) \
61 __bad_tagptr_tags(); \
62tagptr_init(type, (uintptr_t)(ptr) | tags); })
63
64/* decode tagged pointers */
65#define tagptr_unfold_ptr(tptr) \
66 ((void *)((tptr).v & ~__tagptr_mask(tptr)))
67
68#define tagptr_unfold_tags(tptr) \
69 ((tptr).v & __tagptr_mask(tptr))
70
71/* operations for tagged pointers */
72#define tagptr_eq(_tptr1, _tptr2) ({ \
73 typeof(_tptr1) tptr1 = (_tptr1); \
74 typeof(_tptr2) tptr2 = (_tptr2); \
75 (void)(&tptr1 == &tptr2); \
76(tptr1).v == (tptr2).v; })
77
78/* lock-free CAS operation */
79#define tagptr_cmpxchg(_ptptr, _o, _n) ({ \
80 typeof(_ptptr) ptptr = (_ptptr); \
81 typeof(_o) o = (_o); \
82 typeof(_n) n = (_n); \
83 (void)(&o == &n); \
84 (void)(&o == ptptr); \
85tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); })
86
87/* wrap WRITE_ONCE if atomic update is needed */
88#define tagptr_replace_tags(_ptptr, tags) ({ \
89 typeof(_ptptr) ptptr = (_ptptr); \
90 *ptptr = tagptr_fold(*ptptr, tagptr_unfold_ptr(*ptptr), tags); \
91*ptptr; })
92
93#define tagptr_set_tags(_ptptr, _tags) ({ \
94 typeof(_ptptr) ptptr = (_ptptr); \
95 const typeof(_tags) tags = (_tags); \
96 if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
97 __bad_tagptr_tags(); \
98 ptptr->v |= tags; \
99*ptptr; })
100
101#define tagptr_clear_tags(_ptptr, _tags) ({ \
102 typeof(_ptptr) ptptr = (_ptptr); \
103 const typeof(_tags) tags = (_tags); \
104 if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
105 __bad_tagptr_tags(); \
106 ptptr->v &= ~tags; \
107*ptptr; })
108
109#endif /* __EROFS_FS_TAGPTR_H */
110
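
tagptr.h stores a small tag in the low, alignment-guaranteed-zero bits of a pointer, with __tagptr_mask() limiting tagptrN_t to N tag bits at compile time. The minimal sketch below shows the same fold/unfold idea with a fixed two-bit mask and without the compile-time type checking; it assumes at least 4-byte pointer alignment and is illustrative only.

/* minimal tagged-pointer fold/unfold, assuming >= 4-byte pointer alignment */
#include <stdint.h>
#include <stdio.h>

#define TAG_MASK	0x3UL		/* two tag bits, like tagptr2_t */

typedef struct { uintptr_t v; } tagptr2;

static inline tagptr2 tagptr_fold(void *ptr, unsigned long tags)
{
	/* caller must guarantee tags fit in TAG_MASK and ptr is aligned */
	return (tagptr2){ .v = (uintptr_t)ptr | (tags & TAG_MASK) };
}

static inline void *tagptr_unfold_ptr(tagptr2 t)
{
	return (void *)(t.v & ~TAG_MASK);
}

static inline unsigned long tagptr_unfold_tags(tagptr2 t)
{
	return t.v & TAG_MASK;
}

int main(void)
{
	static int obj;			/* at least 4-byte aligned in practice */
	tagptr2 t = tagptr_fold(&obj, 2);

	printf("ptr ok: %d, tags: %lu\n",
	       tagptr_unfold_ptr(t) == (void *)&obj, tagptr_unfold_tags(t));
	return 0;
}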
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
new file mode 100644
index 000000000000..1dd041aa0f5a
--- /dev/null
+++ b/fs/erofs/utils.c
@@ -0,0 +1,333 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2018 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#include "internal.h"
8#include <linux/pagevec.h>
9
10struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp, bool nofail)
11{
12 struct page *page;
13
14 if (!list_empty(pool)) {
15 page = lru_to_page(pool);
16 DBG_BUGON(page_ref_count(page) != 1);
17 list_del(&page->lru);
18 } else {
19 page = alloc_pages(gfp | (nofail ? __GFP_NOFAIL : 0), 0);
20 }
21 return page;
22}
23
24#if (EROFS_PCPUBUF_NR_PAGES > 0)
25static struct {
26 u8 data[PAGE_SIZE * EROFS_PCPUBUF_NR_PAGES];
27} ____cacheline_aligned_in_smp erofs_pcpubuf[NR_CPUS];
28
29void *erofs_get_pcpubuf(unsigned int pagenr)
30{
31 preempt_disable();
32 return &erofs_pcpubuf[smp_processor_id()].data[pagenr * PAGE_SIZE];
33}
34#endif
35
36#ifdef CONFIG_EROFS_FS_ZIP
37/* global shrink count (for all mounted EROFS instances) */
38static atomic_long_t erofs_global_shrink_cnt;
39
40#define __erofs_workgroup_get(grp) atomic_inc(&(grp)->refcount)
41#define __erofs_workgroup_put(grp) atomic_dec(&(grp)->refcount)
42
43static int erofs_workgroup_get(struct erofs_workgroup *grp)
44{
45 int o;
46
47repeat:
48 o = erofs_wait_on_workgroup_freezed(grp);
49 if (unlikely(o <= 0))
50 return -1;
51
52 if (unlikely(atomic_cmpxchg(&grp->refcount, o, o + 1) != o))
53 goto repeat;
54
55 /* decrease refcount paired by erofs_workgroup_put */
56 if (unlikely(o == 1))
57 atomic_long_dec(&erofs_global_shrink_cnt);
58 return 0;
59}
60
61struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
62 pgoff_t index, bool *tag)
63{
64 struct erofs_sb_info *sbi = EROFS_SB(sb);
65 struct erofs_workgroup *grp;
66
67repeat:
68 rcu_read_lock();
69 grp = radix_tree_lookup(&sbi->workstn_tree, index);
70 if (grp) {
71 *tag = xa_pointer_tag(grp);
72 grp = xa_untag_pointer(grp);
73
74 if (erofs_workgroup_get(grp)) {
75 /* prefer to relax rcu read side */
76 rcu_read_unlock();
77 goto repeat;
78 }
79
80 DBG_BUGON(index != grp->index);
81 }
82 rcu_read_unlock();
83 return grp;
84}
85
86int erofs_register_workgroup(struct super_block *sb,
87 struct erofs_workgroup *grp,
88 bool tag)
89{
90 struct erofs_sb_info *sbi;
91 int err;
92
93 /* grp shouldn't be broken or used before */
94 if (unlikely(atomic_read(&grp->refcount) != 1)) {
95 DBG_BUGON(1);
96 return -EINVAL;
97 }
98
99 err = radix_tree_preload(GFP_NOFS);
100 if (err)
101 return err;
102
103 sbi = EROFS_SB(sb);
104 xa_lock(&sbi->workstn_tree);
105
106 grp = xa_tag_pointer(grp, tag);
107
108 /*
109 * Bump up reference count before making this workgroup
110 * visible to other users in order to avoid potential UAF
111	 * without being serialized by workstn_lock.
112 */
113 __erofs_workgroup_get(grp);
114
115 err = radix_tree_insert(&sbi->workstn_tree, grp->index, grp);
116 if (unlikely(err))
117 /*
118 * it's safe to decrease since the workgroup isn't visible
119		 * and refcount >= 2 (cannot be frozen).
120 */
121 __erofs_workgroup_put(grp);
122
123 xa_unlock(&sbi->workstn_tree);
124 radix_tree_preload_end();
125 return err;
126}
127
128static void __erofs_workgroup_free(struct erofs_workgroup *grp)
129{
130 atomic_long_dec(&erofs_global_shrink_cnt);
131 erofs_workgroup_free_rcu(grp);
132}
133
134int erofs_workgroup_put(struct erofs_workgroup *grp)
135{
136 int count = atomic_dec_return(&grp->refcount);
137
138 if (count == 1)
139 atomic_long_inc(&erofs_global_shrink_cnt);
140 else if (!count)
141 __erofs_workgroup_free(grp);
142 return count;
143}
144
145static void erofs_workgroup_unfreeze_final(struct erofs_workgroup *grp)
146{
147 erofs_workgroup_unfreeze(grp, 0);
148 __erofs_workgroup_free(grp);
149}
150
151static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
152 struct erofs_workgroup *grp,
153 bool cleanup)
154{
155 /*
156 * If managed cache is on, refcount of workgroups
157	 * themselves could be < 0 (frozen). In other words,
158	 * there is no guarantee that all refcounts are > 0.
159 */
160 if (!erofs_workgroup_try_to_freeze(grp, 1))
161 return false;
162
163 /*
164	 * Note that all cached pages should be detached
165	 * before being deleted from the radix tree. Otherwise, some
166	 * cached pages could still be attached to the orphaned
167	 * old workgroup while the new one is available in the tree.
168 */
169 if (erofs_try_to_free_all_cached_pages(sbi, grp)) {
170 erofs_workgroup_unfreeze(grp, 1);
171 return false;
172 }
173
174 /*
175	 * It's impossible to fail after the workgroup is frozen,
176	 * however, in order to catch potential race conditions, add a
177 * DBG_BUGON to observe this in advance.
178 */
179 DBG_BUGON(xa_untag_pointer(radix_tree_delete(&sbi->workstn_tree,
180 grp->index)) != grp);
181
182 /*
183 * If managed cache is on, last refcount should indicate
184 * the related workstation.
185 */
186 erofs_workgroup_unfreeze_final(grp);
187 return true;
188}
189
190static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
191 unsigned long nr_shrink,
192 bool cleanup)
193{
194 pgoff_t first_index = 0;
195 void *batch[PAGEVEC_SIZE];
196 unsigned int freed = 0;
197
198 int i, found;
199repeat:
200 xa_lock(&sbi->workstn_tree);
201
202 found = radix_tree_gang_lookup(&sbi->workstn_tree,
203 batch, first_index, PAGEVEC_SIZE);
204
205 for (i = 0; i < found; ++i) {
206 struct erofs_workgroup *grp = xa_untag_pointer(batch[i]);
207
208 first_index = grp->index + 1;
209
210 /* try to shrink each valid workgroup */
211 if (!erofs_try_to_release_workgroup(sbi, grp, cleanup))
212 continue;
213
214 ++freed;
215 if (unlikely(!--nr_shrink))
216 break;
217 }
218 xa_unlock(&sbi->workstn_tree);
219
220 if (i && nr_shrink)
221 goto repeat;
222 return freed;
223}
224
225/* protected by 'erofs_sb_list_lock' */
226static unsigned int shrinker_run_no;
227
228/* protects the mounted 'erofs_sb_list' */
229static DEFINE_SPINLOCK(erofs_sb_list_lock);
230static LIST_HEAD(erofs_sb_list);
231
232void erofs_shrinker_register(struct super_block *sb)
233{
234 struct erofs_sb_info *sbi = EROFS_SB(sb);
235
236 mutex_init(&sbi->umount_mutex);
237
238 spin_lock(&erofs_sb_list_lock);
239 list_add(&sbi->list, &erofs_sb_list);
240 spin_unlock(&erofs_sb_list_lock);
241}
242
243void erofs_shrinker_unregister(struct super_block *sb)
244{
245 struct erofs_sb_info *const sbi = EROFS_SB(sb);
246
247 mutex_lock(&sbi->umount_mutex);
248 erofs_shrink_workstation(sbi, ~0UL, true);
249
250 spin_lock(&erofs_sb_list_lock);
251 list_del(&sbi->list);
252 spin_unlock(&erofs_sb_list_lock);
253 mutex_unlock(&sbi->umount_mutex);
254}
255
256static unsigned long erofs_shrink_count(struct shrinker *shrink,
257 struct shrink_control *sc)
258{
259 return atomic_long_read(&erofs_global_shrink_cnt);
260}
261
262static unsigned long erofs_shrink_scan(struct shrinker *shrink,
263 struct shrink_control *sc)
264{
265 struct erofs_sb_info *sbi;
266 struct list_head *p;
267
268 unsigned long nr = sc->nr_to_scan;
269 unsigned int run_no;
270 unsigned long freed = 0;
271
272 spin_lock(&erofs_sb_list_lock);
273 do {
274 run_no = ++shrinker_run_no;
275 } while (run_no == 0);
276
277 /* Iterate over all mounted superblocks and try to shrink them */
278 p = erofs_sb_list.next;
279 while (p != &erofs_sb_list) {
280 sbi = list_entry(p, struct erofs_sb_info, list);
281
282 /*
283 * We move the ones we do to the end of the list, so we stop
284 * when we see one we have already done.
285 */
286 if (sbi->shrinker_run_no == run_no)
287 break;
288
289 if (!mutex_trylock(&sbi->umount_mutex)) {
290 p = p->next;
291 continue;
292 }
293
294 spin_unlock(&erofs_sb_list_lock);
295 sbi->shrinker_run_no = run_no;
296
297 freed += erofs_shrink_workstation(sbi, nr, false);
298
299 spin_lock(&erofs_sb_list_lock);
300 /* Get the next list element before we move this one */
301 p = p->next;
302
303 /*
304 * Move this one to the end of the list to provide some
305 * fairness.
306 */
307 list_move_tail(&sbi->list, &erofs_sb_list);
308 mutex_unlock(&sbi->umount_mutex);
309
310 if (freed >= nr)
311 break;
312 }
313 spin_unlock(&erofs_sb_list_lock);
314 return freed;
315}
316
317static struct shrinker erofs_shrinker_info = {
318 .scan_objects = erofs_shrink_scan,
319 .count_objects = erofs_shrink_count,
320 .seeks = DEFAULT_SEEKS,
321};
322
323int __init erofs_init_shrinker(void)
324{
325 return register_shrinker(&erofs_shrinker_info);
326}
327
328void erofs_exit_shrinker(void)
329{
330 unregister_shrinker(&erofs_shrinker_info);
331}
332#endif /* !CONFIG_EROFS_FS_ZIP */
333
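
Both erofs_workgroup_try_to_freeze() in internal.h and erofs_workgroup_get() above rely on the refcount doubling as a lock: the owner compare-and-swaps it to the sentinel EROFS_LOCKED_MAGIC, and readers wait until it holds a plain count again before bumping it. The sketch below models that pattern with C11 atomics in a single thread (no preemption control, arbitrary sentinel); it is a simplified illustration, not the kernel code.

/* freeze-by-sentinel refcount, modelled with C11 atomics (illustrative) */
#include <limits.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define LOCKED_MAGIC	INT_MIN		/* sentinel: "refcount is frozen" */

struct workgroup {
	atomic_int refcount;
};

static bool try_to_freeze(struct workgroup *grp, int expected)
{
	int old = expected;

	/* succeeds only if nobody changed the count meanwhile */
	return atomic_compare_exchange_strong(&grp->refcount, &old, LOCKED_MAGIC);
}

static void unfreeze(struct workgroup *grp, int orig_val)
{
	/* release ordering publishes all modifications made while frozen */
	atomic_store_explicit(&grp->refcount, orig_val, memory_order_release);
}

static bool try_get(struct workgroup *grp)
{
	int o = atomic_load(&grp->refcount);

	/* a frozen or dying workgroup cannot be grabbed */
	if (o == LOCKED_MAGIC || o <= 0)
		return false;
	return atomic_compare_exchange_strong(&grp->refcount, &o, o + 1);
}

int main(void)
{
	struct workgroup grp = { .refcount = 1 };

	if (try_to_freeze(&grp, 1)) {
		printf("frozen; concurrent get fails: %d\n", !try_get(&grp));
		unfreeze(&grp, 1);
	}
	printf("after unfreeze, get succeeds: %d\n", try_get(&grp));
	return 0;
}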
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
new file mode 100644
index 000000000000..a8286998a079
--- /dev/null
+++ b/fs/erofs/xattr.c
@@ -0,0 +1,703 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2017-2018 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#include <linux/security.h>
8#include "xattr.h"
9
10struct xattr_iter {
11 struct super_block *sb;
12 struct page *page;
13 void *kaddr;
14
15 erofs_blk_t blkaddr;
16 unsigned int ofs;
17};
18
19static inline void xattr_iter_end(struct xattr_iter *it, bool atomic)
20{
21 /* the only user of kunmap() is 'init_inode_xattrs' */
22 if (unlikely(!atomic))
23 kunmap(it->page);
24 else
25 kunmap_atomic(it->kaddr);
26
27 unlock_page(it->page);
28 put_page(it->page);
29}
30
31static inline void xattr_iter_end_final(struct xattr_iter *it)
32{
33 if (!it->page)
34 return;
35
36 xattr_iter_end(it, true);
37}
38
39static int init_inode_xattrs(struct inode *inode)
40{
41 struct erofs_vnode *const vi = EROFS_V(inode);
42 struct xattr_iter it;
43 unsigned int i;
44 struct erofs_xattr_ibody_header *ih;
45 struct super_block *sb;
46 struct erofs_sb_info *sbi;
47 bool atomic_map;
48 int ret = 0;
49
50	/* in most cases, the xattrs of this inode have already been initialized */
51 if (test_bit(EROFS_V_EA_INITED_BIT, &vi->flags))
52 return 0;
53
54 if (wait_on_bit_lock(&vi->flags, EROFS_V_BL_XATTR_BIT, TASK_KILLABLE))
55 return -ERESTARTSYS;
56
57 /* someone has initialized xattrs for us? */
58 if (test_bit(EROFS_V_EA_INITED_BIT, &vi->flags))
59 goto out_unlock;
60
61 /*
62 * bypass all xattr operations if ->xattr_isize is not greater than
63 * sizeof(struct erofs_xattr_ibody_header), in detail:
64	 * 1) if it is not large enough to contain erofs_xattr_ibody_header,
65	 *    ->xattr_isize should be 0 (which means no xattrs);
66	 * 2) if it just contains erofs_xattr_ibody_header, the on-disk layout
67	 *    is currently undefined (it may be used later with a new sb feature).
68 */
69 if (vi->xattr_isize == sizeof(struct erofs_xattr_ibody_header)) {
70 errln("xattr_isize %d of nid %llu is not supported yet",
71 vi->xattr_isize, vi->nid);
72 ret = -EOPNOTSUPP;
73 goto out_unlock;
74 } else if (vi->xattr_isize < sizeof(struct erofs_xattr_ibody_header)) {
75 if (unlikely(vi->xattr_isize)) {
76 errln("bogus xattr ibody @ nid %llu", vi->nid);
77 DBG_BUGON(1);
78 ret = -EFSCORRUPTED;
79 goto out_unlock; /* xattr ondisk layout error */
80 }
81 ret = -ENOATTR;
82 goto out_unlock;
83 }
84
85 sb = inode->i_sb;
86 sbi = EROFS_SB(sb);
87 it.blkaddr = erofs_blknr(iloc(sbi, vi->nid) + vi->inode_isize);
88 it.ofs = erofs_blkoff(iloc(sbi, vi->nid) + vi->inode_isize);
89
90 it.page = erofs_get_inline_page(inode, it.blkaddr);
91 if (IS_ERR(it.page)) {
92 ret = PTR_ERR(it.page);
93 goto out_unlock;
94 }
95
96 /* read in shared xattr array (non-atomic, see kmalloc below) */
97 it.kaddr = kmap(it.page);
98 atomic_map = false;
99
100 ih = (struct erofs_xattr_ibody_header *)(it.kaddr + it.ofs);
101
102 vi->xattr_shared_count = ih->h_shared_count;
103 vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count,
104 sizeof(uint), GFP_KERNEL);
105 if (!vi->xattr_shared_xattrs) {
106 xattr_iter_end(&it, atomic_map);
107 ret = -ENOMEM;
108 goto out_unlock;
109 }
110
111 /* let's skip ibody header */
112 it.ofs += sizeof(struct erofs_xattr_ibody_header);
113
114 for (i = 0; i < vi->xattr_shared_count; ++i) {
115 if (unlikely(it.ofs >= EROFS_BLKSIZ)) {
116 /* cannot be unaligned */
117 DBG_BUGON(it.ofs != EROFS_BLKSIZ);
118 xattr_iter_end(&it, atomic_map);
119
120 it.page = erofs_get_meta_page(sb, ++it.blkaddr,
121 S_ISDIR(inode->i_mode));
122 if (IS_ERR(it.page)) {
123 kfree(vi->xattr_shared_xattrs);
124 vi->xattr_shared_xattrs = NULL;
125 ret = PTR_ERR(it.page);
126 goto out_unlock;
127 }
128
129 it.kaddr = kmap_atomic(it.page);
130 atomic_map = true;
131 it.ofs = 0;
132 }
133 vi->xattr_shared_xattrs[i] =
134 le32_to_cpu(*(__le32 *)(it.kaddr + it.ofs));
135 it.ofs += sizeof(__le32);
136 }
137 xattr_iter_end(&it, atomic_map);
138
139 set_bit(EROFS_V_EA_INITED_BIT, &vi->flags);
140
141out_unlock:
142 clear_and_wake_up_bit(EROFS_V_BL_XATTR_BIT, &vi->flags);
143 return ret;
144}
145
146/*
147 * the general idea for these return values is
148 * if 0 is returned, go on processing the current xattr;
149 * if 1 (> 0) is returned, skip this round and process the next xattr;
150 * if -err (< 0) is returned, an error (maybe ENOATTR) occurred
151 * and needs to be handled
152 */
153struct xattr_iter_handlers {
154 int (*entry)(struct xattr_iter *_it, struct erofs_xattr_entry *entry);
155 int (*name)(struct xattr_iter *_it, unsigned int processed, char *buf,
156 unsigned int len);
157 int (*alloc_buffer)(struct xattr_iter *_it, unsigned int value_sz);
158 void (*value)(struct xattr_iter *_it, unsigned int processed, char *buf,
159 unsigned int len);
160};
161
162static inline int xattr_iter_fixup(struct xattr_iter *it)
163{
164 if (it->ofs < EROFS_BLKSIZ)
165 return 0;
166
167 xattr_iter_end(it, true);
168
169 it->blkaddr += erofs_blknr(it->ofs);
170
171 it->page = erofs_get_meta_page(it->sb, it->blkaddr, false);
172 if (IS_ERR(it->page)) {
173 int err = PTR_ERR(it->page);
174
175 it->page = NULL;
176 return err;
177 }
178
179 it->kaddr = kmap_atomic(it->page);
180 it->ofs = erofs_blkoff(it->ofs);
181 return 0;
182}
183
184static int inline_xattr_iter_begin(struct xattr_iter *it,
185 struct inode *inode)
186{
187 struct erofs_vnode *const vi = EROFS_V(inode);
188 struct erofs_sb_info *const sbi = EROFS_SB(inode->i_sb);
189 unsigned int xattr_header_sz, inline_xattr_ofs;
190
191 xattr_header_sz = inlinexattr_header_size(inode);
192 if (unlikely(xattr_header_sz >= vi->xattr_isize)) {
193 DBG_BUGON(xattr_header_sz > vi->xattr_isize);
194 return -ENOATTR;
195 }
196
197 inline_xattr_ofs = vi->inode_isize + xattr_header_sz;
198
199 it->blkaddr = erofs_blknr(iloc(sbi, vi->nid) + inline_xattr_ofs);
200 it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs);
201
202 it->page = erofs_get_inline_page(inode, it->blkaddr);
203 if (IS_ERR(it->page))
204 return PTR_ERR(it->page);
205
206 it->kaddr = kmap_atomic(it->page);
207 return vi->xattr_isize - xattr_header_sz;
208}
209
210/*
211 * Regardless of success or failure, `xattr_foreach' will end up with
212 * `ofs' pointing to the next xattr item rather than an arbitrary position.
213 */
214static int xattr_foreach(struct xattr_iter *it,
215 const struct xattr_iter_handlers *op,
216 unsigned int *tlimit)
217{
218 struct erofs_xattr_entry entry;
219 unsigned int value_sz, processed, slice;
220 int err;
221
222 /* 0. fixup blkaddr, ofs, ipage */
223 err = xattr_iter_fixup(it);
224 if (err)
225 return err;
226
227 /*
228	 * 1. read the xattr entry header into memory; since we do
229	 *    EROFS_XATTR_ALIGN, the fixed-size entry header
230	 *    cannot cross a page boundary
231 */
232 entry = *(struct erofs_xattr_entry *)(it->kaddr + it->ofs);
233 if (tlimit) {
234 unsigned int entry_sz = EROFS_XATTR_ENTRY_SIZE(&entry);
235
236 /* xattr on-disk corruption: xattr entry beyond xattr_isize */
237 if (unlikely(*tlimit < entry_sz)) {
238 DBG_BUGON(1);
239 return -EFSCORRUPTED;
240 }
241 *tlimit -= entry_sz;
242 }
243
244 it->ofs += sizeof(struct erofs_xattr_entry);
245 value_sz = le16_to_cpu(entry.e_value_size);
246
247 /* handle entry */
248 err = op->entry(it, &entry);
249 if (err) {
250 it->ofs += entry.e_name_len + value_sz;
251 goto out;
252 }
253
254 /* 2. handle xattr name (ofs will finally be at the end of name) */
255 processed = 0;
256
257 while (processed < entry.e_name_len) {
258 if (it->ofs >= EROFS_BLKSIZ) {
259 DBG_BUGON(it->ofs > EROFS_BLKSIZ);
260
261 err = xattr_iter_fixup(it);
262 if (err)
263 goto out;
264 it->ofs = 0;
265 }
266
267 slice = min_t(unsigned int, PAGE_SIZE - it->ofs,
268 entry.e_name_len - processed);
269
270 /* handle name */
271 err = op->name(it, processed, it->kaddr + it->ofs, slice);
272 if (err) {
273 it->ofs += entry.e_name_len - processed + value_sz;
274 goto out;
275 }
276
277 it->ofs += slice;
278 processed += slice;
279 }
280
281 /* 3. handle xattr value */
282 processed = 0;
283
284 if (op->alloc_buffer) {
285 err = op->alloc_buffer(it, value_sz);
286 if (err) {
287 it->ofs += value_sz;
288 goto out;
289 }
290 }
291
292 while (processed < value_sz) {
293 if (it->ofs >= EROFS_BLKSIZ) {
294 DBG_BUGON(it->ofs > EROFS_BLKSIZ);
295
296 err = xattr_iter_fixup(it);
297 if (err)
298 goto out;
299 it->ofs = 0;
300 }
301
302 slice = min_t(unsigned int, PAGE_SIZE - it->ofs,
303 value_sz - processed);
304 op->value(it, processed, it->kaddr + it->ofs, slice);
305 it->ofs += slice;
306 processed += slice;
307 }
308
309out:
310 /* xattrs should be 4-byte aligned (on-disk constraint) */
311 it->ofs = EROFS_XATTR_ALIGN(it->ofs);
312 return err < 0 ? err : 0;
313}
314
315struct getxattr_iter {
316 struct xattr_iter it;
317
318 char *buffer;
319 int buffer_size, index;
320 struct qstr name;
321};
322
323static int xattr_entrymatch(struct xattr_iter *_it,
324 struct erofs_xattr_entry *entry)
325{
326 struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it);
327
328 return (it->index != entry->e_name_index ||
329 it->name.len != entry->e_name_len) ? -ENOATTR : 0;
330}
331
332static int xattr_namematch(struct xattr_iter *_it,
333 unsigned int processed, char *buf, unsigned int len)
334{
335 struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it);
336
337 return memcmp(buf, it->name.name + processed, len) ? -ENOATTR : 0;
338}
339
340static int xattr_checkbuffer(struct xattr_iter *_it,
341 unsigned int value_sz)
342{
343 struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it);
344 int err = it->buffer_size < value_sz ? -ERANGE : 0;
345
346 it->buffer_size = value_sz;
347 return !it->buffer ? 1 : err;
348}
349
350static void xattr_copyvalue(struct xattr_iter *_it,
351 unsigned int processed,
352 char *buf, unsigned int len)
353{
354 struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it);
355
356 memcpy(it->buffer + processed, buf, len);
357}
358
359static const struct xattr_iter_handlers find_xattr_handlers = {
360 .entry = xattr_entrymatch,
361 .name = xattr_namematch,
362 .alloc_buffer = xattr_checkbuffer,
363 .value = xattr_copyvalue
364};
365
366static int inline_getxattr(struct inode *inode, struct getxattr_iter *it)
367{
368 int ret;
369 unsigned int remaining;
370
371 ret = inline_xattr_iter_begin(&it->it, inode);
372 if (ret < 0)
373 return ret;
374
375 remaining = ret;
376 while (remaining) {
377 ret = xattr_foreach(&it->it, &find_xattr_handlers, &remaining);
378 if (ret != -ENOATTR)
379 break;
380 }
381 xattr_iter_end_final(&it->it);
382
383 return ret ? ret : it->buffer_size;
384}
385
386static int shared_getxattr(struct inode *inode, struct getxattr_iter *it)
387{
388 struct erofs_vnode *const vi = EROFS_V(inode);
389 struct super_block *const sb = inode->i_sb;
390 struct erofs_sb_info *const sbi = EROFS_SB(sb);
391 unsigned int i;
392 int ret = -ENOATTR;
393
394 for (i = 0; i < vi->xattr_shared_count; ++i) {
395 erofs_blk_t blkaddr =
396 xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]);
397
398 it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]);
399
400 if (!i || blkaddr != it->it.blkaddr) {
401 if (i)
402 xattr_iter_end(&it->it, true);
403
404 it->it.page = erofs_get_meta_page(sb, blkaddr, false);
405 if (IS_ERR(it->it.page))
406 return PTR_ERR(it->it.page);
407
408 it->it.kaddr = kmap_atomic(it->it.page);
409 it->it.blkaddr = blkaddr;
410 }
411
412 ret = xattr_foreach(&it->it, &find_xattr_handlers, NULL);
413 if (ret != -ENOATTR)
414 break;
415 }
416 if (vi->xattr_shared_count)
417 xattr_iter_end_final(&it->it);
418
419 return ret ? ret : it->buffer_size;
420}
421
422static bool erofs_xattr_user_list(struct dentry *dentry)
423{
424 return test_opt(EROFS_SB(dentry->d_sb), XATTR_USER);
425}
426
427static bool erofs_xattr_trusted_list(struct dentry *dentry)
428{
429 return capable(CAP_SYS_ADMIN);
430}
431
432int erofs_getxattr(struct inode *inode, int index,
433 const char *name,
434 void *buffer, size_t buffer_size)
435{
436 int ret;
437 struct getxattr_iter it;
438
439 if (unlikely(!name))
440 return -EINVAL;
441
442 ret = init_inode_xattrs(inode);
443 if (ret)
444 return ret;
445
446 it.index = index;
447
448 it.name.len = strlen(name);
449 if (it.name.len > EROFS_NAME_LEN)
450 return -ERANGE;
451 it.name.name = name;
452
453 it.buffer = buffer;
454 it.buffer_size = buffer_size;
455
456 it.it.sb = inode->i_sb;
457 ret = inline_getxattr(inode, &it);
458 if (ret == -ENOATTR)
459 ret = shared_getxattr(inode, &it);
460 return ret;
461}
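/*
 * Editor's note: a hedged userspace sketch (not part of this file) of the
 * two-pass pattern erofs_getxattr() supports -- passing a NULL buffer makes
 * xattr_checkbuffer() report the value size, which mirrors getxattr(2)
 * called with size 0.  The demo_* names below are illustrative only.
 */
#include <stdlib.h>
#include <sys/xattr.h>

static void *demo_read_xattr(const char *path, const char *name,
			     ssize_t *out_len)
{
	ssize_t len = getxattr(path, name, NULL, 0);	/* probe the size */
	void *val;

	if (len < 0)
		return NULL;
	val = malloc(len ? len : 1);
	if (val && getxattr(path, name, val, len) != len) {
		free(val);
		val = NULL;
	}
	*out_len = len;
	return val;
}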
462
463static int erofs_xattr_generic_get(const struct xattr_handler *handler,
464 struct dentry *unused, struct inode *inode,
465 const char *name, void *buffer, size_t size)
466{
467 struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
468
469 switch (handler->flags) {
470 case EROFS_XATTR_INDEX_USER:
471 if (!test_opt(sbi, XATTR_USER))
472 return -EOPNOTSUPP;
473 break;
474 case EROFS_XATTR_INDEX_TRUSTED:
475 if (!capable(CAP_SYS_ADMIN))
476 return -EPERM;
477 break;
478 case EROFS_XATTR_INDEX_SECURITY:
479 break;
480 default:
481 return -EINVAL;
482 }
483
484 return erofs_getxattr(inode, handler->flags, name, buffer, size);
485}
486
487const struct xattr_handler erofs_xattr_user_handler = {
488 .prefix = XATTR_USER_PREFIX,
489 .flags = EROFS_XATTR_INDEX_USER,
490 .list = erofs_xattr_user_list,
491 .get = erofs_xattr_generic_get,
492};
493
494const struct xattr_handler erofs_xattr_trusted_handler = {
495 .prefix = XATTR_TRUSTED_PREFIX,
496 .flags = EROFS_XATTR_INDEX_TRUSTED,
497 .list = erofs_xattr_trusted_list,
498 .get = erofs_xattr_generic_get,
499};
500
501#ifdef CONFIG_EROFS_FS_SECURITY
502const struct xattr_handler __maybe_unused erofs_xattr_security_handler = {
503 .prefix = XATTR_SECURITY_PREFIX,
504 .flags = EROFS_XATTR_INDEX_SECURITY,
505 .get = erofs_xattr_generic_get,
506};
507#endif
508
509const struct xattr_handler *erofs_xattr_handlers[] = {
510 &erofs_xattr_user_handler,
511#ifdef CONFIG_EROFS_FS_POSIX_ACL
512 &posix_acl_access_xattr_handler,
513 &posix_acl_default_xattr_handler,
514#endif
515 &erofs_xattr_trusted_handler,
516#ifdef CONFIG_EROFS_FS_SECURITY
517 &erofs_xattr_security_handler,
518#endif
519 NULL,
520};
521
522struct listxattr_iter {
523 struct xattr_iter it;
524
525 struct dentry *dentry;
526 char *buffer;
527 int buffer_size, buffer_ofs;
528};
529
530static int xattr_entrylist(struct xattr_iter *_it,
531 struct erofs_xattr_entry *entry)
532{
533 struct listxattr_iter *it =
534 container_of(_it, struct listxattr_iter, it);
535 unsigned int prefix_len;
536 const char *prefix;
537
538 const struct xattr_handler *h =
539 erofs_xattr_handler(entry->e_name_index);
540
541 if (!h || (h->list && !h->list(it->dentry)))
542 return 1;
543
544 prefix = xattr_prefix(h);
545 prefix_len = strlen(prefix);
546
547 if (!it->buffer) {
548 it->buffer_ofs += prefix_len + entry->e_name_len + 1;
549 return 1;
550 }
551
552 if (it->buffer_ofs + prefix_len
553 + entry->e_name_len + 1 > it->buffer_size)
554 return -ERANGE;
555
556 memcpy(it->buffer + it->buffer_ofs, prefix, prefix_len);
557 it->buffer_ofs += prefix_len;
558 return 0;
559}
560
561static int xattr_namelist(struct xattr_iter *_it,
562 unsigned int processed, char *buf, unsigned int len)
563{
564 struct listxattr_iter *it =
565 container_of(_it, struct listxattr_iter, it);
566
567 memcpy(it->buffer + it->buffer_ofs, buf, len);
568 it->buffer_ofs += len;
569 return 0;
570}
571
572static int xattr_skipvalue(struct xattr_iter *_it,
573 unsigned int value_sz)
574{
575 struct listxattr_iter *it =
576 container_of(_it, struct listxattr_iter, it);
577
578 it->buffer[it->buffer_ofs++] = '\0';
579 return 1;
580}
581
582static const struct xattr_iter_handlers list_xattr_handlers = {
583 .entry = xattr_entrylist,
584 .name = xattr_namelist,
585 .alloc_buffer = xattr_skipvalue,
586 .value = NULL
587};
588
589static int inline_listxattr(struct listxattr_iter *it)
590{
591 int ret;
592 unsigned int remaining;
593
594 ret = inline_xattr_iter_begin(&it->it, d_inode(it->dentry));
595 if (ret < 0)
596 return ret;
597
598 remaining = ret;
599 while (remaining) {
600 ret = xattr_foreach(&it->it, &list_xattr_handlers, &remaining);
601 if (ret)
602 break;
603 }
604 xattr_iter_end_final(&it->it);
605 return ret ? ret : it->buffer_ofs;
606}
607
608static int shared_listxattr(struct listxattr_iter *it)
609{
610 struct inode *const inode = d_inode(it->dentry);
611 struct erofs_vnode *const vi = EROFS_V(inode);
612 struct super_block *const sb = inode->i_sb;
613 struct erofs_sb_info *const sbi = EROFS_SB(sb);
614 unsigned int i;
615 int ret = 0;
616
617 for (i = 0; i < vi->xattr_shared_count; ++i) {
618 erofs_blk_t blkaddr =
619 xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]);
620
621 it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]);
622 if (!i || blkaddr != it->it.blkaddr) {
623 if (i)
624 xattr_iter_end(&it->it, true);
625
626 it->it.page = erofs_get_meta_page(sb, blkaddr, false);
627 if (IS_ERR(it->it.page))
628 return PTR_ERR(it->it.page);
629
630 it->it.kaddr = kmap_atomic(it->it.page);
631 it->it.blkaddr = blkaddr;
632 }
633
634 ret = xattr_foreach(&it->it, &list_xattr_handlers, NULL);
635 if (ret)
636 break;
637 }
638 if (vi->xattr_shared_count)
639 xattr_iter_end_final(&it->it);
640
641 return ret ? ret : it->buffer_ofs;
642}
643
644ssize_t erofs_listxattr(struct dentry *dentry,
645 char *buffer, size_t buffer_size)
646{
647 int ret;
648 struct listxattr_iter it;
649
650 ret = init_inode_xattrs(d_inode(dentry));
651 if (ret)
652 return ret;
653
654 it.dentry = dentry;
655 it.buffer = buffer;
656 it.buffer_size = buffer_size;
657 it.buffer_ofs = 0;
658
659 it.it.sb = dentry->d_sb;
660
661 ret = inline_listxattr(&it);
662 if (ret < 0 && ret != -ENOATTR)
663 return ret;
664 return shared_listxattr(&it);
665}
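/*
 * Editor's note: a hedged userspace sketch (not part of this file) showing
 * how the NUL-separated "prefix+name" list built by xattr_entrylist()/
 * xattr_namelist()/xattr_skipvalue() is typically consumed via listxattr(2);
 * the initial NULL-buffer probe mirrors the !it->buffer path above.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/xattr.h>

static void demo_dump_xattr_names(const char *path)
{
	ssize_t len = listxattr(path, NULL, 0);	/* probe the list size */
	char *buf, *p;

	if (len <= 0)
		return;
	buf = malloc(len);
	if (!buf)
		return;
	len = listxattr(path, buf, len);
	for (p = buf; len > 0 && p < buf + len; p += strlen(p) + 1)
		printf("%s\n", p);	/* e.g. "user.foo", "security.selinux" */
	free(buf);
}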
666
667#ifdef CONFIG_EROFS_FS_POSIX_ACL
668struct posix_acl *erofs_get_acl(struct inode *inode, int type)
669{
670 struct posix_acl *acl;
671 int prefix, rc;
672 char *value = NULL;
673
674 switch (type) {
675 case ACL_TYPE_ACCESS:
676 prefix = EROFS_XATTR_INDEX_POSIX_ACL_ACCESS;
677 break;
678 case ACL_TYPE_DEFAULT:
679 prefix = EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT;
680 break;
681 default:
682 return ERR_PTR(-EINVAL);
683 }
684
685 rc = erofs_getxattr(inode, prefix, "", NULL, 0);
686 if (rc > 0) {
687 value = kmalloc(rc, GFP_KERNEL);
688 if (!value)
689 return ERR_PTR(-ENOMEM);
690 rc = erofs_getxattr(inode, prefix, "", value, rc);
691 }
692
693 if (rc == -ENOATTR)
694 acl = NULL;
695 else if (rc < 0)
696 acl = ERR_PTR(rc);
697 else
698 acl = posix_acl_from_xattr(&init_user_ns, value, rc);
699 kfree(value);
700 return acl;
701}
702#endif
703
diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h
new file mode 100644
index 000000000000..c5ca47d814dd
--- /dev/null
+++ b/fs/erofs/xattr.h
@@ -0,0 +1,92 @@
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * Copyright (C) 2017-2018 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#ifndef __EROFS_XATTR_H
8#define __EROFS_XATTR_H
9
10#include "internal.h"
11#include <linux/posix_acl_xattr.h>
12#include <linux/xattr.h>
13
14/* Attribute not found */
15#define ENOATTR ENODATA
16
17static inline unsigned int inlinexattr_header_size(struct inode *inode)
18{
19 return sizeof(struct erofs_xattr_ibody_header)
20 + sizeof(u32) * EROFS_V(inode)->xattr_shared_count;
21}
22
23static inline erofs_blk_t xattrblock_addr(struct erofs_sb_info *sbi,
24 unsigned int xattr_id)
25{
26#ifdef CONFIG_EROFS_FS_XATTR
27 return sbi->xattr_blkaddr +
28 xattr_id * sizeof(__u32) / EROFS_BLKSIZ;
29#else
30 return 0;
31#endif
32}
33
34static inline unsigned int xattrblock_offset(struct erofs_sb_info *sbi,
35 unsigned int xattr_id)
36{
37 return (xattr_id * sizeof(__u32)) % EROFS_BLKSIZ;
38}
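/*
 * Editor's note: a shared xattr id is an offset in 4-byte units from
 * xattr_blkaddr.  As a worked example, assuming 4KiB blocks
 * (EROFS_BLKSIZ == 4096), xattr_id 1030 maps to block
 * xattr_blkaddr + (1030 * 4) / 4096 == xattr_blkaddr + 1,
 * at byte offset (1030 * 4) % 4096 == 24.
 */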
39
40#ifdef CONFIG_EROFS_FS_XATTR
41extern const struct xattr_handler erofs_xattr_user_handler;
42extern const struct xattr_handler erofs_xattr_trusted_handler;
43#ifdef CONFIG_EROFS_FS_SECURITY
44extern const struct xattr_handler erofs_xattr_security_handler;
45#endif
46
47static inline const struct xattr_handler *erofs_xattr_handler(unsigned int idx)
48{
49static const struct xattr_handler *xattr_handler_map[] = {
50 [EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler,
51#ifdef CONFIG_EROFS_FS_POSIX_ACL
52 [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler,
53 [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] =
54 &posix_acl_default_xattr_handler,
55#endif
56 [EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler,
57#ifdef CONFIG_EROFS_FS_SECURITY
58 [EROFS_XATTR_INDEX_SECURITY] = &erofs_xattr_security_handler,
59#endif
60};
61
62 return idx && idx < ARRAY_SIZE(xattr_handler_map) ?
63 xattr_handler_map[idx] : NULL;
64}
65
66extern const struct xattr_handler *erofs_xattr_handlers[];
67
68int erofs_getxattr(struct inode *, int, const char *, void *, size_t);
69ssize_t erofs_listxattr(struct dentry *, char *, size_t);
70#else
71static inline int erofs_getxattr(struct inode *inode, int index,
72 const char *name, void *buffer,
73 size_t buffer_size)
74{
75 return -EOPNOTSUPP;
76}
77
78static inline ssize_t erofs_listxattr(struct dentry *dentry,
79 char *buffer, size_t buffer_size)
80{
81 return -EOPNOTSUPP;
82}
83#endif /* !CONFIG_EROFS_FS_XATTR */
84
85#ifdef CONFIG_EROFS_FS_POSIX_ACL
86struct posix_acl *erofs_get_acl(struct inode *inode, int type);
87#else
88#define erofs_get_acl (NULL)
89#endif
90
91#endif
92
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
new file mode 100644
index 000000000000..b32ad585237c
--- /dev/null
+++ b/fs/erofs/zdata.c
@@ -0,0 +1,1432 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2018 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#include "zdata.h"
8#include "compress.h"
9#include <linux/prefetch.h>
10
11#include <trace/events/erofs.h>
12
13/*
14 * a compressed_pages[] placeholder in order to avoid
15 * being filled with file pages for in-place decompression.
16 */
17#define PAGE_UNALLOCATED ((void *)0x5F0E4B1D)
18
19/* how to allocate cached pages for a pcluster */
20enum z_erofs_cache_alloctype {
21 DONTALLOC, /* don't allocate any cached pages */
22 DELAYEDALLOC, /* delayed allocation (at the time of submitting io) */
23};
24
25/*
26 * tagged pointer with 1-bit tag for all compressed pages
27 * tag 1 - the page is just found with an extra page reference
28 */
29typedef tagptr1_t compressed_page_t;
30
31#define tag_compressed_page_justfound(page) \
32 tagptr_fold(compressed_page_t, page, 1)
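/*
 * Editor's note: an illustrative, self-contained sketch of the 1-bit
 * tagged-pointer idea used here (not the actual tagptr.h helpers).  It
 * assumes pointers are at least 2-byte aligned so the lowest bit is free;
 * the demo_* names are made up for illustration.
 */
#include <stdint.h>

static inline uintptr_t demo_fold(void *ptr, unsigned int tag)
{
	return (uintptr_t)ptr | (tag & 1);	/* stash the tag in the low bit */
}

static inline void *demo_unfold_ptr(uintptr_t t)
{
	return (void *)(t & ~(uintptr_t)1);	/* mask the tag off */
}

static inline unsigned int demo_unfold_tag(uintptr_t t)
{
	return t & 1;
}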
33
34static struct workqueue_struct *z_erofs_workqueue __read_mostly;
35static struct kmem_cache *pcluster_cachep __read_mostly;
36
37void z_erofs_exit_zip_subsystem(void)
38{
39 destroy_workqueue(z_erofs_workqueue);
40 kmem_cache_destroy(pcluster_cachep);
41}
42
43static inline int init_unzip_workqueue(void)
44{
45 const unsigned int onlinecpus = num_possible_cpus();
46 const unsigned int flags = WQ_UNBOUND | WQ_HIGHPRI | WQ_CPU_INTENSIVE;
47
48 /*
49	 * no need to spawn too many threads; limiting threads can minimize
50	 * scheduling overhead, perhaps per-CPU threads would be better?
51 */
52 z_erofs_workqueue = alloc_workqueue("erofs_unzipd", flags,
53 onlinecpus + onlinecpus / 4);
54 return z_erofs_workqueue ? 0 : -ENOMEM;
55}
56
57static void init_once(void *ptr)
58{
59 struct z_erofs_pcluster *pcl = ptr;
60 struct z_erofs_collection *cl = z_erofs_primarycollection(pcl);
61 unsigned int i;
62
63 mutex_init(&cl->lock);
64 cl->nr_pages = 0;
65 cl->vcnt = 0;
66 for (i = 0; i < Z_EROFS_CLUSTER_MAX_PAGES; ++i)
67 pcl->compressed_pages[i] = NULL;
68}
69
70static void init_always(struct z_erofs_pcluster *pcl)
71{
72 struct z_erofs_collection *cl = z_erofs_primarycollection(pcl);
73
74 atomic_set(&pcl->obj.refcount, 1);
75
76 DBG_BUGON(cl->nr_pages);
77 DBG_BUGON(cl->vcnt);
78}
79
80int __init z_erofs_init_zip_subsystem(void)
81{
82 pcluster_cachep = kmem_cache_create("erofs_compress",
83 Z_EROFS_WORKGROUP_SIZE, 0,
84 SLAB_RECLAIM_ACCOUNT, init_once);
85 if (pcluster_cachep) {
86 if (!init_unzip_workqueue())
87 return 0;
88
89 kmem_cache_destroy(pcluster_cachep);
90 }
91 return -ENOMEM;
92}
93
94enum z_erofs_collectmode {
95 COLLECT_SECONDARY,
96 COLLECT_PRIMARY,
97 /*
98 * The current collection was the tail of an existing chain, and
99 * the previously processed chained collections have all been decided
100 * to be hooked up to it.
101 * A new chain will be created for the remaining collections which are
102 * not processed yet; therefore, unlike COLLECT_PRIMARY_FOLLOWED,
103 * the next collection cannot reuse the whole page safely in
104 * the following scenario:
105 * ________________________________________________________________
106 * | tail (partial) page | head (partial) page |
107 * | (belongs to the next cl) | (belongs to the current cl) |
108 * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________|
109 */
110 COLLECT_PRIMARY_HOOKED,
111 COLLECT_PRIMARY_FOLLOWED_NOINPLACE,
112 /*
113 * The current collection has been linked with the owned chain, and
114 * could also be linked with the remaining collections, which means
115 * that if the page being processed is the tail page of the collection,
116 * the current collection can safely use the whole page (since
117 * the previous collection is under control) for in-place I/O, as
118 * illustrated below:
119 * ________________________________________________________________
120 * | tail (partial) page | head (partial) page |
121 * | (of the current cl) | (of the previous collection) |
122 * | PRIMARY_FOLLOWED or | |
123 * |_____PRIMARY_HOOKED___|____________PRIMARY_FOLLOWED____________|
124 *
125 * [ (*) the above page can be used as inplace I/O. ]
126 */
127 COLLECT_PRIMARY_FOLLOWED,
128};
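/*
 * Editor's note: the enum order above is meaningful -- later code compares
 * modes numerically (e.g. "clt->mode < COLLECT_PRIMARY_FOLLOWED" in
 * preload_compressed_pages() and "clt->mode >= COLLECT_PRIMARY_HOOKED" in
 * z_erofs_do_read_page()), so any new mode must preserve this ordering.
 */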
129
130struct z_erofs_collector {
131 struct z_erofs_pagevec_ctor vector;
132
133 struct z_erofs_pcluster *pcl, *tailpcl;
134 struct z_erofs_collection *cl;
135 struct page **compressedpages;
136 z_erofs_next_pcluster_t owned_head;
137
138 enum z_erofs_collectmode mode;
139};
140
141struct z_erofs_decompress_frontend {
142 struct inode *const inode;
143
144 struct z_erofs_collector clt;
145 struct erofs_map_blocks map;
146
147 /* used for applying cache strategy on the fly */
148 bool backmost;
149 erofs_off_t headoffset;
150};
151
152#define COLLECTOR_INIT() { \
153 .owned_head = Z_EROFS_PCLUSTER_TAIL, \
154 .mode = COLLECT_PRIMARY_FOLLOWED }
155
156#define DECOMPRESS_FRONTEND_INIT(__i) { \
157 .inode = __i, .clt = COLLECTOR_INIT(), \
158 .backmost = true, }
159
160static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES];
161static DEFINE_MUTEX(z_pagemap_global_lock);
162
163static void preload_compressed_pages(struct z_erofs_collector *clt,
164 struct address_space *mc,
165 enum z_erofs_cache_alloctype type,
166 struct list_head *pagepool)
167{
168 const struct z_erofs_pcluster *pcl = clt->pcl;
169 const unsigned int clusterpages = BIT(pcl->clusterbits);
170 struct page **pages = clt->compressedpages;
171 pgoff_t index = pcl->obj.index + (pages - pcl->compressed_pages);
172 bool standalone = true;
173
174 if (clt->mode < COLLECT_PRIMARY_FOLLOWED)
175 return;
176
177 for (; pages < pcl->compressed_pages + clusterpages; ++pages) {
178 struct page *page;
179 compressed_page_t t;
180
181 /* the compressed page was loaded before */
182 if (READ_ONCE(*pages))
183 continue;
184
185 page = find_get_page(mc, index);
186
187 if (page) {
188 t = tag_compressed_page_justfound(page);
189 } else if (type == DELAYEDALLOC) {
190 t = tagptr_init(compressed_page_t, PAGE_UNALLOCATED);
191 } else { /* DONTALLOC */
192 if (standalone)
193 clt->compressedpages = pages;
194 standalone = false;
195 continue;
196 }
197
198 if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t)))
199 continue;
200
201 if (page)
202 put_page(page);
203 }
204
205 if (standalone) /* downgrade to PRIMARY_FOLLOWED_NOINPLACE */
206 clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
207}
208
209/* called by erofs_shrinker to get rid of all compressed_pages */
210int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
211 struct erofs_workgroup *grp)
212{
213 struct z_erofs_pcluster *const pcl =
214 container_of(grp, struct z_erofs_pcluster, obj);
215 struct address_space *const mapping = MNGD_MAPPING(sbi);
216 const unsigned int clusterpages = BIT(pcl->clusterbits);
217 int i;
218
219 /*
220	 * the workgroup's refcount is now frozen at 1,
221 * therefore no need to worry about available decompression users.
222 */
223 for (i = 0; i < clusterpages; ++i) {
224 struct page *page = pcl->compressed_pages[i];
225
226 if (!page)
227 continue;
228
229 /* block other users from reclaiming or migrating the page */
230 if (!trylock_page(page))
231 return -EBUSY;
232
233 if (unlikely(page->mapping != mapping))
234 continue;
235
236 /* barrier is implied in the following 'unlock_page' */
237 WRITE_ONCE(pcl->compressed_pages[i], NULL);
238 set_page_private(page, 0);
239 ClearPagePrivate(page);
240
241 unlock_page(page);
242 put_page(page);
243 }
244 return 0;
245}
246
247int erofs_try_to_free_cached_page(struct address_space *mapping,
248 struct page *page)
249{
250 struct z_erofs_pcluster *const pcl = (void *)page_private(page);
251 const unsigned int clusterpages = BIT(pcl->clusterbits);
252 int ret = 0; /* 0 - busy */
253
254 if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) {
255 unsigned int i;
256
257 for (i = 0; i < clusterpages; ++i) {
258 if (pcl->compressed_pages[i] == page) {
259 WRITE_ONCE(pcl->compressed_pages[i], NULL);
260 ret = 1;
261 break;
262 }
263 }
264 erofs_workgroup_unfreeze(&pcl->obj, 1);
265
266 if (ret) {
267 ClearPagePrivate(page);
268 put_page(page);
269 }
270 }
271 return ret;
272}
273
274/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
275static inline bool try_inplace_io(struct z_erofs_collector *clt,
276 struct page *page)
277{
278 struct z_erofs_pcluster *const pcl = clt->pcl;
279 const unsigned int clusterpages = BIT(pcl->clusterbits);
280
281 while (clt->compressedpages < pcl->compressed_pages + clusterpages) {
282 if (!cmpxchg(clt->compressedpages++, NULL, page))
283 return true;
284 }
285 return false;
286}
287
288 /* callers must hold the collection lock */
289static int z_erofs_attach_page(struct z_erofs_collector *clt,
290 struct page *page,
291 enum z_erofs_page_type type)
292{
293 int ret;
294 bool occupied;
295
296	/* give priority to in-place I/O */
297 if (clt->mode >= COLLECT_PRIMARY &&
298 type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
299 try_inplace_io(clt, page))
300 return 0;
301
302 ret = z_erofs_pagevec_enqueue(&clt->vector,
303 page, type, &occupied);
304 clt->cl->vcnt += (unsigned int)ret;
305
306 return ret ? 0 : -EAGAIN;
307}
308
309static enum z_erofs_collectmode
310try_to_claim_pcluster(struct z_erofs_pcluster *pcl,
311 z_erofs_next_pcluster_t *owned_head)
312{
313	/* let's claim the following types of pclusters */
314retry:
315 if (pcl->next == Z_EROFS_PCLUSTER_NIL) {
316 /* type 1, nil pcluster */
317 if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
318 *owned_head) != Z_EROFS_PCLUSTER_NIL)
319 goto retry;
320
321 *owned_head = &pcl->next;
322 /* lucky, I am the followee :) */
323 return COLLECT_PRIMARY_FOLLOWED;
324 } else if (pcl->next == Z_EROFS_PCLUSTER_TAIL) {
325 /*
326		 * type 2, link to the end of an existing open chain,
327 * be careful that its submission itself is governed
328 * by the original owned chain.
329 */
330 if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
331 *owned_head) != Z_EROFS_PCLUSTER_TAIL)
332 goto retry;
333 *owned_head = Z_EROFS_PCLUSTER_TAIL;
334 return COLLECT_PRIMARY_HOOKED;
335 }
336 return COLLECT_PRIMARY; /* :( better luck next time */
337}
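/*
 * Editor's note: a minimal userspace analogue (not part of this file) of the
 * lockless claiming above, using C11 atomics in place of the kernel's
 * cmpxchg(); the demo_* names are made up and only the compare-and-swap
 * retry shape is the point.
 */
#include <stdatomic.h>

struct demo_pcluster {
	_Atomic(void *) next;
};

/* link @new_head in only if @pcl->next still holds @expected */
static int demo_try_claim(struct demo_pcluster *pcl,
			  void *expected, void *new_head)
{
	void *old = expected;

	/* nonzero only if nobody raced with us; callers retry otherwise */
	return atomic_compare_exchange_strong(&pcl->next, &old, new_head);
}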
338
339static struct z_erofs_collection *cllookup(struct z_erofs_collector *clt,
340 struct inode *inode,
341 struct erofs_map_blocks *map)
342{
343 struct erofs_workgroup *grp;
344 struct z_erofs_pcluster *pcl;
345 struct z_erofs_collection *cl;
346 unsigned int length;
347 bool tag;
348
349 grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT, &tag);
350 if (!grp)
351 return NULL;
352
353 pcl = container_of(grp, struct z_erofs_pcluster, obj);
354 if (clt->owned_head == &pcl->next || pcl == clt->tailpcl) {
355 DBG_BUGON(1);
356 erofs_workgroup_put(grp);
357 return ERR_PTR(-EFSCORRUPTED);
358 }
359
360 cl = z_erofs_primarycollection(pcl);
361 if (unlikely(cl->pageofs != (map->m_la & ~PAGE_MASK))) {
362 DBG_BUGON(1);
363 erofs_workgroup_put(grp);
364 return ERR_PTR(-EFSCORRUPTED);
365 }
366
367 length = READ_ONCE(pcl->length);
368 if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) {
369 if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) {
370 DBG_BUGON(1);
371 erofs_workgroup_put(grp);
372 return ERR_PTR(-EFSCORRUPTED);
373 }
374 } else {
375 unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT;
376
377 if (map->m_flags & EROFS_MAP_FULL_MAPPED)
378 llen |= Z_EROFS_PCLUSTER_FULL_LENGTH;
379
380 while (llen > length &&
381 length != cmpxchg_relaxed(&pcl->length, length, llen)) {
382 cpu_relax();
383 length = READ_ONCE(pcl->length);
384 }
385 }
386 mutex_lock(&cl->lock);
387 /* used to check tail merging loop due to corrupted images */
388 if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
389 clt->tailpcl = pcl;
390 clt->mode = try_to_claim_pcluster(pcl, &clt->owned_head);
391 /* clean tailpcl if the current owned_head is Z_EROFS_PCLUSTER_TAIL */
392 if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
393 clt->tailpcl = NULL;
394 clt->pcl = pcl;
395 clt->cl = cl;
396 return cl;
397}
398
399static struct z_erofs_collection *clregister(struct z_erofs_collector *clt,
400 struct inode *inode,
401 struct erofs_map_blocks *map)
402{
403 struct z_erofs_pcluster *pcl;
404 struct z_erofs_collection *cl;
405 int err;
406
407 /* no available workgroup, let's allocate one */
408 pcl = kmem_cache_alloc(pcluster_cachep, GFP_NOFS);
409 if (unlikely(!pcl))
410 return ERR_PTR(-ENOMEM);
411
412 init_always(pcl);
413 pcl->obj.index = map->m_pa >> PAGE_SHIFT;
414
415 pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) |
416 (map->m_flags & EROFS_MAP_FULL_MAPPED ?
417 Z_EROFS_PCLUSTER_FULL_LENGTH : 0);
418
419 if (map->m_flags & EROFS_MAP_ZIPPED)
420 pcl->algorithmformat = Z_EROFS_COMPRESSION_LZ4;
421 else
422 pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
423
424 pcl->clusterbits = EROFS_V(inode)->z_physical_clusterbits[0];
425 pcl->clusterbits -= PAGE_SHIFT;
426
427 /* new pclusters should be claimed as type 1, primary and followed */
428 pcl->next = clt->owned_head;
429 clt->mode = COLLECT_PRIMARY_FOLLOWED;
430
431 cl = z_erofs_primarycollection(pcl);
432 cl->pageofs = map->m_la & ~PAGE_MASK;
433
434 /*
435	 * lock all primary followed works before they become visible to others
436 * and mutex_trylock *never* fails for a new pcluster.
437 */
438 mutex_trylock(&cl->lock);
439
440 err = erofs_register_workgroup(inode->i_sb, &pcl->obj, 0);
441 if (err) {
442 mutex_unlock(&cl->lock);
443 kmem_cache_free(pcluster_cachep, pcl);
444 return ERR_PTR(-EAGAIN);
445 }
446 /* used to check tail merging loop due to corrupted images */
447 if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
448 clt->tailpcl = pcl;
449 clt->owned_head = &pcl->next;
450 clt->pcl = pcl;
451 clt->cl = cl;
452 return cl;
453}
454
455static int z_erofs_collector_begin(struct z_erofs_collector *clt,
456 struct inode *inode,
457 struct erofs_map_blocks *map)
458{
459 struct z_erofs_collection *cl;
460
461 DBG_BUGON(clt->cl);
462
463 /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous collection */
464 DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_NIL);
465 DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
466
467 if (!PAGE_ALIGNED(map->m_pa)) {
468 DBG_BUGON(1);
469 return -EINVAL;
470 }
471
472repeat:
473 cl = cllookup(clt, inode, map);
474 if (!cl) {
475 cl = clregister(clt, inode, map);
476
477 if (unlikely(cl == ERR_PTR(-EAGAIN)))
478 goto repeat;
479 }
480
481 if (IS_ERR(cl))
482 return PTR_ERR(cl);
483
484 z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS,
485 cl->pagevec, cl->vcnt);
486
487 clt->compressedpages = clt->pcl->compressed_pages;
488 if (clt->mode <= COLLECT_PRIMARY) /* cannot do in-place I/O */
489 clt->compressedpages += Z_EROFS_CLUSTER_MAX_PAGES;
490 return 0;
491}
492
493/*
494 * keep in mind that referenced pclusters will be freed
495 * only after an RCU grace period.
496 */
497static void z_erofs_rcu_callback(struct rcu_head *head)
498{
499 struct z_erofs_collection *const cl =
500 container_of(head, struct z_erofs_collection, rcu);
501
502 kmem_cache_free(pcluster_cachep,
503 container_of(cl, struct z_erofs_pcluster,
504 primary_collection));
505}
506
507void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
508{
509 struct z_erofs_pcluster *const pcl =
510 container_of(grp, struct z_erofs_pcluster, obj);
511 struct z_erofs_collection *const cl = z_erofs_primarycollection(pcl);
512
513 call_rcu(&cl->rcu, z_erofs_rcu_callback);
514}
515
516static void z_erofs_collection_put(struct z_erofs_collection *cl)
517{
518 struct z_erofs_pcluster *const pcl =
519 container_of(cl, struct z_erofs_pcluster, primary_collection);
520
521 erofs_workgroup_put(&pcl->obj);
522}
523
524static bool z_erofs_collector_end(struct z_erofs_collector *clt)
525{
526 struct z_erofs_collection *cl = clt->cl;
527
528 if (!cl)
529 return false;
530
531 z_erofs_pagevec_ctor_exit(&clt->vector, false);
532 mutex_unlock(&cl->lock);
533
534 /*
535	 * if all pending pages have been added, don't hold the collection's
536	 * reference any longer if the pcluster isn't hosted by ourselves.
537 */
538 if (clt->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE)
539 z_erofs_collection_put(cl);
540
541 clt->cl = NULL;
542 return true;
543}
544
545static inline struct page *__stagingpage_alloc(struct list_head *pagepool,
546 gfp_t gfp)
547{
548 struct page *page = erofs_allocpage(pagepool, gfp, true);
549
550 page->mapping = Z_EROFS_MAPPING_STAGING;
551 return page;
552}
553
554static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
555 unsigned int cachestrategy,
556 erofs_off_t la)
557{
558 if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
559 return false;
560
561 if (fe->backmost)
562 return true;
563
564 return cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
565 la < fe->headoffset;
566}
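/*
 * Editor's note: a worked example of the policy above -- with
 * EROFS_ZIP_CACHE_READAROUND and a read starting at headoffset == 1MiB,
 * a pcluster at la == 512KiB gets cached pages (DELAYEDALLOC) while one at
 * la == 2MiB does not, unless fe->backmost is still true.
 */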
567
568static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
569 struct page *page,
570 struct list_head *pagepool)
571{
572 struct inode *const inode = fe->inode;
573 struct erofs_sb_info *const sbi __maybe_unused = EROFS_I_SB(inode);
574 struct erofs_map_blocks *const map = &fe->map;
575 struct z_erofs_collector *const clt = &fe->clt;
576 const loff_t offset = page_offset(page);
577 bool tight = (clt->mode >= COLLECT_PRIMARY_HOOKED);
578
579 enum z_erofs_cache_alloctype cache_strategy;
580 enum z_erofs_page_type page_type;
581 unsigned int cur, end, spiltted, index;
582 int err = 0;
583
584 /* register locked file pages as online pages in pack */
585 z_erofs_onlinepage_init(page);
586
587 spiltted = 0;
588 end = PAGE_SIZE;
589repeat:
590 cur = end - 1;
591
592 /* lucky, within the range of the current map_blocks */
593 if (offset + cur >= map->m_la &&
594 offset + cur < map->m_la + map->m_llen) {
595 /* didn't get a valid collection previously (very rare) */
596 if (!clt->cl)
597 goto restart_now;
598 goto hitted;
599 }
600
601	/* go ahead to the next map_blocks */
602 debugln("%s: [out-of-range] pos %llu", __func__, offset + cur);
603
604 if (z_erofs_collector_end(clt))
605 fe->backmost = false;
606
607 map->m_la = offset + cur;
608 map->m_llen = 0;
609 err = z_erofs_map_blocks_iter(inode, map, 0);
610 if (unlikely(err))
611 goto err_out;
612
613restart_now:
614 if (unlikely(!(map->m_flags & EROFS_MAP_MAPPED)))
615 goto hitted;
616
617 err = z_erofs_collector_begin(clt, inode, map);
618 if (unlikely(err))
619 goto err_out;
620
621 /* preload all compressed pages (maybe downgrade role if necessary) */
622 if (should_alloc_managed_pages(fe, sbi->cache_strategy, map->m_la))
623 cache_strategy = DELAYEDALLOC;
624 else
625 cache_strategy = DONTALLOC;
626
627 preload_compressed_pages(clt, MNGD_MAPPING(sbi),
628 cache_strategy, pagepool);
629
630 tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED);
631hitted:
632 cur = end - min_t(unsigned int, offset + end - map->m_la, end);
633 if (unlikely(!(map->m_flags & EROFS_MAP_MAPPED))) {
634 zero_user_segment(page, cur, end);
635 goto next_part;
636 }
637
638 /* let's derive page type */
639 page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD :
640 (!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
641 (tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
642 Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));
643
644 if (cur)
645 tight &= (clt->mode >= COLLECT_PRIMARY_FOLLOWED);
646
647retry:
648 err = z_erofs_attach_page(clt, page, page_type);
649 /* should allocate an additional staging page for pagevec */
650 if (err == -EAGAIN) {
651 struct page *const newpage =
652 __stagingpage_alloc(pagepool, GFP_NOFS);
653
654 err = z_erofs_attach_page(clt, newpage,
655 Z_EROFS_PAGE_TYPE_EXCLUSIVE);
656 if (likely(!err))
657 goto retry;
658 }
659
660 if (unlikely(err))
661 goto err_out;
662
663 index = page->index - (map->m_la >> PAGE_SHIFT);
664
665 z_erofs_onlinepage_fixup(page, index, true);
666
667 /* bump up the number of spiltted parts of a page */
668 ++spiltted;
669 /* also update nr_pages */
670 clt->cl->nr_pages = max_t(pgoff_t, clt->cl->nr_pages, index + 1);
671next_part:
672 /* can be used for verification */
673 map->m_llen = offset + cur - map->m_la;
674
675 end = cur;
676 if (end > 0)
677 goto repeat;
678
679out:
680 z_erofs_onlinepage_endio(page);
681
682 debugln("%s, finish page: %pK spiltted: %u map->m_llen %llu",
683 __func__, page, spiltted, map->m_llen);
684 return err;
685
686 /* if some error occurred while processing this page */
687err_out:
688 SetPageError(page);
689 goto out;
690}
691
692static void z_erofs_vle_unzip_kickoff(void *ptr, int bios)
693{
694 tagptr1_t t = tagptr_init(tagptr1_t, ptr);
695 struct z_erofs_unzip_io *io = tagptr_unfold_ptr(t);
696 bool background = tagptr_unfold_tags(t);
697
698 if (!background) {
699 unsigned long flags;
700
701 spin_lock_irqsave(&io->u.wait.lock, flags);
702 if (!atomic_add_return(bios, &io->pending_bios))
703 wake_up_locked(&io->u.wait);
704 spin_unlock_irqrestore(&io->u.wait.lock, flags);
705 return;
706 }
707
708 if (!atomic_add_return(bios, &io->pending_bios))
709 queue_work(z_erofs_workqueue, &io->u.work);
710}
711
712static inline void z_erofs_vle_read_endio(struct bio *bio)
713{
714 struct erofs_sb_info *sbi = NULL;
715 blk_status_t err = bio->bi_status;
716 struct bio_vec *bvec;
717 struct bvec_iter_all iter_all;
718
719 bio_for_each_segment_all(bvec, bio, iter_all) {
720 struct page *page = bvec->bv_page;
721 bool cachemngd = false;
722
723 DBG_BUGON(PageUptodate(page));
724 DBG_BUGON(!page->mapping);
725
726 if (unlikely(!sbi && !z_erofs_page_is_staging(page))) {
727 sbi = EROFS_SB(page->mapping->host->i_sb);
728
729 if (time_to_inject(sbi, FAULT_READ_IO)) {
730 erofs_show_injection_info(FAULT_READ_IO);
731 err = BLK_STS_IOERR;
732 }
733 }
734
735		/* sbi should already have been set if the page is managed */
736 if (sbi)
737 cachemngd = erofs_page_is_managed(sbi, page);
738
739 if (unlikely(err))
740 SetPageError(page);
741 else if (cachemngd)
742 SetPageUptodate(page);
743
744 if (cachemngd)
745 unlock_page(page);
746 }
747
748 z_erofs_vle_unzip_kickoff(bio->bi_private, -1);
749 bio_put(bio);
750}
751
752static int z_erofs_decompress_pcluster(struct super_block *sb,
753 struct z_erofs_pcluster *pcl,
754 struct list_head *pagepool)
755{
756 struct erofs_sb_info *const sbi = EROFS_SB(sb);
757 const unsigned int clusterpages = BIT(pcl->clusterbits);
758 struct z_erofs_pagevec_ctor ctor;
759 unsigned int i, outputsize, llen, nr_pages;
760 struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES];
761 struct page **pages, **compressed_pages, *page;
762
763 enum z_erofs_page_type page_type;
764 bool overlapped, partial;
765 struct z_erofs_collection *cl;
766 int err;
767
768 might_sleep();
769 cl = z_erofs_primarycollection(pcl);
770 DBG_BUGON(!READ_ONCE(cl->nr_pages));
771
772 mutex_lock(&cl->lock);
773 nr_pages = cl->nr_pages;
774
775 if (likely(nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES)) {
776 pages = pages_onstack;
777 } else if (nr_pages <= Z_EROFS_VMAP_GLOBAL_PAGES &&
778 mutex_trylock(&z_pagemap_global_lock)) {
779 pages = z_pagemap_global;
780 } else {
781 gfp_t gfp_flags = GFP_KERNEL;
782
783 if (nr_pages > Z_EROFS_VMAP_GLOBAL_PAGES)
784 gfp_flags |= __GFP_NOFAIL;
785
786 pages = kvmalloc_array(nr_pages, sizeof(struct page *),
787 gfp_flags);
788
789 /* fallback to global pagemap for the lowmem scenario */
790 if (unlikely(!pages)) {
791 mutex_lock(&z_pagemap_global_lock);
792 pages = z_pagemap_global;
793 }
794 }
795
796 for (i = 0; i < nr_pages; ++i)
797 pages[i] = NULL;
798
799 err = 0;
800 z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS,
801 cl->pagevec, 0);
802
803 for (i = 0; i < cl->vcnt; ++i) {
804 unsigned int pagenr;
805
806 page = z_erofs_pagevec_dequeue(&ctor, &page_type);
807
808 /* all pages in pagevec ought to be valid */
809 DBG_BUGON(!page);
810 DBG_BUGON(!page->mapping);
811
812 if (z_erofs_put_stagingpage(pagepool, page))
813 continue;
814
815 if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD)
816 pagenr = 0;
817 else
818 pagenr = z_erofs_onlinepage_index(page);
819
820 DBG_BUGON(pagenr >= nr_pages);
821
822 /*
823		 * currently EROFS doesn't support multiref (dedup),
824		 * so error out here if a multiref page is found.
825 */
826 if (unlikely(pages[pagenr])) {
827 DBG_BUGON(1);
828 SetPageError(pages[pagenr]);
829 z_erofs_onlinepage_endio(pages[pagenr]);
830 err = -EFSCORRUPTED;
831 }
832 pages[pagenr] = page;
833 }
834 z_erofs_pagevec_ctor_exit(&ctor, true);
835
836 overlapped = false;
837 compressed_pages = pcl->compressed_pages;
838
839 for (i = 0; i < clusterpages; ++i) {
840 unsigned int pagenr;
841
842 page = compressed_pages[i];
843
844 /* all compressed pages ought to be valid */
845 DBG_BUGON(!page);
846 DBG_BUGON(!page->mapping);
847
848 if (!z_erofs_page_is_staging(page)) {
849 if (erofs_page_is_managed(sbi, page)) {
850 if (unlikely(!PageUptodate(page)))
851 err = -EIO;
852 continue;
853 }
854
855 /*
856			 * only non-head pages can be selected
857			 * for in-place decompression
858 */
859 pagenr = z_erofs_onlinepage_index(page);
860
861 DBG_BUGON(pagenr >= nr_pages);
862 if (unlikely(pages[pagenr])) {
863 DBG_BUGON(1);
864 SetPageError(pages[pagenr]);
865 z_erofs_onlinepage_endio(pages[pagenr]);
866 err = -EFSCORRUPTED;
867 }
868 pages[pagenr] = page;
869
870 overlapped = true;
871 }
872
873		/* PG_error needs checking for in-place I/O and staging pages */
874 if (unlikely(PageError(page))) {
875 DBG_BUGON(PageUptodate(page));
876 err = -EIO;
877 }
878 }
879
880 if (unlikely(err))
881 goto out;
882
883 llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT;
884 if (nr_pages << PAGE_SHIFT >= cl->pageofs + llen) {
885 outputsize = llen;
886 partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH);
887 } else {
888 outputsize = (nr_pages << PAGE_SHIFT) - cl->pageofs;
889 partial = true;
890 }
891
892 err = z_erofs_decompress(&(struct z_erofs_decompress_req) {
893 .sb = sb,
894 .in = compressed_pages,
895 .out = pages,
896 .pageofs_out = cl->pageofs,
897 .inputsize = PAGE_SIZE,
898 .outputsize = outputsize,
899 .alg = pcl->algorithmformat,
900 .inplace_io = overlapped,
901 .partial_decoding = partial
902 }, pagepool);
903
904out:
905	/* must handle all compressed pages before ending the file pages */
906 for (i = 0; i < clusterpages; ++i) {
907 page = compressed_pages[i];
908
909 if (erofs_page_is_managed(sbi, page))
910 continue;
911
912 /* recycle all individual staging pages */
913 (void)z_erofs_put_stagingpage(pagepool, page);
914
915 WRITE_ONCE(compressed_pages[i], NULL);
916 }
917
918 for (i = 0; i < nr_pages; ++i) {
919 page = pages[i];
920 if (!page)
921 continue;
922
923 DBG_BUGON(!page->mapping);
924
925 /* recycle all individual staging pages */
926 if (z_erofs_put_stagingpage(pagepool, page))
927 continue;
928
929 if (unlikely(err < 0))
930 SetPageError(page);
931
932 z_erofs_onlinepage_endio(page);
933 }
934
935 if (pages == z_pagemap_global)
936 mutex_unlock(&z_pagemap_global_lock);
937 else if (unlikely(pages != pages_onstack))
938 kvfree(pages);
939
940 cl->nr_pages = 0;
941 cl->vcnt = 0;
942
943 /* all cl locks MUST be taken before the following line */
944 WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);
945
946 /* all cl locks SHOULD be released right now */
947 mutex_unlock(&cl->lock);
948
949 z_erofs_collection_put(cl);
950 return err;
951}
952
953static void z_erofs_vle_unzip_all(struct super_block *sb,
954 struct z_erofs_unzip_io *io,
955 struct list_head *pagepool)
956{
957 z_erofs_next_pcluster_t owned = io->head;
958
959 while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) {
960 struct z_erofs_pcluster *pcl;
961
962		/* it's impossible that 'owned' equals Z_EROFS_PCLUSTER_TAIL */
963 DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL);
964
965		/* it's impossible that 'owned' equals NULL */
966 DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);
967
968 pcl = container_of(owned, struct z_erofs_pcluster, next);
969 owned = READ_ONCE(pcl->next);
970
971 z_erofs_decompress_pcluster(sb, pcl, pagepool);
972 }
973}
974
975static void z_erofs_vle_unzip_wq(struct work_struct *work)
976{
977 struct z_erofs_unzip_io_sb *iosb =
978 container_of(work, struct z_erofs_unzip_io_sb, io.u.work);
979 LIST_HEAD(pagepool);
980
981 DBG_BUGON(iosb->io.head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
982 z_erofs_vle_unzip_all(iosb->sb, &iosb->io, &pagepool);
983
984 put_pages_list(&pagepool);
985 kvfree(iosb);
986}
987
988static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
989 unsigned int nr,
990 struct list_head *pagepool,
991 struct address_space *mc,
992 gfp_t gfp)
993{
994 /* determined at compile time to avoid too many #ifdefs */
995 const bool nocache = __builtin_constant_p(mc) ? !mc : false;
996 const pgoff_t index = pcl->obj.index;
997 bool tocache = false;
998
999 struct address_space *mapping;
1000 struct page *oldpage, *page;
1001
1002 compressed_page_t t;
1003 int justfound;
1004
1005repeat:
1006 page = READ_ONCE(pcl->compressed_pages[nr]);
1007 oldpage = page;
1008
1009 if (!page)
1010 goto out_allocpage;
1011
1012 /*
1013 * the cached page has not been allocated and
1014	 * a placeholder is out there, prepare it now.
1015 */
1016 if (!nocache && page == PAGE_UNALLOCATED) {
1017 tocache = true;
1018 goto out_allocpage;
1019 }
1020
1021 /* process the target tagged pointer */
1022 t = tagptr_init(compressed_page_t, page);
1023 justfound = tagptr_unfold_tags(t);
1024 page = tagptr_unfold_ptr(t);
1025
1026 mapping = READ_ONCE(page->mapping);
1027
1028 /*
1029	 * if managed cache is disabled, there is no way to
1030 * get such a cached-like page.
1031 */
1032 if (nocache) {
1033		/* if managed cache is disabled, `justfound' is impossible */
1034 DBG_BUGON(justfound);
1035
1036 /* and it should be locked, not uptodate, and not truncated */
1037 DBG_BUGON(!PageLocked(page));
1038 DBG_BUGON(PageUptodate(page));
1039 DBG_BUGON(!mapping);
1040 goto out;
1041 }
1042
1043 /*
1044 * unmanaged (file) pages are all locked solidly,
1045 * therefore it is impossible for `mapping' to be NULL.
1046 */
1047 if (mapping && mapping != mc)
1048 /* ought to be unmanaged pages */
1049 goto out;
1050
1051 lock_page(page);
1052
1053 /* only true if page reclaim goes wrong, should never happen */
1054 DBG_BUGON(justfound && PagePrivate(page));
1055
1056	/* the page is still in the managed cache */
1057 if (page->mapping == mc) {
1058 WRITE_ONCE(pcl->compressed_pages[nr], page);
1059
1060 ClearPageError(page);
1061 if (!PagePrivate(page)) {
1062 /*
1063 * impossible to be !PagePrivate(page) for
1064 * the current restriction as well if
1065 * the page is already in compressed_pages[].
1066 */
1067 DBG_BUGON(!justfound);
1068
1069 justfound = 0;
1070 set_page_private(page, (unsigned long)pcl);
1071 SetPagePrivate(page);
1072 }
1073
1074 /* no need to submit io if it is already up-to-date */
1075 if (PageUptodate(page)) {
1076 unlock_page(page);
1077 page = NULL;
1078 }
1079 goto out;
1080 }
1081
1082 /*
1083	 * the managed page has been truncated; it's unsafe to
1084	 * reuse it, so let's allocate a new cache-managed page.
1085 */
1086 DBG_BUGON(page->mapping);
1087 DBG_BUGON(!justfound);
1088
1089 tocache = true;
1090 unlock_page(page);
1091 put_page(page);
1092out_allocpage:
1093 page = __stagingpage_alloc(pagepool, gfp);
1094 if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) {
1095 list_add(&page->lru, pagepool);
1096 cpu_relax();
1097 goto repeat;
1098 }
1099 if (nocache || !tocache)
1100 goto out;
1101 if (add_to_page_cache_lru(page, mc, index + nr, gfp)) {
1102 page->mapping = Z_EROFS_MAPPING_STAGING;
1103 goto out;
1104 }
1105
1106 set_page_private(page, (unsigned long)pcl);
1107 SetPagePrivate(page);
1108out: /* the only exit (for tracing and debugging) */
1109 return page;
1110}
1111
1112static struct z_erofs_unzip_io *jobqueue_init(struct super_block *sb,
1113 struct z_erofs_unzip_io *io,
1114 bool foreground)
1115{
1116 struct z_erofs_unzip_io_sb *iosb;
1117
1118 if (foreground) {
1119 /* waitqueue available for foreground io */
1120 DBG_BUGON(!io);
1121
1122 init_waitqueue_head(&io->u.wait);
1123 atomic_set(&io->pending_bios, 0);
1124 goto out;
1125 }
1126
1127 iosb = kvzalloc(sizeof(*iosb), GFP_KERNEL | __GFP_NOFAIL);
1128 DBG_BUGON(!iosb);
1129
1130 /* initialize fields in the allocated descriptor */
1131 io = &iosb->io;
1132 iosb->sb = sb;
1133 INIT_WORK(&io->u.work, z_erofs_vle_unzip_wq);
1134out:
1135 io->head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
1136 return io;
1137}
1138
1139/* define decompression jobqueue types */
1140enum {
1141 JQ_BYPASS,
1142 JQ_SUBMIT,
1143 NR_JOBQUEUES,
1144};
1145
1146static void *jobqueueset_init(struct super_block *sb,
1147 z_erofs_next_pcluster_t qtail[],
1148 struct z_erofs_unzip_io *q[],
1149 struct z_erofs_unzip_io *fgq,
1150 bool forcefg)
1151{
1152 /*
1153	 * if managed cache is enabled, a bypass jobqueue is needed;
1154	 * there is no need to read from the device for pclusters in this queue.
1155 */
1156 q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, true);
1157 qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head;
1158
1159 q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, forcefg);
1160 qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head;
1161
1162 return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], !forcefg));
1163}
1164
1165static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
1166 z_erofs_next_pcluster_t qtail[],
1167 z_erofs_next_pcluster_t owned_head)
1168{
1169 z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT];
1170 z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS];
1171
1172 DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
1173 if (owned_head == Z_EROFS_PCLUSTER_TAIL)
1174 owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
1175
1176 WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED);
1177
1178 WRITE_ONCE(*submit_qtail, owned_head);
1179 WRITE_ONCE(*bypass_qtail, &pcl->next);
1180
1181 qtail[JQ_BYPASS] = &pcl->next;
1182}
1183
1184static bool postsubmit_is_all_bypassed(struct z_erofs_unzip_io *q[],
1185 unsigned int nr_bios,
1186 bool force_fg)
1187{
1188 /*
1189	 * although background is preferred, nothing is pending for submission;
1190	 * don't issue a workqueue for decompression, just drop it directly instead.
1191 */
1192 if (force_fg || nr_bios)
1193 return false;
1194
1195 kvfree(container_of(q[JQ_SUBMIT], struct z_erofs_unzip_io_sb, io));
1196 return true;
1197}
1198
1199static bool z_erofs_vle_submit_all(struct super_block *sb,
1200 z_erofs_next_pcluster_t owned_head,
1201 struct list_head *pagepool,
1202 struct z_erofs_unzip_io *fgq,
1203 bool force_fg)
1204{
1205 struct erofs_sb_info *const sbi __maybe_unused = EROFS_SB(sb);
1206 z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
1207 struct z_erofs_unzip_io *q[NR_JOBQUEUES];
1208 struct bio *bio;
1209 void *bi_private;
1210 /* since bio will be NULL, no need to initialize last_index */
1211 pgoff_t uninitialized_var(last_index);
1212 bool force_submit = false;
1213 unsigned int nr_bios;
1214
1215 if (unlikely(owned_head == Z_EROFS_PCLUSTER_TAIL))
1216 return false;
1217
1218 force_submit = false;
1219 bio = NULL;
1220 nr_bios = 0;
1221 bi_private = jobqueueset_init(sb, qtail, q, fgq, force_fg);
1222
1223 /* by default, all need io submission */
1224 q[JQ_SUBMIT]->head = owned_head;
1225
1226 do {
1227 struct z_erofs_pcluster *pcl;
1228 unsigned int clusterpages;
1229 pgoff_t first_index;
1230 struct page *page;
1231 unsigned int i = 0, bypass = 0;
1232 int err;
1233
1234		/* it's impossible that 'owned_head' equals the following */
1235 DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
1236 DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);
1237
1238 pcl = container_of(owned_head, struct z_erofs_pcluster, next);
1239
1240 clusterpages = BIT(pcl->clusterbits);
1241
1242 /* close the main owned chain at first */
1243 owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
1244 Z_EROFS_PCLUSTER_TAIL_CLOSED);
1245
1246 first_index = pcl->obj.index;
1247 force_submit |= (first_index != last_index + 1);
1248
1249repeat:
1250 page = pickup_page_for_submission(pcl, i, pagepool,
1251 MNGD_MAPPING(sbi),
1252 GFP_NOFS);
1253 if (!page) {
1254 force_submit = true;
1255 ++bypass;
1256 goto skippage;
1257 }
1258
1259 if (bio && force_submit) {
1260submit_bio_retry:
1261 __submit_bio(bio, REQ_OP_READ, 0);
1262 bio = NULL;
1263 }
1264
1265 if (!bio) {
1266 bio = erofs_grab_bio(sb, first_index + i,
1267 BIO_MAX_PAGES, bi_private,
1268 z_erofs_vle_read_endio, true);
1269 ++nr_bios;
1270 }
1271
1272 err = bio_add_page(bio, page, PAGE_SIZE, 0);
1273 if (err < PAGE_SIZE)
1274 goto submit_bio_retry;
1275
1276 force_submit = false;
1277 last_index = first_index + i;
1278skippage:
1279 if (++i < clusterpages)
1280 goto repeat;
1281
1282 if (bypass < clusterpages)
1283 qtail[JQ_SUBMIT] = &pcl->next;
1284 else
1285 move_to_bypass_jobqueue(pcl, qtail, owned_head);
1286 } while (owned_head != Z_EROFS_PCLUSTER_TAIL);
1287
1288 if (bio)
1289 __submit_bio(bio, REQ_OP_READ, 0);
1290
1291 if (postsubmit_is_all_bypassed(q, nr_bios, force_fg))
1292 return true;
1293
1294 z_erofs_vle_unzip_kickoff(bi_private, nr_bios);
1295 return true;
1296}
1297
1298static void z_erofs_submit_and_unzip(struct super_block *sb,
1299 struct z_erofs_collector *clt,
1300 struct list_head *pagepool,
1301 bool force_fg)
1302{
1303 struct z_erofs_unzip_io io[NR_JOBQUEUES];
1304
1305 if (!z_erofs_vle_submit_all(sb, clt->owned_head,
1306 pagepool, io, force_fg))
1307 return;
1308
1309	/* decompress the no-I/O (bypassed) pclusters immediately */
1310 z_erofs_vle_unzip_all(sb, &io[JQ_BYPASS], pagepool);
1311
1312 if (!force_fg)
1313 return;
1314
1315 /* wait until all bios are completed */
1316 wait_event(io[JQ_SUBMIT].u.wait,
1317 !atomic_read(&io[JQ_SUBMIT].pending_bios));
1318
1319	/* let's do synchronous decompression */
1320 z_erofs_vle_unzip_all(sb, &io[JQ_SUBMIT], pagepool);
1321}
1322
1323static int z_erofs_vle_normalaccess_readpage(struct file *file,
1324 struct page *page)
1325{
1326 struct inode *const inode = page->mapping->host;
1327 struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
1328 int err;
1329 LIST_HEAD(pagepool);
1330
1331 trace_erofs_readpage(page, false);
1332
1333 f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;
1334
1335 err = z_erofs_do_read_page(&f, page, &pagepool);
1336 (void)z_erofs_collector_end(&f.clt);
1337
1338	/* if some compressed clusters are ready, submit them anyway */
1339 z_erofs_submit_and_unzip(inode->i_sb, &f.clt, &pagepool, true);
1340
1341 if (err)
1342 errln("%s, failed to read, err [%d]", __func__, err);
1343
1344 if (f.map.mpage)
1345 put_page(f.map.mpage);
1346
1347 /* clean up the remaining free pages */
1348 put_pages_list(&pagepool);
1349 return err;
1350}
1351
1352static bool should_decompress_synchronously(struct erofs_sb_info *sbi,
1353 unsigned int nr)
1354{
1355 return nr <= sbi->max_sync_decompress_pages;
1356}
1357
1358static int z_erofs_vle_normalaccess_readpages(struct file *filp,
1359 struct address_space *mapping,
1360 struct list_head *pages,
1361 unsigned int nr_pages)
1362{
1363 struct inode *const inode = mapping->host;
1364 struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
1365
1366 bool sync = should_decompress_synchronously(sbi, nr_pages);
1367 struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
1368 gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
1369 struct page *head = NULL;
1370 LIST_HEAD(pagepool);
1371
1372 trace_erofs_readpages(mapping->host, lru_to_page(pages),
1373 nr_pages, false);
1374
1375 f.headoffset = (erofs_off_t)lru_to_page(pages)->index << PAGE_SHIFT;
1376
1377 for (; nr_pages; --nr_pages) {
1378 struct page *page = lru_to_page(pages);
1379
1380 prefetchw(&page->flags);
1381 list_del(&page->lru);
1382
1383 /*
1384 * A pure asynchronous readahead is indicated if
1385		 * a PG_readahead-marked page is hit first.
1386 * Let's also do asynchronous decompression for this case.
1387 */
1388 sync &= !(PageReadahead(page) && !head);
1389
1390 if (add_to_page_cache_lru(page, mapping, page->index, gfp)) {
1391 list_add(&page->lru, &pagepool);
1392 continue;
1393 }
1394
1395 set_page_private(page, (unsigned long)head);
1396 head = page;
1397 }
1398
1399 while (head) {
1400 struct page *page = head;
1401 int err;
1402
1403 /* traversal in reverse order */
1404 head = (void *)page_private(page);
1405
1406 err = z_erofs_do_read_page(&f, page, &pagepool);
1407 if (err) {
1408 struct erofs_vnode *vi = EROFS_V(inode);
1409
1410 errln("%s, readahead error at page %lu of nid %llu",
1411 __func__, page->index, vi->nid);
1412 }
1413 put_page(page);
1414 }
1415
1416 (void)z_erofs_collector_end(&f.clt);
1417
1418 z_erofs_submit_and_unzip(inode->i_sb, &f.clt, &pagepool, sync);
1419
1420 if (f.map.mpage)
1421 put_page(f.map.mpage);
1422
1423 /* clean up the remaining free pages */
1424 put_pages_list(&pagepool);
1425 return 0;
1426}
1427
1428const struct address_space_operations z_erofs_vle_normalaccess_aops = {
1429 .readpage = z_erofs_vle_normalaccess_readpage,
1430 .readpages = z_erofs_vle_normalaccess_readpages,
1431};
1432
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
new file mode 100644
index 000000000000..4fc547bc01f9
--- /dev/null
+++ b/fs/erofs/zdata.h
@@ -0,0 +1,193 @@
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * Copyright (C) 2018 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#ifndef __EROFS_FS_ZDATA_H
8#define __EROFS_FS_ZDATA_H
9
10#include "internal.h"
11#include "zpvec.h"
12
13#define Z_EROFS_NR_INLINE_PAGEVECS 3
14
15/*
16 * Structure fields follow one of the following exclusion rules.
17 *
18 * I: Modifiable by initialization/destruction paths and read-only
19 * for everyone else;
20 *
21 * L: Field should be protected by pageset lock;
22 *
23 * A: Field should be accessed / updated in atomic for parallelized code.
24 */
25struct z_erofs_collection {
26 struct mutex lock;
27
28 /* I: page offset of start position of decompression */
29 unsigned short pageofs;
30
31 /* L: maximum relative page index in pagevec[] */
32 unsigned short nr_pages;
33
34 /* L: total number of pages in pagevec[] */
35 unsigned int vcnt;
36
37 union {
38 /* L: inline a certain number of pagevecs for bootstrap */
39 erofs_vtptr_t pagevec[Z_EROFS_NR_INLINE_PAGEVECS];
40
41 /* I: can be used to free the pcluster by RCU. */
42 struct rcu_head rcu;
43 };
44};
45
46#define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001
47#define Z_EROFS_PCLUSTER_LENGTH_BIT 1
48
49/*
50 * let's leave a type here in case of introducing
51 * another tagged pointer later.
52 */
53typedef void *z_erofs_next_pcluster_t;
54
55struct z_erofs_pcluster {
56 struct erofs_workgroup obj;
57 struct z_erofs_collection primary_collection;
58
59 /* A: point to next chained pcluster or TAILs */
60 z_erofs_next_pcluster_t next;
61
62 /* A: compressed pages (including multi-usage pages) */
63 struct page *compressed_pages[Z_EROFS_CLUSTER_MAX_PAGES];
64
65 /* A: lower limit of decompressed length and if full length or not */
66 unsigned int length;
67
68 /* I: compression algorithm format */
69 unsigned char algorithmformat;
70 /* I: bit shift of physical cluster size */
71 unsigned char clusterbits;
72};
73
74#define z_erofs_primarycollection(pcluster) (&(pcluster)->primary_collection)
75
76/* let's avoid the valid 32-bit kernel addresses */
77
78/* the chained workgroup hasn't submitted io (still open) */
79#define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE)
80/* the chained workgroup has already submitted io */
81#define Z_EROFS_PCLUSTER_TAIL_CLOSED ((void *)0x5F0EDEAD)
82
83#define Z_EROFS_PCLUSTER_NIL (NULL)
84
85#define Z_EROFS_WORKGROUP_SIZE sizeof(struct z_erofs_pcluster)
86
87struct z_erofs_unzip_io {
88 atomic_t pending_bios;
89 z_erofs_next_pcluster_t head;
90
91 union {
92 wait_queue_head_t wait;
93 struct work_struct work;
94 } u;
95};
96
97struct z_erofs_unzip_io_sb {
98 struct z_erofs_unzip_io io;
99 struct super_block *sb;
100};
101
102#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
103static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
104 struct page *page)
105{
106 return page->mapping == MNGD_MAPPING(sbi);
107}
108
109#define Z_EROFS_ONLINEPAGE_COUNT_BITS 2
110#define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
111#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS)
112
113/*
114 * waiters (aka. ongoing_packs): # of refs left before the page unlocks
115 * sub-index: 0 for a partial page, >= 1 for a full-page sub-index
116 */
117typedef atomic_t z_erofs_onlinepage_t;
118
119/* type punning */
120union z_erofs_onlinepage_converter {
121 z_erofs_onlinepage_t *o;
122 unsigned long *v;
123};
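/*
 * Editor's note: an illustrative value for the scheme above -- with the
 * 2-bit count defined above, page_private == (5 << 2) | 1 encodes sub-index
 * 5 with one remaining waiter, so a single further
 * z_erofs_onlinepage_endio() call unlocks the page.
 */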
124
125static inline unsigned int z_erofs_onlinepage_index(struct page *page)
126{
127 union z_erofs_onlinepage_converter u;
128
129 DBG_BUGON(!PagePrivate(page));
130 u.v = &page_private(page);
131
132 return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
133}
134
135static inline void z_erofs_onlinepage_init(struct page *page)
136{
137 union {
138 z_erofs_onlinepage_t o;
139 unsigned long v;
140 /* keep from being unlocked in advance */
141 } u = { .o = ATOMIC_INIT(1) };
142
143 set_page_private(page, u.v);
144 smp_wmb();
145 SetPagePrivate(page);
146}
147
148static inline void z_erofs_onlinepage_fixup(struct page *page,
149 uintptr_t index, bool down)
150{
151 unsigned long *p, o, v, id;
152repeat:
153 p = &page_private(page);
154 o = READ_ONCE(*p);
155
156 id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
157 if (id) {
158 if (!index)
159 return;
160
161 DBG_BUGON(id != index);
162 }
163
164 v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
165 ((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down);
166 if (cmpxchg(p, o, v) != o)
167 goto repeat;
168}
169
170static inline void z_erofs_onlinepage_endio(struct page *page)
171{
172 union z_erofs_onlinepage_converter u;
173 unsigned int v;
174
175 DBG_BUGON(!PagePrivate(page));
176 u.v = &page_private(page);
177
178 v = atomic_dec_return(u.o);
179 if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) {
180 ClearPagePrivate(page);
181 if (!PageError(page))
182 SetPageUptodate(page);
183 unlock_page(page);
184 }
185 debugln("%s, page %p value %x", __func__, page, atomic_read(u.o));
186}
187
188#define Z_EROFS_VMAP_ONSTACK_PAGES \
189 min_t(unsigned int, THREAD_SIZE / 8 / sizeof(struct page *), 96U)
190#define Z_EROFS_VMAP_GLOBAL_PAGES 2048
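/*
 * Editor's note: a worked example for the on-stack bound above -- assuming a
 * 16KiB kernel stack and 8-byte page pointers, THREAD_SIZE / 8 / 8 == 256,
 * so the min_t() caps the on-stack array at 96 pointers (768 bytes).
 */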
191
192#endif
193
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
new file mode 100644
index 000000000000..4dc9cec01297
--- /dev/null
+++ b/fs/erofs/zmap.c
@@ -0,0 +1,466 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2018-2019 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#include "internal.h"
8#include <asm/unaligned.h>
9#include <trace/events/erofs.h>
10
11int z_erofs_fill_inode(struct inode *inode)
12{
13 struct erofs_vnode *const vi = EROFS_V(inode);
14
15 if (vi->datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY) {
16 vi->z_advise = 0;
17 vi->z_algorithmtype[0] = 0;
18 vi->z_algorithmtype[1] = 0;
19 vi->z_logical_clusterbits = LOG_BLOCK_SIZE;
20 vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits;
21 vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits;
22 set_bit(EROFS_V_Z_INITED_BIT, &vi->flags);
23 }
24
25 inode->i_mapping->a_ops = &z_erofs_vle_normalaccess_aops;
26 return 0;
27}
28
29static int fill_inode_lazy(struct inode *inode)
30{
31 struct erofs_vnode *const vi = EROFS_V(inode);
32 struct super_block *const sb = inode->i_sb;
33 int err;
34 erofs_off_t pos;
35 struct page *page;
36 void *kaddr;
37 struct z_erofs_map_header *h;
38
39 if (test_bit(EROFS_V_Z_INITED_BIT, &vi->flags))
40 return 0;
41
42 if (wait_on_bit_lock(&vi->flags, EROFS_V_BL_Z_BIT, TASK_KILLABLE))
43 return -ERESTARTSYS;
44
45 err = 0;
46 if (test_bit(EROFS_V_Z_INITED_BIT, &vi->flags))
47 goto out_unlock;
48
49 DBG_BUGON(vi->datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY);
50
51 pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize +
52 vi->xattr_isize, 8);
53 page = erofs_get_meta_page(sb, erofs_blknr(pos), false);
54 if (IS_ERR(page)) {
55 err = PTR_ERR(page);
56 goto out_unlock;
57 }
58
59 kaddr = kmap_atomic(page);
60
61 h = kaddr + erofs_blkoff(pos);
62 vi->z_advise = le16_to_cpu(h->h_advise);
63 vi->z_algorithmtype[0] = h->h_algorithmtype & 15;
64 vi->z_algorithmtype[1] = h->h_algorithmtype >> 4;
65
66 if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX) {
67 errln("unknown compression format %u for nid %llu, please upgrade kernel",
68 vi->z_algorithmtype[0], vi->nid);
69 err = -EOPNOTSUPP;
70 goto unmap_done;
71 }
72
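 /*
  * h_clusterbits packs three fields: bits 0-2 give the logical cluster
  * size as a shift above the block size, while bits 3-4 and bits 5-7
  * give the two physical cluster sizes as shifts above the logical
  * cluster size.
  */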
73 vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7);
74 vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits +
75 ((h->h_clusterbits >> 3) & 3);
76
77 if (vi->z_physical_clusterbits[0] != LOG_BLOCK_SIZE) {
78 errln("unsupported physical clusterbits %u for nid %llu, please upgrade kernel",
79 vi->z_physical_clusterbits[0], vi->nid);
80 err = -EOPNOTSUPP;
81 goto unmap_done;
82 }
83
84 vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits +
85 ((h->h_clusterbits >> 5) & 7);
86 set_bit(EROFS_V_Z_INITED_BIT, &vi->flags);
87unmap_done:
88 kunmap_atomic(kaddr);
89 unlock_page(page);
90 put_page(page);
91out_unlock:
92 clear_and_wake_up_bit(EROFS_V_BL_Z_BIT, &vi->flags);
93 return err;
94}
95
96struct z_erofs_maprecorder {
97 struct inode *inode;
98 struct erofs_map_blocks *map;
99 void *kaddr;
100
101 unsigned long lcn;
102 /* compression extent information gathered for the lcluster just loaded */
103 u8 type;
104 u16 clusterofs;
105 u16 delta[2];
106 erofs_blk_t pblk;
107};
108
109static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m,
110 erofs_blk_t eblk)
111{
112 struct super_block *const sb = m->inode->i_sb;
113 struct erofs_map_blocks *const map = m->map;
114 struct page *mpage = map->mpage;
115
116 if (mpage) {
117 if (mpage->index == eblk) {
118 if (!m->kaddr)
119 m->kaddr = kmap_atomic(mpage);
120 return 0;
121 }
122
123 if (m->kaddr) {
124 kunmap_atomic(m->kaddr);
125 m->kaddr = NULL;
126 }
127 put_page(mpage);
128 }
129
130 mpage = erofs_get_meta_page(sb, eblk, false);
131 if (IS_ERR(mpage)) {
132 map->mpage = NULL;
133 return PTR_ERR(mpage);
134 }
135 m->kaddr = kmap_atomic(mpage);
136 unlock_page(mpage);
137 map->mpage = mpage;
138 return 0;
139}
140
141static int vle_legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
142 unsigned long lcn)
143{
144 struct inode *const inode = m->inode;
145 struct erofs_vnode *const vi = EROFS_V(inode);
146 const erofs_off_t ibase = iloc(EROFS_I_SB(inode), vi->nid);
147 const erofs_off_t pos =
148 Z_EROFS_VLE_LEGACY_INDEX_ALIGN(ibase + vi->inode_isize +
149 vi->xattr_isize) +
150 lcn * sizeof(struct z_erofs_vle_decompressed_index);
151 struct z_erofs_vle_decompressed_index *di;
152 unsigned int advise, type;
153 int err;
154
155 err = z_erofs_reload_indexes(m, erofs_blknr(pos));
156 if (err)
157 return err;
158
159 m->lcn = lcn;
160 di = m->kaddr + erofs_blkoff(pos);
161
162 advise = le16_to_cpu(di->di_advise);
163 type = (advise >> Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT) &
164 ((1 << Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) - 1);
165 switch (type) {
166 case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
167 m->clusterofs = 1 << vi->z_logical_clusterbits;
168 m->delta[0] = le16_to_cpu(di->di_u.delta[0]);
169 m->delta[1] = le16_to_cpu(di->di_u.delta[1]);
170 break;
171 case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
172 case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
173 m->clusterofs = le16_to_cpu(di->di_clusterofs);
174 m->pblk = le32_to_cpu(di->di_u.blkaddr);
175 break;
176 default:
177 DBG_BUGON(1);
178 return -EOPNOTSUPP;
179 }
180 m->type = type;
181 return 0;
182}
183
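/*
 * Each compacted index entry is a little-endian bitfield: "lobits" low
 * bits (clusterofs for HEAD/PLAIN lclusters, delta for NONHEAD ones)
 * followed by a 2-bit cluster type.  "pos" is the bit offset of the
 * entry within the pack, decoded with one unaligned 32-bit load.
 */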
184static unsigned int decode_compactedbits(unsigned int lobits,
185 unsigned int lomask,
186 u8 *in, unsigned int pos, u8 *type)
187{
188 const unsigned int v = get_unaligned_le32(in + pos / 8) >> (pos & 7);
189 const unsigned int lo = v & lomask;
190
191 *type = (v >> lobits) & 3;
192 return lo;
193}
194
195static int unpack_compacted_index(struct z_erofs_maprecorder *m,
196 unsigned int amortizedshift,
197 unsigned int eofs)
198{
199 struct erofs_vnode *const vi = EROFS_V(m->inode);
200 const unsigned int lclusterbits = vi->z_logical_clusterbits;
201 const unsigned int lomask = (1 << lclusterbits) - 1;
202 unsigned int vcnt, base, lo, encodebits, nblk;
203 int i;
204 u8 *in, type;
205
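 /*
  * Each pack shares one __le32 base blkaddr among vcnt entries:
  * 4-byte amortization packs 2 entries into 8 bytes (16 bits per
  * entry), while 2-byte amortization packs 16 entries into 32 bytes
  * (14 bits per entry, lclusterbits == 12 only).
  */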
206 if (1 << amortizedshift == 4)
207 vcnt = 2;
208 else if (1 << amortizedshift == 2 && lclusterbits == 12)
209 vcnt = 16;
210 else
211 return -EOPNOTSUPP;
212
213 encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt;
214 base = round_down(eofs, vcnt << amortizedshift);
215 in = m->kaddr + base;
216
217 i = (eofs - base) >> amortizedshift;
218
219 lo = decode_compactedbits(lclusterbits, lomask,
220 in, encodebits * i, &type);
221 m->type = type;
222 if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) {
223 m->clusterofs = 1 << lclusterbits;
224 if (i + 1 != vcnt) {
225 m->delta[0] = lo;
226 return 0;
227 }
228 /*
229 * the last lcluster in the pack is special: its lo field stores
230 * delta[1] rather than delta[0], so derive delta[0] indirectly
231 * from the previous lcluster instead.
232 */
233 lo = decode_compactedbits(lclusterbits, lomask,
234 in, encodebits * (i - 1), &type);
235 if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
236 lo = 0;
237 m->delta[0] = lo + 1;
238 return 0;
239 }
240 m->clusterofs = lo;
241 m->delta[0] = 0;
242 /* figure out blkaddr (pblk) for HEAD lclusters */
243 nblk = 1;
244 while (i > 0) {
245 --i;
246 lo = decode_compactedbits(lclusterbits, lomask,
247 in, encodebits * i, &type);
248 if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
249 i -= lo;
250
251 if (i >= 0)
252 ++nblk;
253 }
254 in += (vcnt << amortizedshift) - sizeof(__le32);
255 m->pblk = le32_to_cpu(*(__le32 *)in) + nblk;
256 return 0;
257}
258
259static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m,
260 unsigned long lcn)
261{
262 struct inode *const inode = m->inode;
263 struct erofs_vnode *const vi = EROFS_V(inode);
264 const unsigned int lclusterbits = vi->z_logical_clusterbits;
265 const erofs_off_t ebase = ALIGN(iloc(EROFS_I_SB(inode), vi->nid) +
266 vi->inode_isize + vi->xattr_isize, 8) +
267 sizeof(struct z_erofs_map_header);
268 const unsigned int totalidx = DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ);
269 unsigned int compacted_4b_initial, compacted_2b;
270 unsigned int amortizedshift;
271 erofs_off_t pos;
272 int err;
273
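 /*
  * On-disk compacted indexes are laid out as: an optional run of
  * 4-byte-amortized lclusters that brings the stream to 32-byte
  * alignment (e.g. ebase % 32 == 8 gives compacted_4b_initial == 6),
  * then, if the COMPACTED_2B advise bit is set, 2-byte-amortized
  * lclusters in multiples of 16, then 4-byte amortization again up to
  * the end; the code below walks these regions to turn lcn into a
  * byte position.
  */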
274 if (lclusterbits != 12)
275 return -EOPNOTSUPP;
276
277 if (lcn >= totalidx)
278 return -EINVAL;
279
280 m->lcn = lcn;
281 /* number of 4-byte-amortized lclusters needed to reach 32-byte (compacted_2b) alignment */
282 compacted_4b_initial = (32 - ebase % 32) / 4;
283 if (compacted_4b_initial == 32 / 4)
284 compacted_4b_initial = 0;
285
286 if (vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B)
287 compacted_2b = rounddown(totalidx - compacted_4b_initial, 16);
288 else
289 compacted_2b = 0;
290
291 pos = ebase;
292 if (lcn < compacted_4b_initial) {
293 amortizedshift = 2;
294 goto out;
295 }
296 pos += compacted_4b_initial * 4;
297 lcn -= compacted_4b_initial;
298
299 if (lcn < compacted_2b) {
300 amortizedshift = 1;
301 goto out;
302 }
303 pos += compacted_2b * 2;
304 lcn -= compacted_2b;
305 amortizedshift = 2;
306out:
307 pos += lcn * (1 << amortizedshift);
308 err = z_erofs_reload_indexes(m, erofs_blknr(pos));
309 if (err)
310 return err;
311 return unpack_compacted_index(m, amortizedshift, erofs_blkoff(pos));
312}
313
314static int vle_load_cluster_from_disk(struct z_erofs_maprecorder *m,
315 unsigned int lcn)
316{
317 const unsigned int datamode = EROFS_V(m->inode)->datamode;
318
319 if (datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY)
320 return vle_legacy_load_cluster_from_disk(m, lcn);
321
322 if (datamode == EROFS_INODE_FLAT_COMPRESSION)
323 return compacted_load_cluster_from_disk(m, lcn);
324
325 return -EINVAL;
326}
327
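/*
 * A NONHEAD lcluster records in delta[0] how many lclusters back its
 * HEAD (or PLAIN) lcluster lies; walk back by that distance, recursing
 * while the landing lcluster is itself NONHEAD, until map->m_la can be
 * filled in from the extent head.
 */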
328static int vle_extent_lookback(struct z_erofs_maprecorder *m,
329 unsigned int lookback_distance)
330{
331 struct erofs_vnode *const vi = EROFS_V(m->inode);
332 struct erofs_map_blocks *const map = m->map;
333 const unsigned int lclusterbits = vi->z_logical_clusterbits;
334 unsigned long lcn = m->lcn;
335 int err;
336
337 if (lcn < lookback_distance) {
338 errln("bogus lookback distance @ nid %llu", vi->nid);
339 DBG_BUGON(1);
340 return -EFSCORRUPTED;
341 }
342
343 /* load extent head logical cluster if needed */
344 lcn -= lookback_distance;
345 err = vle_load_cluster_from_disk(m, lcn);
346 if (err)
347 return err;
348
349 switch (m->type) {
350 case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
351 if (unlikely(!m->delta[0])) {
352 errln("invalid lookback distance 0 at nid %llu",
353 vi->nid);
354 DBG_BUGON(1);
355 return -EFSCORRUPTED;
356 }
357 return vle_extent_lookback(m, m->delta[0]);
358 case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
359 map->m_flags &= ~EROFS_MAP_ZIPPED;
360 /* fallthrough */
361 case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
362 map->m_la = (lcn << lclusterbits) | m->clusterofs;
363 break;
364 default:
365 errln("unknown type %u at lcn %lu of nid %llu",
366 m->type, lcn, vi->nid);
367 DBG_BUGON(1);
368 return -EOPNOTSUPP;
369 }
370 return 0;
371}
372
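/*
 * Map the logical extent containing map->m_la: m_la/m_llen describe the
 * decompressed (logical) range while m_pa/m_plen describe the on-disk
 * pcluster; EROFS_MAP_ZIPPED is cleared when the cluster is stored
 * uncompressed (PLAIN).
 */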
373int z_erofs_map_blocks_iter(struct inode *inode,
374 struct erofs_map_blocks *map,
375 int flags)
376{
377 struct erofs_vnode *const vi = EROFS_V(inode);
378 struct z_erofs_maprecorder m = {
379 .inode = inode,
380 .map = map,
381 };
382 int err = 0;
383 unsigned int lclusterbits, endoff;
384 unsigned long long ofs, end;
385
386 trace_z_erofs_map_blocks_iter_enter(inode, map, flags);
387
388 /* when trying to read beyond EOF, leave it unmapped */
389 if (unlikely(map->m_la >= inode->i_size)) {
390 map->m_llen = map->m_la + 1 - inode->i_size;
391 map->m_la = inode->i_size;
392 map->m_flags = 0;
393 goto out;
394 }
395
396 err = fill_inode_lazy(inode);
397 if (err)
398 goto out;
399
400 lclusterbits = vi->z_logical_clusterbits;
401 ofs = map->m_la;
402 m.lcn = ofs >> lclusterbits;
403 endoff = ofs & ((1 << lclusterbits) - 1);
404
405 err = vle_load_cluster_from_disk(&m, m.lcn);
406 if (err)
407 goto unmap_out;
408
409 map->m_flags = EROFS_MAP_ZIPPED; /* by default, compressed */
410 end = (m.lcn + 1ULL) << lclusterbits;
411
412 switch (m.type) {
413 case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
414 if (endoff >= m.clusterofs)
415 map->m_flags &= ~EROFS_MAP_ZIPPED;
416 /* fallthrough */
417 case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
418 if (endoff >= m.clusterofs) {
419 map->m_la = (m.lcn << lclusterbits) | m.clusterofs;
420 break;
421 }
422 /* m.lcn should be >= 1 if endoff < m.clusterofs */
423 if (unlikely(!m.lcn)) {
424 errln("invalid logical cluster 0 at nid %llu",
425 vi->nid);
426 err = -EFSCORRUPTED;
427 goto unmap_out;
428 }
429 end = (m.lcn << lclusterbits) | m.clusterofs;
430 map->m_flags |= EROFS_MAP_FULL_MAPPED;
431 m.delta[0] = 1;
432 /* fallthrough */
433 case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
434 /* get the corresponding first chunk */
435 err = vle_extent_lookback(&m, m.delta[0]);
436 if (unlikely(err))
437 goto unmap_out;
438 break;
439 default:
440 errln("unknown type %u at offset %llu of nid %llu",
441 m.type, ofs, vi->nid);
442 err = -EOPNOTSUPP;
443 goto unmap_out;
444 }
445
446 map->m_llen = end - map->m_la;
447 map->m_plen = 1 << lclusterbits;
448 map->m_pa = blknr_to_addr(m.pblk);
449 map->m_flags |= EROFS_MAP_MAPPED;
450
451unmap_out:
452 if (m.kaddr)
453 kunmap_atomic(m.kaddr);
454
455out:
456 debugln("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o",
457 __func__, map->m_la, map->m_pa,
458 map->m_llen, map->m_plen, map->m_flags);
459
460 trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err);
461
462 /* aggressively BUG_ON iff CONFIG_EROFS_FS_DEBUG is on */
463 DBG_BUGON(err < 0 && err != -ENOMEM);
464 return err;
465}
466
diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h
new file mode 100644
index 000000000000..bd3cee16491c
--- /dev/null
+++ b/fs/erofs/zpvec.h
@@ -0,0 +1,157 @@
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * Copyright (C) 2018 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#ifndef __EROFS_FS_ZPVEC_H
8#define __EROFS_FS_ZPVEC_H
9
10#include "tagptr.h"
11
12/* page type in pagevec for decompress subsystem */
13enum z_erofs_page_type {
14 /* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */
15 Z_EROFS_PAGE_TYPE_EXCLUSIVE,
16
17 Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED,
18
19 Z_EROFS_VLE_PAGE_TYPE_HEAD,
20 Z_EROFS_VLE_PAGE_TYPE_MAX
21};
22
23extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0")
24 __bad_page_type_exclusive(void);
25
26/* pagevec tagged pointer */
27typedef tagptr2_t erofs_vtptr_t;
28
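/*
 * Each pagevec slot is a tagged pointer: a page pointer plus a 2-bit
 * z_erofs_page_type.  A slot tagged EXCLUSIVE (0) may later be reused
 * as the backing page of the next pagevec, so the vector can grow
 * without extra allocations.
 */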
29/* pagevec collector */
30struct z_erofs_pagevec_ctor {
31 struct page *curr, *next;
32 erofs_vtptr_t *pages;
33
34 unsigned int nr, index;
35};
36
37static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor,
38 bool atomic)
39{
40 if (!ctor->curr)
41 return;
42
43 if (atomic)
44 kunmap_atomic(ctor->pages);
45 else
46 kunmap(ctor->curr);
47}
48
49static inline struct page *
50z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor,
51 unsigned int nr)
52{
53 unsigned int index;
54
55 /* reuse the reserved next page if any; otherwise scan for an unoccupied (EXCLUSIVE) page */
56 if (ctor->next)
57 return ctor->next;
58
59 for (index = 0; index < nr; ++index) {
60 const erofs_vtptr_t t = ctor->pages[index];
61 const unsigned int tags = tagptr_unfold_tags(t);
62
63 if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE)
64 return tagptr_unfold_ptr(t);
65 }
66 DBG_BUGON(nr >= ctor->nr);
67 return NULL;
68}
69
70static inline void
71z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor,
72 bool atomic)
73{
74 struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr);
75
76 z_erofs_pagevec_ctor_exit(ctor, atomic);
77
78 ctor->curr = next;
79 ctor->next = NULL;
80 ctor->pages = atomic ?
81 kmap_atomic(ctor->curr) : kmap(ctor->curr);
82
83 ctor->nr = PAGE_SIZE / sizeof(struct page *);
84 ctor->index = 0;
85}
86
87static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor,
88 unsigned int nr,
89 erofs_vtptr_t *pages,
90 unsigned int i)
91{
92 ctor->nr = nr;
93 ctor->curr = ctor->next = NULL;
94 ctor->pages = pages;
95
96 if (i >= nr) {
97 i -= nr;
98 z_erofs_pagevec_ctor_pagedown(ctor, false);
99 while (i > ctor->nr) {
100 i -= ctor->nr;
101 z_erofs_pagevec_ctor_pagedown(ctor, false);
102 }
103 }
104 ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i);
105 ctor->index = i;
106}
107
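/*
 * Queue one page together with its type tag.  If no continuation page
 * has been reserved yet and this page cannot serve as one (its type is
 * not EXCLUSIVE), refuse to fill the final slot so the caller can flush
 * first; *occupied reports whether the page now backs the next pagevec.
 */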
108static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor,
109 struct page *page,
110 enum z_erofs_page_type type,
111 bool *occupied)
112{
113 *occupied = false;
114 if (unlikely(!ctor->next && type))
115 if (ctor->index + 1 == ctor->nr)
116 return false;
117
118 if (unlikely(ctor->index >= ctor->nr))
119 z_erofs_pagevec_ctor_pagedown(ctor, false);
120
121 /* exclusive page type must be 0 */
122 if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL)
123 __bad_page_type_exclusive();
124
125 /* note that ctor->next, a page pointer, can never equal 1 or 2 */
126 if (type == (uintptr_t)ctor->next) {
127 ctor->next = page;
128 *occupied = true;
129 }
130 ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, page, type);
131 return true;
132}
133
134static inline struct page *
135z_erofs_pagevec_dequeue(struct z_erofs_pagevec_ctor *ctor,
136 enum z_erofs_page_type *type)
137{
138 erofs_vtptr_t t;
139
140 if (unlikely(ctor->index >= ctor->nr)) {
141 DBG_BUGON(!ctor->next);
142 z_erofs_pagevec_ctor_pagedown(ctor, true);
143 }
144
145 t = ctor->pages[ctor->index];
146
147 *type = tagptr_unfold_tags(t);
148
149 /* note that ctor->next, a page pointer, can never equal 1 or 2 */
150 if (*type == (uintptr_t)ctor->next)
151 ctor->next = tagptr_unfold_ptr(t);
152
153 ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, NULL, 0);
154 return tagptr_unfold_ptr(t);
155}
156#endif
157