Diffstat (limited to 'fs/erofs')
-rw-r--r-- | fs/erofs/Kconfig | 98 | ||||
-rw-r--r-- | fs/erofs/Makefile | 11 | ||||
-rw-r--r-- | fs/erofs/compress.h | 60 | ||||
-rw-r--r-- | fs/erofs/data.c | 423 | ||||
-rw-r--r-- | fs/erofs/decompressor.c | 358 | ||||
-rw-r--r-- | fs/erofs/dir.c | 139 | ||||
-rw-r--r-- | fs/erofs/erofs_fs.h | 307 | ||||
-rw-r--r-- | fs/erofs/inode.c | 332 | ||||
-rw-r--r-- | fs/erofs/internal.h | 553 | ||||
-rw-r--r-- | fs/erofs/namei.c | 251 | ||||
-rw-r--r-- | fs/erofs/super.c | 669 | ||||
-rw-r--r-- | fs/erofs/tagptr.h | 110 | ||||
-rw-r--r-- | fs/erofs/utils.c | 333 | ||||
-rw-r--r-- | fs/erofs/xattr.c | 703 | ||||
-rw-r--r-- | fs/erofs/xattr.h | 92 | ||||
-rw-r--r-- | fs/erofs/zdata.c | 1432 | ||||
-rw-r--r-- | fs/erofs/zdata.h | 193 | ||||
-rw-r--r-- | fs/erofs/zmap.c | 466 | ||||
-rw-r--r-- | fs/erofs/zpvec.h | 157 |
19 files changed, 6687 insertions, 0 deletions
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig new file mode 100644 index 000000000000..16316d1adca3 --- /dev/null +++ b/fs/erofs/Kconfig | |||
@@ -0,0 +1,98 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | |||
3 | config EROFS_FS | ||
4 | tristate "EROFS filesystem support" | ||
5 | depends on BLOCK | ||
6 | help | ||
7 | EROFS (Enhanced Read-Only File System) is a lightweight | ||
8 | read-only file system with modern designs (e.g. page-sized | ||
9 | blocks, inline xattrs/data, etc.) for scenarios with | ||
10 | high-performance read-only requirements, e.g. Android OS | ||
11 | for mobile phones and live CDs. | ||
12 | |||
13 | It also provides fixed-sized output compression support, | ||
14 | which improves storage density while keeping relatively high | ||
15 | compression ratios, and is therefore useful for achieving high | ||
16 | performance on embedded devices with limited memory. | ||
17 | |||
18 | If unsure, say N. | ||
19 | |||
20 | config EROFS_FS_DEBUG | ||
21 | bool "EROFS debugging feature" | ||
22 | depends on EROFS_FS | ||
23 | help | ||
24 | Print debugging messages and enable more BUG_ONs which check | ||
25 | filesystem consistency and find potential issues aggressively; | ||
26 | this can be used for Android eng builds, for example. | ||
27 | |||
28 | For daily use, say N. | ||
29 | |||
30 | config EROFS_FAULT_INJECTION | ||
31 | bool "EROFS fault injection facility" | ||
32 | depends on EROFS_FS | ||
33 | help | ||
34 | Enable fault injection to test EROFS by injecting faults such as ENOMEM, EIO, and so on. | ||
35 | If unsure, say N. | ||
36 | |||
37 | config EROFS_FS_XATTR | ||
38 | bool "EROFS extended attributes" | ||
39 | depends on EROFS_FS | ||
40 | default y | ||
41 | help | ||
42 | Extended attributes are name:value pairs associated with inodes by | ||
43 | the kernel or by users (see the attr(5) manual page, or visit | ||
44 | <http://acl.bestbits.at/> for details). | ||
45 | |||
46 | If unsure, say N. | ||
47 | |||
48 | config EROFS_FS_POSIX_ACL | ||
49 | bool "EROFS Access Control Lists" | ||
50 | depends on EROFS_FS_XATTR | ||
51 | select FS_POSIX_ACL | ||
52 | default y | ||
53 | help | ||
54 | Posix Access Control Lists (ACLs) support permissions for users and | ||
55 | groups beyond the owner/group/world scheme. | ||
56 | |||
57 | To learn more about Access Control Lists, visit the POSIX ACLs for | ||
58 | Linux website <http://acl.bestbits.at/>. | ||
59 | |||
60 | If you don't know what Access Control Lists are, say N. | ||
61 | |||
62 | config EROFS_FS_SECURITY | ||
63 | bool "EROFS Security Labels" | ||
64 | depends on EROFS_FS_XATTR | ||
65 | default y | ||
66 | help | ||
67 | Security labels provide an access control facility to support Linux | ||
68 | Security Modules (LSMs) such as AppArmor, SELinux, Smack and TOMOYO | ||
69 | Linux. This option enables an extended attribute handler for file | ||
70 | security labels in the erofs filesystem, so it requires extended | ||
71 | attribute support to be enabled in advance. | ||
72 | |||
73 | If you are not using a security module, say N. | ||
74 | |||
75 | config EROFS_FS_ZIP | ||
76 | bool "EROFS Data Compression Support" | ||
77 | depends on EROFS_FS | ||
78 | select LZ4_DECOMPRESS | ||
79 | default y | ||
80 | help | ||
81 | Enable fixed-sized output compression for EROFS. | ||
82 | |||
83 | If you don't want to enable compression feature, say N. | ||
84 | |||
85 | config EROFS_FS_CLUSTER_PAGE_LIMIT | ||
86 | int "EROFS Cluster Pages Hard Limit" | ||
87 | depends on EROFS_FS_ZIP | ||
88 | range 1 256 | ||
89 | default "1" | ||
90 | help | ||
91 | Indicates the maximum number of pages of a compressed | ||
92 | physical cluster. | ||
93 | |||
94 | For example, if files in an image were compressed | ||
95 | in 8KiB units, the hard limit should not be configured | ||
96 | to less than 2. Otherwise, the image will fail | ||
97 | to mount on this kernel. | ||
98 | |||
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile new file mode 100644 index 000000000000..46f2aa4ba46c --- /dev/null +++ b/fs/erofs/Makefile | |||
@@ -0,0 +1,11 @@ | |||
1 | # SPDX-License-Identifier: GPL-2.0-only | ||
2 | |||
3 | EROFS_VERSION = "1.0" | ||
4 | |||
5 | ccflags-y += -DEROFS_VERSION=\"$(EROFS_VERSION)\" | ||
6 | |||
7 | obj-$(CONFIG_EROFS_FS) += erofs.o | ||
8 | erofs-objs := super.o inode.o data.o namei.o dir.o utils.o | ||
9 | erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o | ||
10 | erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o | ||
11 | |||
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h new file mode 100644 index 000000000000..07d279fd5d67 --- /dev/null +++ b/fs/erofs/compress.h | |||
@@ -0,0 +1,60 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0-only */ | ||
2 | /* | ||
3 | * Copyright (C) 2019 HUAWEI, Inc. | ||
4 | * http://www.huawei.com/ | ||
5 | * Created by Gao Xiang <gaoxiang25@huawei.com> | ||
6 | */ | ||
7 | #ifndef __EROFS_FS_COMPRESS_H | ||
8 | #define __EROFS_FS_COMPRESS_H | ||
9 | |||
10 | #include "internal.h" | ||
11 | |||
12 | enum { | ||
13 | Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX, | ||
14 | Z_EROFS_COMPRESSION_RUNTIME_MAX | ||
15 | }; | ||
16 | |||
17 | struct z_erofs_decompress_req { | ||
18 | struct super_block *sb; | ||
19 | struct page **in, **out; | ||
20 | |||
21 | unsigned short pageofs_out; | ||
22 | unsigned int inputsize, outputsize; | ||
23 | |||
24 | /* indicate the algorithm will be used for decompression */ | ||
25 | unsigned int alg; | ||
26 | bool inplace_io, partial_decoding; | ||
27 | }; | ||
28 | |||
29 | /* | ||
30 | * - 0x5A110C8D ('sallocated', Z_EROFS_MAPPING_STAGING) - | ||
31 | * used to mark temporarily allocated pages, as opposed to | ||
32 | * other file/cached pages and NULL-mapping pages. | ||
33 | */ | ||
34 | #define Z_EROFS_MAPPING_STAGING ((void *)0x5A110C8D) | ||
35 | |||
36 | /* check if a page is marked as staging */ | ||
37 | static inline bool z_erofs_page_is_staging(struct page *page) | ||
38 | { | ||
39 | return page->mapping == Z_EROFS_MAPPING_STAGING; | ||
40 | } | ||
41 | |||
42 | static inline bool z_erofs_put_stagingpage(struct list_head *pagepool, | ||
43 | struct page *page) | ||
44 | { | ||
45 | if (!z_erofs_page_is_staging(page)) | ||
46 | return false; | ||
47 | |||
48 | /* staging pages should not be used by others at the same time */ | ||
49 | if (page_ref_count(page) > 1) | ||
50 | put_page(page); | ||
51 | else | ||
52 | list_add(&page->lru, pagepool); | ||
53 | return true; | ||
54 | } | ||
55 | |||
56 | int z_erofs_decompress(struct z_erofs_decompress_req *rq, | ||
57 | struct list_head *pagepool); | ||
58 | |||
59 | #endif | ||
60 | |||
diff --git a/fs/erofs/data.c b/fs/erofs/data.c new file mode 100644 index 000000000000..fda16ec8863e --- /dev/null +++ b/fs/erofs/data.c | |||
@@ -0,0 +1,423 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* | ||
3 | * Copyright (C) 2017-2018 HUAWEI, Inc. | ||
4 | * http://www.huawei.com/ | ||
5 | * Created by Gao Xiang <gaoxiang25@huawei.com> | ||
6 | */ | ||
7 | #include "internal.h" | ||
8 | #include <linux/prefetch.h> | ||
9 | |||
10 | #include <trace/events/erofs.h> | ||
11 | |||
12 | static inline void read_endio(struct bio *bio) | ||
13 | { | ||
14 | struct super_block *const sb = bio->bi_private; | ||
15 | struct bio_vec *bvec; | ||
16 | blk_status_t err = bio->bi_status; | ||
17 | struct bvec_iter_all iter_all; | ||
18 | |||
19 | if (time_to_inject(EROFS_SB(sb), FAULT_READ_IO)) { | ||
20 | erofs_show_injection_info(FAULT_READ_IO); | ||
21 | err = BLK_STS_IOERR; | ||
22 | } | ||
23 | |||
24 | bio_for_each_segment_all(bvec, bio, iter_all) { | ||
25 | struct page *page = bvec->bv_page; | ||
26 | |||
27 | /* page is already locked */ | ||
28 | DBG_BUGON(PageUptodate(page)); | ||
29 | |||
30 | if (unlikely(err)) | ||
31 | SetPageError(page); | ||
32 | else | ||
33 | SetPageUptodate(page); | ||
34 | |||
35 | unlock_page(page); | ||
36 | /* page could be reclaimed now */ | ||
37 | } | ||
38 | bio_put(bio); | ||
39 | } | ||
40 | |||
41 | /* prio -- true is used for dir */ | ||
42 | struct page *__erofs_get_meta_page(struct super_block *sb, | ||
43 | erofs_blk_t blkaddr, bool prio, bool nofail) | ||
44 | { | ||
45 | struct inode *const bd_inode = sb->s_bdev->bd_inode; | ||
46 | struct address_space *const mapping = bd_inode->i_mapping; | ||
47 | /* prefer retrying in the allocator to blindly looping below */ | ||
48 | const gfp_t gfp = mapping_gfp_constraint(mapping, ~__GFP_FS) | | ||
49 | (nofail ? __GFP_NOFAIL : 0); | ||
50 | unsigned int io_retries = nofail ? EROFS_IO_MAX_RETRIES_NOFAIL : 0; | ||
51 | struct page *page; | ||
52 | int err; | ||
53 | |||
54 | repeat: | ||
55 | page = find_or_create_page(mapping, blkaddr, gfp); | ||
56 | if (unlikely(!page)) { | ||
57 | DBG_BUGON(nofail); | ||
58 | return ERR_PTR(-ENOMEM); | ||
59 | } | ||
60 | DBG_BUGON(!PageLocked(page)); | ||
61 | |||
62 | if (!PageUptodate(page)) { | ||
63 | struct bio *bio; | ||
64 | |||
65 | bio = erofs_grab_bio(sb, blkaddr, 1, sb, read_endio, nofail); | ||
66 | if (IS_ERR(bio)) { | ||
67 | DBG_BUGON(nofail); | ||
68 | err = PTR_ERR(bio); | ||
69 | goto err_out; | ||
70 | } | ||
71 | |||
72 | err = bio_add_page(bio, page, PAGE_SIZE, 0); | ||
73 | if (unlikely(err != PAGE_SIZE)) { | ||
74 | err = -EFAULT; | ||
75 | goto err_out; | ||
76 | } | ||
77 | |||
78 | __submit_bio(bio, REQ_OP_READ, | ||
79 | REQ_META | (prio ? REQ_PRIO : 0)); | ||
80 | |||
81 | lock_page(page); | ||
82 | |||
83 | /* this page has been truncated by others */ | ||
84 | if (unlikely(page->mapping != mapping)) { | ||
85 | unlock_repeat: | ||
86 | unlock_page(page); | ||
87 | put_page(page); | ||
88 | goto repeat; | ||
89 | } | ||
90 | |||
91 | /* more likely a read error */ | ||
92 | if (unlikely(!PageUptodate(page))) { | ||
93 | if (io_retries) { | ||
94 | --io_retries; | ||
95 | goto unlock_repeat; | ||
96 | } | ||
97 | err = -EIO; | ||
98 | goto err_out; | ||
99 | } | ||
100 | } | ||
101 | return page; | ||
102 | |||
103 | err_out: | ||
104 | unlock_page(page); | ||
105 | put_page(page); | ||
106 | return ERR_PTR(err); | ||
107 | } | ||
108 | |||
109 | static int erofs_map_blocks_flatmode(struct inode *inode, | ||
110 | struct erofs_map_blocks *map, | ||
111 | int flags) | ||
112 | { | ||
113 | int err = 0; | ||
114 | erofs_blk_t nblocks, lastblk; | ||
115 | u64 offset = map->m_la; | ||
116 | struct erofs_vnode *vi = EROFS_V(inode); | ||
117 | |||
118 | trace_erofs_map_blocks_flatmode_enter(inode, map, flags); | ||
119 | |||
120 | nblocks = DIV_ROUND_UP(inode->i_size, PAGE_SIZE); | ||
121 | lastblk = nblocks - is_inode_flat_inline(inode); | ||
122 | |||
123 | if (unlikely(offset >= inode->i_size)) { | ||
124 | /* leave out-of-bound access unmapped */ | ||
125 | map->m_flags = 0; | ||
126 | map->m_plen = 0; | ||
127 | goto out; | ||
128 | } | ||
129 | |||
130 | /* there is no hole in flatmode */ | ||
131 | map->m_flags = EROFS_MAP_MAPPED; | ||
132 | |||
133 | if (offset < blknr_to_addr(lastblk)) { | ||
134 | map->m_pa = blknr_to_addr(vi->raw_blkaddr) + map->m_la; | ||
135 | map->m_plen = blknr_to_addr(lastblk) - offset; | ||
136 | } else if (is_inode_flat_inline(inode)) { | ||
137 | /* 2 - inode inline B: inode, [xattrs], inline last blk... */ | ||
138 | struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); | ||
139 | |||
140 | map->m_pa = iloc(sbi, vi->nid) + vi->inode_isize + | ||
141 | vi->xattr_isize + erofs_blkoff(map->m_la); | ||
142 | map->m_plen = inode->i_size - offset; | ||
143 | |||
144 | /* inline data should be located in one meta block */ | ||
145 | if (erofs_blkoff(map->m_pa) + map->m_plen > PAGE_SIZE) { | ||
146 | errln("inline data cross block boundary @ nid %llu", | ||
147 | vi->nid); | ||
148 | DBG_BUGON(1); | ||
149 | err = -EFSCORRUPTED; | ||
150 | goto err_out; | ||
151 | } | ||
152 | |||
153 | map->m_flags |= EROFS_MAP_META; | ||
154 | } else { | ||
155 | errln("internal error @ nid: %llu (size %llu), m_la 0x%llx", | ||
156 | vi->nid, inode->i_size, map->m_la); | ||
157 | DBG_BUGON(1); | ||
158 | err = -EIO; | ||
159 | goto err_out; | ||
160 | } | ||
161 | |||
162 | out: | ||
163 | map->m_llen = map->m_plen; | ||
164 | |||
165 | err_out: | ||
166 | trace_erofs_map_blocks_flatmode_exit(inode, map, flags, 0); | ||
167 | return err; | ||
168 | } | ||
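The flat-mode mapping above is plain address arithmetic: offsets below the last full block map directly into the inode's raw block area, while the tail of a tail-inline (layout C) inode is redirected into the metadata block that holds the inode itself. A minimal userspace sketch of that arithmetic, assuming 4KiB blocks and made-up values for raw_blkaddr and the inode location:

#include <stdio.h>

#define EROFS_BLKSIZ 4096ULL

int main(void)
{
	/* hypothetical tail-inline inode: 10000-byte file starting at block 100,
	 * whose on-disk inode sits at byte offset 12416 (all values made up) */
	unsigned long long i_size = 10000, raw_blkaddr = 100;
	unsigned long long iloc = 12416, inode_isize = 64, xattr_isize = 0;
	unsigned long long nblocks = (i_size + EROFS_BLKSIZ - 1) / EROFS_BLKSIZ;
	unsigned long long lastblk = nblocks - 1;	/* last block is inline */
	unsigned long long la;

	for (la = 0; la < i_size; la += EROFS_BLKSIZ) {
		unsigned long long pa, plen;

		if (la < lastblk * EROFS_BLKSIZ) {	/* plain data blocks */
			pa = raw_blkaddr * EROFS_BLKSIZ + la;
			plen = lastblk * EROFS_BLKSIZ - la;
		} else {	/* inline tail kept in the inode's meta block */
			pa = iloc + inode_isize + xattr_isize + la % EROFS_BLKSIZ;
			plen = i_size - la;
		}
		printf("la=%llu -> pa=%llu plen=%llu\n", la, pa, plen);
	}
	return 0;
}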
169 | |||
170 | int erofs_map_blocks(struct inode *inode, | ||
171 | struct erofs_map_blocks *map, int flags) | ||
172 | { | ||
173 | if (unlikely(is_inode_layout_compression(inode))) { | ||
174 | int err = z_erofs_map_blocks_iter(inode, map, flags); | ||
175 | |||
176 | if (map->mpage) { | ||
177 | put_page(map->mpage); | ||
178 | map->mpage = NULL; | ||
179 | } | ||
180 | return err; | ||
181 | } | ||
182 | return erofs_map_blocks_flatmode(inode, map, flags); | ||
183 | } | ||
184 | |||
185 | static inline struct bio *erofs_read_raw_page(struct bio *bio, | ||
186 | struct address_space *mapping, | ||
187 | struct page *page, | ||
188 | erofs_off_t *last_block, | ||
189 | unsigned int nblocks, | ||
190 | bool ra) | ||
191 | { | ||
192 | struct inode *const inode = mapping->host; | ||
193 | struct super_block *const sb = inode->i_sb; | ||
194 | erofs_off_t current_block = (erofs_off_t)page->index; | ||
195 | int err; | ||
196 | |||
197 | DBG_BUGON(!nblocks); | ||
198 | |||
199 | if (PageUptodate(page)) { | ||
200 | err = 0; | ||
201 | goto has_updated; | ||
202 | } | ||
203 | |||
204 | /* note that for the readpage case, bio is also NULL */ | ||
205 | if (bio && | ||
206 | /* not continuous */ | ||
207 | *last_block + 1 != current_block) { | ||
208 | submit_bio_retry: | ||
209 | __submit_bio(bio, REQ_OP_READ, 0); | ||
210 | bio = NULL; | ||
211 | } | ||
212 | |||
213 | if (!bio) { | ||
214 | struct erofs_map_blocks map = { | ||
215 | .m_la = blknr_to_addr(current_block), | ||
216 | }; | ||
217 | erofs_blk_t blknr; | ||
218 | unsigned int blkoff; | ||
219 | |||
220 | err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); | ||
221 | if (unlikely(err)) | ||
222 | goto err_out; | ||
223 | |||
224 | /* zero out the holed page */ | ||
225 | if (unlikely(!(map.m_flags & EROFS_MAP_MAPPED))) { | ||
226 | zero_user_segment(page, 0, PAGE_SIZE); | ||
227 | SetPageUptodate(page); | ||
228 | |||
229 | /* imply err = 0, see erofs_map_blocks */ | ||
230 | goto has_updated; | ||
231 | } | ||
232 | |||
233 | /* for RAW access mode, m_plen must be equal to m_llen */ | ||
234 | DBG_BUGON(map.m_plen != map.m_llen); | ||
235 | |||
236 | blknr = erofs_blknr(map.m_pa); | ||
237 | blkoff = erofs_blkoff(map.m_pa); | ||
238 | |||
239 | /* deal with inline page */ | ||
240 | if (map.m_flags & EROFS_MAP_META) { | ||
241 | void *vsrc, *vto; | ||
242 | struct page *ipage; | ||
243 | |||
244 | DBG_BUGON(map.m_plen > PAGE_SIZE); | ||
245 | |||
246 | ipage = erofs_get_meta_page(inode->i_sb, blknr, 0); | ||
247 | |||
248 | if (IS_ERR(ipage)) { | ||
249 | err = PTR_ERR(ipage); | ||
250 | goto err_out; | ||
251 | } | ||
252 | |||
253 | vsrc = kmap_atomic(ipage); | ||
254 | vto = kmap_atomic(page); | ||
255 | memcpy(vto, vsrc + blkoff, map.m_plen); | ||
256 | memset(vto + map.m_plen, 0, PAGE_SIZE - map.m_plen); | ||
257 | kunmap_atomic(vto); | ||
258 | kunmap_atomic(vsrc); | ||
259 | flush_dcache_page(page); | ||
260 | |||
261 | SetPageUptodate(page); | ||
262 | /* TODO: could we unlock the page earlier? */ | ||
263 | unlock_page(ipage); | ||
264 | put_page(ipage); | ||
265 | |||
266 | /* imply err = 0, see erofs_map_blocks */ | ||
267 | goto has_updated; | ||
268 | } | ||
269 | |||
270 | /* pa must be block-aligned for raw reading */ | ||
271 | DBG_BUGON(erofs_blkoff(map.m_pa)); | ||
272 | |||
273 | /* max # of continuous pages */ | ||
274 | if (nblocks > DIV_ROUND_UP(map.m_plen, PAGE_SIZE)) | ||
275 | nblocks = DIV_ROUND_UP(map.m_plen, PAGE_SIZE); | ||
276 | if (nblocks > BIO_MAX_PAGES) | ||
277 | nblocks = BIO_MAX_PAGES; | ||
278 | |||
279 | bio = erofs_grab_bio(sb, blknr, nblocks, sb, | ||
280 | read_endio, false); | ||
281 | if (IS_ERR(bio)) { | ||
282 | err = PTR_ERR(bio); | ||
283 | bio = NULL; | ||
284 | goto err_out; | ||
285 | } | ||
286 | } | ||
287 | |||
288 | err = bio_add_page(bio, page, PAGE_SIZE, 0); | ||
289 | /* out of the extent or bio is full */ | ||
290 | if (err < PAGE_SIZE) | ||
291 | goto submit_bio_retry; | ||
292 | |||
293 | *last_block = current_block; | ||
294 | |||
295 | /* submit in advance in case it is followed by too many gaps */ | ||
296 | if (bio->bi_iter.bi_size >= bio->bi_max_vecs * PAGE_SIZE) { | ||
297 | /* err should be reassigned to 0 after submitting */ | ||
298 | err = 0; | ||
299 | goto submit_bio_out; | ||
300 | } | ||
301 | |||
302 | return bio; | ||
303 | |||
304 | err_out: | ||
305 | /* for sync reading, set page error immediately */ | ||
306 | if (!ra) { | ||
307 | SetPageError(page); | ||
308 | ClearPageUptodate(page); | ||
309 | } | ||
310 | has_updated: | ||
311 | unlock_page(page); | ||
312 | |||
313 | /* if updated manually, continuous pages have a gap */ | ||
314 | if (bio) | ||
315 | submit_bio_out: | ||
316 | __submit_bio(bio, REQ_OP_READ, 0); | ||
317 | |||
318 | return unlikely(err) ? ERR_PTR(err) : NULL; | ||
319 | } | ||
320 | |||
321 | /* | ||
322 | * since we don't have write or truncate flows, no inode | ||
323 | * locking needs to be held at the moment. | ||
324 | */ | ||
325 | static int erofs_raw_access_readpage(struct file *file, struct page *page) | ||
326 | { | ||
327 | erofs_off_t last_block; | ||
328 | struct bio *bio; | ||
329 | |||
330 | trace_erofs_readpage(page, true); | ||
331 | |||
332 | bio = erofs_read_raw_page(NULL, page->mapping, | ||
333 | page, &last_block, 1, false); | ||
334 | |||
335 | if (IS_ERR(bio)) | ||
336 | return PTR_ERR(bio); | ||
337 | |||
338 | DBG_BUGON(bio); /* since we have only one bio -- must be NULL */ | ||
339 | return 0; | ||
340 | } | ||
341 | |||
342 | static int erofs_raw_access_readpages(struct file *filp, | ||
343 | struct address_space *mapping, | ||
344 | struct list_head *pages, | ||
345 | unsigned int nr_pages) | ||
346 | { | ||
347 | erofs_off_t last_block; | ||
348 | struct bio *bio = NULL; | ||
349 | gfp_t gfp = readahead_gfp_mask(mapping); | ||
350 | struct page *page = list_last_entry(pages, struct page, lru); | ||
351 | |||
352 | trace_erofs_readpages(mapping->host, page, nr_pages, true); | ||
353 | |||
354 | for (; nr_pages; --nr_pages) { | ||
355 | page = list_entry(pages->prev, struct page, lru); | ||
356 | |||
357 | prefetchw(&page->flags); | ||
358 | list_del(&page->lru); | ||
359 | |||
360 | if (!add_to_page_cache_lru(page, mapping, page->index, gfp)) { | ||
361 | bio = erofs_read_raw_page(bio, mapping, page, | ||
362 | &last_block, nr_pages, true); | ||
363 | |||
364 | /* all the page errors are ignored when readahead */ | ||
365 | if (IS_ERR(bio)) { | ||
366 | pr_err("%s, readahead error at page %lu of nid %llu\n", | ||
367 | __func__, page->index, | ||
368 | EROFS_V(mapping->host)->nid); | ||
369 | |||
370 | bio = NULL; | ||
371 | } | ||
372 | } | ||
373 | |||
374 | /* pages could still be locked */ | ||
375 | put_page(page); | ||
376 | } | ||
377 | DBG_BUGON(!list_empty(pages)); | ||
378 | |||
379 | /* the rare case (end in gaps) */ | ||
380 | if (unlikely(bio)) | ||
381 | __submit_bio(bio, REQ_OP_READ, 0); | ||
382 | return 0; | ||
383 | } | ||
384 | |||
385 | static int erofs_get_block(struct inode *inode, sector_t iblock, | ||
386 | struct buffer_head *bh, int create) | ||
387 | { | ||
388 | struct erofs_map_blocks map = { | ||
389 | .m_la = iblock << 9, | ||
390 | }; | ||
391 | int err; | ||
392 | |||
393 | err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); | ||
394 | if (err) | ||
395 | return err; | ||
396 | |||
397 | if (map.m_flags & EROFS_MAP_MAPPED) | ||
398 | bh->b_blocknr = erofs_blknr(map.m_pa); | ||
399 | |||
400 | return err; | ||
401 | } | ||
402 | |||
403 | static sector_t erofs_bmap(struct address_space *mapping, sector_t block) | ||
404 | { | ||
405 | struct inode *inode = mapping->host; | ||
406 | |||
407 | if (is_inode_flat_inline(inode)) { | ||
408 | erofs_blk_t blks = i_size_read(inode) >> LOG_BLOCK_SIZE; | ||
409 | |||
410 | if (block >> LOG_SECTORS_PER_BLOCK >= blks) | ||
411 | return 0; | ||
412 | } | ||
413 | |||
414 | return generic_block_bmap(mapping, block, erofs_get_block); | ||
415 | } | ||
416 | |||
417 | /* for uncompressed (aligned) files and raw access for other files */ | ||
418 | const struct address_space_operations erofs_raw_access_aops = { | ||
419 | .readpage = erofs_raw_access_readpage, | ||
420 | .readpages = erofs_raw_access_readpages, | ||
421 | .bmap = erofs_bmap, | ||
422 | }; | ||
423 | |||
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c new file mode 100644 index 000000000000..5f4b7f302863 --- /dev/null +++ b/fs/erofs/decompressor.c | |||
@@ -0,0 +1,358 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* | ||
3 | * Copyright (C) 2019 HUAWEI, Inc. | ||
4 | * http://www.huawei.com/ | ||
5 | * Created by Gao Xiang <gaoxiang25@huawei.com> | ||
6 | */ | ||
7 | #include "compress.h" | ||
8 | #include <linux/module.h> | ||
9 | #include <linux/lz4.h> | ||
10 | |||
11 | #ifndef LZ4_DISTANCE_MAX /* history window size */ | ||
12 | #define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */ | ||
13 | #endif | ||
14 | |||
15 | #define LZ4_MAX_DISTANCE_PAGES (DIV_ROUND_UP(LZ4_DISTANCE_MAX, PAGE_SIZE) + 1) | ||
16 | #ifndef LZ4_DECOMPRESS_INPLACE_MARGIN | ||
17 | #define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32) | ||
18 | #endif | ||
19 | |||
20 | struct z_erofs_decompressor { | ||
21 | /* | ||
22 | * if destpages have sparse (missing) pages, fill them with bounce pages. | ||
23 | * it also checks whether destpages form contiguous physical memory. | ||
24 | */ | ||
25 | int (*prepare_destpages)(struct z_erofs_decompress_req *rq, | ||
26 | struct list_head *pagepool); | ||
27 | int (*decompress)(struct z_erofs_decompress_req *rq, u8 *out); | ||
28 | char *name; | ||
29 | }; | ||
30 | |||
31 | static bool use_vmap; | ||
32 | module_param(use_vmap, bool, 0444); | ||
33 | MODULE_PARM_DESC(use_vmap, "Use vmap() instead of vm_map_ram() (default 0)"); | ||
34 | |||
35 | static int lz4_prepare_destpages(struct z_erofs_decompress_req *rq, | ||
36 | struct list_head *pagepool) | ||
37 | { | ||
38 | const unsigned int nr = | ||
39 | PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; | ||
40 | struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL }; | ||
41 | unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES, | ||
42 | BITS_PER_LONG)] = { 0 }; | ||
43 | void *kaddr = NULL; | ||
44 | unsigned int i, j, top; | ||
45 | |||
46 | top = 0; | ||
47 | for (i = j = 0; i < nr; ++i, ++j) { | ||
48 | struct page *const page = rq->out[i]; | ||
49 | struct page *victim; | ||
50 | |||
51 | if (j >= LZ4_MAX_DISTANCE_PAGES) | ||
52 | j = 0; | ||
53 | |||
54 | /* 'valid' bounced can only be tested after a complete round */ | ||
55 | if (test_bit(j, bounced)) { | ||
56 | DBG_BUGON(i < LZ4_MAX_DISTANCE_PAGES); | ||
57 | DBG_BUGON(top >= LZ4_MAX_DISTANCE_PAGES); | ||
58 | availables[top++] = rq->out[i - LZ4_MAX_DISTANCE_PAGES]; | ||
59 | } | ||
60 | |||
61 | if (page) { | ||
62 | __clear_bit(j, bounced); | ||
63 | if (kaddr) { | ||
64 | if (kaddr + PAGE_SIZE == page_address(page)) | ||
65 | kaddr += PAGE_SIZE; | ||
66 | else | ||
67 | kaddr = NULL; | ||
68 | } else if (!i) { | ||
69 | kaddr = page_address(page); | ||
70 | } | ||
71 | continue; | ||
72 | } | ||
73 | kaddr = NULL; | ||
74 | __set_bit(j, bounced); | ||
75 | |||
76 | if (top) { | ||
77 | victim = availables[--top]; | ||
78 | get_page(victim); | ||
79 | } else { | ||
80 | victim = erofs_allocpage(pagepool, GFP_KERNEL, false); | ||
81 | if (unlikely(!victim)) | ||
82 | return -ENOMEM; | ||
83 | victim->mapping = Z_EROFS_MAPPING_STAGING; | ||
84 | } | ||
85 | rq->out[i] = victim; | ||
86 | } | ||
87 | return kaddr ? 1 : 0; | ||
88 | } | ||
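LZ4_MAX_DISTANCE_PAGES above bounds how many of the most recent output pages an LZ4 match may still reference: the 64KiB history window rounded up to pages, plus one for a non-page-aligned start. A quick sanity check of that arithmetic, assuming 4KiB pages:

#include <stdio.h>

#define LZ4_DISTANCE_MAX 65535			/* LZ4 history window, in bytes */
#define PAGE_SIZE 4096
#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))

int main(void)
{
	/* 65535 bytes cover at most 16 pages when aligned; +1 for misalignment */
	int max_distance_pages = DIV_ROUND_UP(LZ4_DISTANCE_MAX, PAGE_SIZE) + 1;

	printf("LZ4_MAX_DISTANCE_PAGES = %d\n", max_distance_pages);	/* 17 */
	return 0;
}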
89 | |||
90 | static void *generic_copy_inplace_data(struct z_erofs_decompress_req *rq, | ||
91 | u8 *src, unsigned int pageofs_in) | ||
92 | { | ||
93 | /* | ||
94 | * if in-place decompression is ongoing, the compressed data should | ||
95 | * be copied out first so the decompressed output doesn't overwrite it. | ||
96 | */ | ||
97 | struct page **in = rq->in; | ||
98 | u8 *const tmp = erofs_get_pcpubuf(0); | ||
99 | u8 *tmpp = tmp; | ||
100 | unsigned int inlen = rq->inputsize - pageofs_in; | ||
101 | unsigned int count = min_t(uint, inlen, PAGE_SIZE - pageofs_in); | ||
102 | |||
103 | while (tmpp < tmp + inlen) { | ||
104 | if (!src) | ||
105 | src = kmap_atomic(*in); | ||
106 | memcpy(tmpp, src + pageofs_in, count); | ||
107 | kunmap_atomic(src); | ||
108 | src = NULL; | ||
109 | tmpp += count; | ||
110 | pageofs_in = 0; | ||
111 | count = PAGE_SIZE; | ||
112 | ++in; | ||
113 | } | ||
114 | return tmp; | ||
115 | } | ||
116 | |||
117 | static int lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) | ||
118 | { | ||
119 | unsigned int inputmargin, inlen; | ||
120 | u8 *src; | ||
121 | bool copied, support_0padding; | ||
122 | int ret; | ||
123 | |||
124 | if (rq->inputsize > PAGE_SIZE) | ||
125 | return -EOPNOTSUPP; | ||
126 | |||
127 | src = kmap_atomic(*rq->in); | ||
128 | inputmargin = 0; | ||
129 | support_0padding = false; | ||
130 | |||
131 | /* in-place decompression is only safe when 0padding is enabled */ | ||
132 | if (EROFS_SB(rq->sb)->requirements & EROFS_REQUIREMENT_LZ4_0PADDING) { | ||
133 | support_0padding = true; | ||
134 | |||
135 | while (!src[inputmargin & ~PAGE_MASK]) | ||
136 | if (!(++inputmargin & ~PAGE_MASK)) | ||
137 | break; | ||
138 | |||
139 | if (inputmargin >= rq->inputsize) { | ||
140 | kunmap_atomic(src); | ||
141 | return -EIO; | ||
142 | } | ||
143 | } | ||
144 | |||
145 | copied = false; | ||
146 | inlen = rq->inputsize - inputmargin; | ||
147 | if (rq->inplace_io) { | ||
148 | const uint oend = (rq->pageofs_out + | ||
149 | rq->outputsize) & ~PAGE_MASK; | ||
150 | const uint nr = PAGE_ALIGN(rq->pageofs_out + | ||
151 | rq->outputsize) >> PAGE_SHIFT; | ||
152 | |||
153 | if (rq->partial_decoding || !support_0padding || | ||
154 | rq->out[nr - 1] != rq->in[0] || | ||
155 | rq->inputsize - oend < | ||
156 | LZ4_DECOMPRESS_INPLACE_MARGIN(inlen)) { | ||
157 | src = generic_copy_inplace_data(rq, src, inputmargin); | ||
158 | inputmargin = 0; | ||
159 | copied = true; | ||
160 | } | ||
161 | } | ||
162 | |||
163 | ret = LZ4_decompress_safe_partial(src + inputmargin, out, | ||
164 | inlen, rq->outputsize, | ||
165 | rq->outputsize); | ||
166 | if (ret < 0) { | ||
167 | errln("%s, failed to decompress, in[%p, %u, %u] out[%p, %u]", | ||
168 | __func__, src + inputmargin, inlen, inputmargin, | ||
169 | out, rq->outputsize); | ||
170 | WARN_ON(1); | ||
171 | print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET, | ||
172 | 16, 1, src + inputmargin, inlen, true); | ||
173 | print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET, | ||
174 | 16, 1, out, rq->outputsize, true); | ||
175 | ret = -EIO; | ||
176 | } | ||
177 | |||
178 | if (copied) | ||
179 | erofs_put_pcpubuf(src); | ||
180 | else | ||
181 | kunmap_atomic(src); | ||
182 | return ret; | ||
183 | } | ||
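With the LZ4_0PADDING requirement, a compressed block is padded at the front with zero bytes, so the scan in lz4_decompress() simply skips leading zeroes (within one page) to find where the compressed stream actually starts. A standalone illustration of that scan, assuming a 4KiB block with 100 bytes of hypothetical padding:

#include <stdio.h>
#include <string.h>

#define PAGE_SIZE 4096UL
#define PAGE_MASK (~(PAGE_SIZE - 1))

int main(void)
{
	unsigned char blk[PAGE_SIZE];
	unsigned int inputmargin = 0;

	/* hypothetical block: 100 bytes of 0-padding, then compressed data */
	memset(blk, 0, sizeof(blk));
	memset(blk + 100, 0xAB, sizeof(blk) - 100);

	/* same scan as lz4_decompress(): stop at the first non-zero byte
	 * or when the offset wraps past the page */
	while (!blk[inputmargin & ~PAGE_MASK])
		if (!(++inputmargin & ~PAGE_MASK))
			break;

	printf("compressed data starts at offset %u\n", inputmargin);	/* 100 */
	return 0;
}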
184 | |||
185 | static struct z_erofs_decompressor decompressors[] = { | ||
186 | [Z_EROFS_COMPRESSION_SHIFTED] = { | ||
187 | .name = "shifted" | ||
188 | }, | ||
189 | [Z_EROFS_COMPRESSION_LZ4] = { | ||
190 | .prepare_destpages = lz4_prepare_destpages, | ||
191 | .decompress = lz4_decompress, | ||
192 | .name = "lz4" | ||
193 | }, | ||
194 | }; | ||
195 | |||
196 | static void copy_from_pcpubuf(struct page **out, const char *dst, | ||
197 | unsigned short pageofs_out, | ||
198 | unsigned int outputsize) | ||
199 | { | ||
200 | const char *end = dst + outputsize; | ||
201 | const unsigned int righthalf = PAGE_SIZE - pageofs_out; | ||
202 | const char *cur = dst - pageofs_out; | ||
203 | |||
204 | while (cur < end) { | ||
205 | struct page *const page = *out++; | ||
206 | |||
207 | if (page) { | ||
208 | char *buf = kmap_atomic(page); | ||
209 | |||
210 | if (cur >= dst) { | ||
211 | memcpy(buf, cur, min_t(uint, PAGE_SIZE, | ||
212 | end - cur)); | ||
213 | } else { | ||
214 | memcpy(buf + pageofs_out, cur + pageofs_out, | ||
215 | min_t(uint, righthalf, end - cur)); | ||
216 | } | ||
217 | kunmap_atomic(buf); | ||
218 | } | ||
219 | cur += PAGE_SIZE; | ||
220 | } | ||
221 | } | ||
222 | |||
223 | static void *erofs_vmap(struct page **pages, unsigned int count) | ||
224 | { | ||
225 | int i = 0; | ||
226 | |||
227 | if (use_vmap) | ||
228 | return vmap(pages, count, VM_MAP, PAGE_KERNEL); | ||
229 | |||
230 | while (1) { | ||
231 | void *addr = vm_map_ram(pages, count, -1, PAGE_KERNEL); | ||
232 | |||
233 | /* retry two more times (3 times in total) */ | ||
234 | if (addr || ++i >= 3) | ||
235 | return addr; | ||
236 | vm_unmap_aliases(); | ||
237 | } | ||
238 | return NULL; | ||
239 | } | ||
240 | |||
241 | static void erofs_vunmap(const void *mem, unsigned int count) | ||
242 | { | ||
243 | if (!use_vmap) | ||
244 | vm_unmap_ram(mem, count); | ||
245 | else | ||
246 | vunmap(mem); | ||
247 | } | ||
248 | |||
249 | static int decompress_generic(struct z_erofs_decompress_req *rq, | ||
250 | struct list_head *pagepool) | ||
251 | { | ||
252 | const unsigned int nrpages_out = | ||
253 | PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; | ||
254 | const struct z_erofs_decompressor *alg = decompressors + rq->alg; | ||
255 | unsigned int dst_maptype; | ||
256 | void *dst; | ||
257 | int ret; | ||
258 | |||
259 | if (nrpages_out == 1 && !rq->inplace_io) { | ||
260 | DBG_BUGON(!*rq->out); | ||
261 | dst = kmap_atomic(*rq->out); | ||
262 | dst_maptype = 0; | ||
263 | goto dstmap_out; | ||
264 | } | ||
265 | |||
266 | /* | ||
267 | * For small output sizes (especially much less than | ||
268 | * PAGE_SIZE), it is preferable to memcpy the decompressed | ||
269 | * data rather than the compressed data. | ||
270 | */ | ||
271 | if (rq->outputsize <= PAGE_SIZE * 7 / 8) { | ||
272 | dst = erofs_get_pcpubuf(0); | ||
273 | if (IS_ERR(dst)) | ||
274 | return PTR_ERR(dst); | ||
275 | |||
276 | rq->inplace_io = false; | ||
277 | ret = alg->decompress(rq, dst); | ||
278 | if (!ret) | ||
279 | copy_from_pcpubuf(rq->out, dst, rq->pageofs_out, | ||
280 | rq->outputsize); | ||
281 | |||
282 | erofs_put_pcpubuf(dst); | ||
283 | return ret; | ||
284 | } | ||
285 | |||
286 | ret = alg->prepare_destpages(rq, pagepool); | ||
287 | if (ret < 0) { | ||
288 | return ret; | ||
289 | } else if (ret) { | ||
290 | dst = page_address(*rq->out); | ||
291 | dst_maptype = 1; | ||
292 | goto dstmap_out; | ||
293 | } | ||
294 | |||
295 | dst = erofs_vmap(rq->out, nrpages_out); | ||
296 | if (!dst) | ||
297 | return -ENOMEM; | ||
298 | dst_maptype = 2; | ||
299 | |||
300 | dstmap_out: | ||
301 | ret = alg->decompress(rq, dst + rq->pageofs_out); | ||
302 | |||
303 | if (!dst_maptype) | ||
304 | kunmap_atomic(dst); | ||
305 | else if (dst_maptype == 2) | ||
306 | erofs_vunmap(dst, nrpages_out); | ||
307 | return ret; | ||
308 | } | ||
309 | |||
310 | static int shifted_decompress(const struct z_erofs_decompress_req *rq, | ||
311 | struct list_head *pagepool) | ||
312 | { | ||
313 | const unsigned int nrpages_out = | ||
314 | PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; | ||
315 | const unsigned int righthalf = PAGE_SIZE - rq->pageofs_out; | ||
316 | unsigned char *src, *dst; | ||
317 | |||
318 | if (nrpages_out > 2) { | ||
319 | DBG_BUGON(1); | ||
320 | return -EIO; | ||
321 | } | ||
322 | |||
323 | if (rq->out[0] == *rq->in) { | ||
324 | DBG_BUGON(nrpages_out != 1); | ||
325 | return 0; | ||
326 | } | ||
327 | |||
328 | src = kmap_atomic(*rq->in); | ||
329 | if (!rq->out[0]) { | ||
330 | dst = NULL; | ||
331 | } else { | ||
332 | dst = kmap_atomic(rq->out[0]); | ||
333 | memcpy(dst + rq->pageofs_out, src, righthalf); | ||
334 | } | ||
335 | |||
336 | if (rq->out[1] == *rq->in) { | ||
337 | memmove(src, src + righthalf, rq->pageofs_out); | ||
338 | } else if (nrpages_out == 2) { | ||
339 | if (dst) | ||
340 | kunmap_atomic(dst); | ||
341 | DBG_BUGON(!rq->out[1]); | ||
342 | dst = kmap_atomic(rq->out[1]); | ||
343 | memcpy(dst, src + righthalf, rq->pageofs_out); | ||
344 | } | ||
345 | if (dst) | ||
346 | kunmap_atomic(dst); | ||
347 | kunmap_atomic(src); | ||
348 | return 0; | ||
349 | } | ||
350 | |||
351 | int z_erofs_decompress(struct z_erofs_decompress_req *rq, | ||
352 | struct list_head *pagepool) | ||
353 | { | ||
354 | if (rq->alg == Z_EROFS_COMPRESSION_SHIFTED) | ||
355 | return shifted_decompress(rq, pagepool); | ||
356 | return decompress_generic(rq, pagepool); | ||
357 | } | ||
358 | |||
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c new file mode 100644 index 000000000000..1976e60e5174 --- /dev/null +++ b/fs/erofs/dir.c | |||
@@ -0,0 +1,139 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* | ||
3 | * Copyright (C) 2017-2018 HUAWEI, Inc. | ||
4 | * http://www.huawei.com/ | ||
5 | * Created by Gao Xiang <gaoxiang25@huawei.com> | ||
6 | */ | ||
7 | #include "internal.h" | ||
8 | |||
9 | static void debug_one_dentry(unsigned char d_type, const char *de_name, | ||
10 | unsigned int de_namelen) | ||
11 | { | ||
12 | #ifdef CONFIG_EROFS_FS_DEBUG | ||
13 | /* since the on-disk name has no trailing '\0' */ | ||
14 | unsigned char dbg_namebuf[EROFS_NAME_LEN + 1]; | ||
15 | |||
16 | memcpy(dbg_namebuf, de_name, de_namelen); | ||
17 | dbg_namebuf[de_namelen] = '\0'; | ||
18 | |||
19 | debugln("found dirent %s de_len %u d_type %d", dbg_namebuf, | ||
20 | de_namelen, d_type); | ||
21 | #endif | ||
22 | } | ||
23 | |||
24 | static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, | ||
25 | void *dentry_blk, unsigned int *ofs, | ||
26 | unsigned int nameoff, unsigned int maxsize) | ||
27 | { | ||
28 | struct erofs_dirent *de = dentry_blk + *ofs; | ||
29 | const struct erofs_dirent *end = dentry_blk + nameoff; | ||
30 | |||
31 | while (de < end) { | ||
32 | const char *de_name; | ||
33 | unsigned int de_namelen; | ||
34 | unsigned char d_type; | ||
35 | |||
36 | d_type = fs_ftype_to_dtype(de->file_type); | ||
37 | |||
38 | nameoff = le16_to_cpu(de->nameoff); | ||
39 | de_name = (char *)dentry_blk + nameoff; | ||
40 | |||
41 | /* the last dirent in the block? */ | ||
42 | if (de + 1 >= end) | ||
43 | de_namelen = strnlen(de_name, maxsize - nameoff); | ||
44 | else | ||
45 | de_namelen = le16_to_cpu(de[1].nameoff) - nameoff; | ||
46 | |||
47 | /* a corrupted entry is found */ | ||
48 | if (unlikely(nameoff + de_namelen > maxsize || | ||
49 | de_namelen > EROFS_NAME_LEN)) { | ||
50 | errln("bogus dirent @ nid %llu", EROFS_V(dir)->nid); | ||
51 | DBG_BUGON(1); | ||
52 | return -EFSCORRUPTED; | ||
53 | } | ||
54 | |||
55 | debug_one_dentry(d_type, de_name, de_namelen); | ||
56 | if (!dir_emit(ctx, de_name, de_namelen, | ||
57 | le64_to_cpu(de->nid), d_type)) | ||
58 | /* stopped for some reason */ | ||
59 | return 1; | ||
60 | ++de; | ||
61 | *ofs += sizeof(struct erofs_dirent); | ||
62 | } | ||
63 | *ofs = maxsize; | ||
64 | return 0; | ||
65 | } | ||
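Each directory block stores the fixed-size erofs_dirent records first and the (non-NUL-terminated) names afterwards, so a name's length is just the gap to the next entry's nameoff, or up to the block end for the last entry. A minimal userspace parser over a hand-built block, assuming the 12-byte dirent layout from erofs_fs.h and a little-endian host (hypothetical illustration, not in-tree code):

#include <stdio.h>
#include <string.h>
#include <stdint.h>

/* mirrors the 12-byte on-disk dirent layout from erofs_fs.h */
struct dirent_disk {
	uint64_t nid;
	uint16_t nameoff;
	uint8_t  file_type;
	uint8_t  reserved;
} __attribute__((packed));

int main(void)
{
	unsigned char blk[4096] = {0};
	struct dirent_disk *de = (struct dirent_disk *)blk;
	unsigned int maxsize = sizeof(blk);
	unsigned int nr, i;

	/* two entries: "foo" and "barbaz"; names are packed after the dirents */
	de[0] = (struct dirent_disk){ .nid = 37, .nameoff = 24, .file_type = 1 };
	de[1] = (struct dirent_disk){ .nid = 38, .nameoff = 27, .file_type = 2 };
	memcpy(blk + 24, "foo", 3);
	memcpy(blk + 27, "barbaz", 6);

	/* de[0].nameoff marks where the names begin, i.e. the end of the dirents */
	nr = de[0].nameoff / sizeof(*de);
	for (i = 0; i < nr; i++) {
		const char *name = (const char *)blk + de[i].nameoff;
		unsigned int len;

		if (i + 1 < nr)		/* length is the gap to the next name */
			len = de[i + 1].nameoff - de[i].nameoff;
		else			/* last entry: scan up to the block end */
			len = strnlen(name, maxsize - de[i].nameoff);

		printf("nid=%llu type=%u name=%.*s\n",
		       (unsigned long long)de[i].nid, de[i].file_type,
		       (int)len, name);
	}
	return 0;
}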
66 | |||
67 | static int erofs_readdir(struct file *f, struct dir_context *ctx) | ||
68 | { | ||
69 | struct inode *dir = file_inode(f); | ||
70 | struct address_space *mapping = dir->i_mapping; | ||
71 | const size_t dirsize = i_size_read(dir); | ||
72 | unsigned int i = ctx->pos / EROFS_BLKSIZ; | ||
73 | unsigned int ofs = ctx->pos % EROFS_BLKSIZ; | ||
74 | int err = 0; | ||
75 | bool initial = true; | ||
76 | |||
77 | while (ctx->pos < dirsize) { | ||
78 | struct page *dentry_page; | ||
79 | struct erofs_dirent *de; | ||
80 | unsigned int nameoff, maxsize; | ||
81 | |||
82 | dentry_page = read_mapping_page(mapping, i, NULL); | ||
83 | if (dentry_page == ERR_PTR(-ENOMEM)) { | ||
84 | err = -ENOMEM; | ||
85 | break; | ||
86 | } else if (IS_ERR(dentry_page)) { | ||
87 | errln("fail to readdir of logical block %u of nid %llu", | ||
88 | i, EROFS_V(dir)->nid); | ||
89 | err = -EFSCORRUPTED; | ||
90 | break; | ||
91 | } | ||
92 | |||
93 | de = (struct erofs_dirent *)kmap(dentry_page); | ||
94 | |||
95 | nameoff = le16_to_cpu(de->nameoff); | ||
96 | |||
97 | if (unlikely(nameoff < sizeof(struct erofs_dirent) || | ||
98 | nameoff >= PAGE_SIZE)) { | ||
99 | errln("%s, invalid de[0].nameoff %u @ nid %llu", | ||
100 | __func__, nameoff, EROFS_V(dir)->nid); | ||
101 | err = -EFSCORRUPTED; | ||
102 | goto skip_this; | ||
103 | } | ||
104 | |||
105 | maxsize = min_t(unsigned int, | ||
106 | dirsize - ctx->pos + ofs, PAGE_SIZE); | ||
107 | |||
108 | /* start searching dirents from an arbitrary position */ | ||
109 | if (unlikely(initial)) { | ||
110 | initial = false; | ||
111 | |||
112 | ofs = roundup(ofs, sizeof(struct erofs_dirent)); | ||
113 | if (unlikely(ofs >= nameoff)) | ||
114 | goto skip_this; | ||
115 | } | ||
116 | |||
117 | err = erofs_fill_dentries(dir, ctx, de, &ofs, | ||
118 | nameoff, maxsize); | ||
119 | skip_this: | ||
120 | kunmap(dentry_page); | ||
121 | |||
122 | put_page(dentry_page); | ||
123 | |||
124 | ctx->pos = blknr_to_addr(i) + ofs; | ||
125 | |||
126 | if (unlikely(err)) | ||
127 | break; | ||
128 | ++i; | ||
129 | ofs = 0; | ||
130 | } | ||
131 | return err < 0 ? err : 0; | ||
132 | } | ||
133 | |||
134 | const struct file_operations erofs_dir_fops = { | ||
135 | .llseek = generic_file_llseek, | ||
136 | .read = generic_read_dir, | ||
137 | .iterate_shared = erofs_readdir, | ||
138 | }; | ||
139 | |||
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h new file mode 100644 index 000000000000..afa7d45ca958 --- /dev/null +++ b/fs/erofs/erofs_fs.h | |||
@@ -0,0 +1,307 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0-only OR Apache-2.0 */ | ||
2 | /* | ||
3 | * Copyright (C) 2017-2018 HUAWEI, Inc. | ||
4 | * http://www.huawei.com/ | ||
5 | * Created by Gao Xiang <gaoxiang25@huawei.com> | ||
6 | */ | ||
7 | #ifndef __EROFS_FS_H | ||
8 | #define __EROFS_FS_H | ||
9 | |||
10 | /* Enhanced (Extended) ROM File System */ | ||
11 | #define EROFS_SUPER_OFFSET 1024 | ||
12 | |||
13 | /* | ||
14 | * Any bits that aren't in EROFS_ALL_REQUIREMENTS should be | ||
15 | * incompatible with this kernel version. | ||
16 | */ | ||
17 | #define EROFS_REQUIREMENT_LZ4_0PADDING 0x00000001 | ||
18 | #define EROFS_ALL_REQUIREMENTS EROFS_REQUIREMENT_LZ4_0PADDING | ||
19 | |||
20 | struct erofs_super_block { | ||
21 | /* 0 */__le32 magic; /* in the little endian */ | ||
22 | /* 4 */__le32 checksum; /* crc32c(super_block) */ | ||
23 | /* 8 */__le32 features; /* (aka. feature_compat) */ | ||
24 | /* 12 */__u8 blkszbits; /* support block_size == PAGE_SIZE only */ | ||
25 | /* 13 */__u8 reserved; | ||
26 | |||
27 | /* 14 */__le16 root_nid; | ||
28 | /* 16 */__le64 inos; /* total valid ino # (== f_files - f_favail) */ | ||
29 | |||
30 | /* 24 */__le64 build_time; /* inode v1 time derivation */ | ||
31 | /* 32 */__le32 build_time_nsec; | ||
32 | /* 36 */__le32 blocks; /* used for statfs */ | ||
33 | /* 40 */__le32 meta_blkaddr; | ||
34 | /* 44 */__le32 xattr_blkaddr; | ||
35 | /* 48 */__u8 uuid[16]; /* 128-bit uuid for volume */ | ||
36 | /* 64 */__u8 volume_name[16]; /* volume name */ | ||
37 | /* 80 */__le32 requirements; /* (aka. feature_incompat) */ | ||
38 | |||
39 | /* 84 */__u8 reserved2[44]; | ||
40 | } __packed; /* 128 bytes */ | ||
41 | |||
42 | /* | ||
43 | * erofs inode data mapping: | ||
44 | * 0 - inode plain without inline data A: | ||
45 | * inode, [xattrs], ... | ... | no-holed data | ||
46 | * 1 - inode VLE compression B (legacy): | ||
47 | * inode, [xattrs], extents ... | ... | ||
48 | * 2 - inode plain with inline data C: | ||
49 | * inode, [xattrs], last_inline_data, ... | ... | no-holed data | ||
50 | * 3 - inode compression D: | ||
51 | * inode, [xattrs], map_header, extents ... | ... | ||
52 | * 4~7 - reserved | ||
53 | */ | ||
54 | enum { | ||
55 | EROFS_INODE_FLAT_PLAIN, | ||
56 | EROFS_INODE_FLAT_COMPRESSION_LEGACY, | ||
57 | EROFS_INODE_FLAT_INLINE, | ||
58 | EROFS_INODE_FLAT_COMPRESSION, | ||
59 | EROFS_INODE_LAYOUT_MAX | ||
60 | }; | ||
61 | |||
62 | static inline bool erofs_inode_is_data_compressed(unsigned int datamode) | ||
63 | { | ||
64 | if (datamode == EROFS_INODE_FLAT_COMPRESSION) | ||
65 | return true; | ||
66 | return datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY; | ||
67 | } | ||
68 | |||
69 | /* bit definitions of inode i_advise */ | ||
70 | #define EROFS_I_VERSION_BITS 1 | ||
71 | #define EROFS_I_DATA_MAPPING_BITS 3 | ||
72 | |||
73 | #define EROFS_I_VERSION_BIT 0 | ||
74 | #define EROFS_I_DATA_MAPPING_BIT 1 | ||
75 | |||
76 | struct erofs_inode_v1 { | ||
77 | /* 0 */__le16 i_advise; | ||
78 | |||
79 | /* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ | ||
80 | /* 2 */__le16 i_xattr_icount; | ||
81 | /* 4 */__le16 i_mode; | ||
82 | /* 6 */__le16 i_nlink; | ||
83 | /* 8 */__le32 i_size; | ||
84 | /* 12 */__le32 i_reserved; | ||
85 | /* 16 */union { | ||
86 | /* file total compressed blocks for data mapping 1 */ | ||
87 | __le32 compressed_blocks; | ||
88 | __le32 raw_blkaddr; | ||
89 | |||
90 | /* for device files, used to indicate old/new device # */ | ||
91 | __le32 rdev; | ||
92 | } i_u __packed; | ||
93 | /* 20 */__le32 i_ino; /* only used for 32-bit stat compatibility */ | ||
94 | /* 24 */__le16 i_uid; | ||
95 | /* 26 */__le16 i_gid; | ||
96 | /* 28 */__le32 i_reserved2; | ||
97 | } __packed; | ||
98 | |||
99 | /* 32 bytes on-disk inode */ | ||
100 | #define EROFS_INODE_LAYOUT_V1 0 | ||
101 | /* 64 bytes on-disk inode */ | ||
102 | #define EROFS_INODE_LAYOUT_V2 1 | ||
103 | |||
104 | struct erofs_inode_v2 { | ||
105 | /* 0 */__le16 i_advise; | ||
106 | |||
107 | /* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ | ||
108 | /* 2 */__le16 i_xattr_icount; | ||
109 | /* 4 */__le16 i_mode; | ||
110 | /* 6 */__le16 i_reserved; | ||
111 | /* 8 */__le64 i_size; | ||
112 | /* 16 */union { | ||
113 | /* file total compressed blocks for data mapping 1 */ | ||
114 | __le32 compressed_blocks; | ||
115 | __le32 raw_blkaddr; | ||
116 | |||
117 | /* for device files, used to indicate old/new device # */ | ||
118 | __le32 rdev; | ||
119 | } i_u __packed; | ||
120 | |||
121 | /* only used for 32-bit stat compatibility */ | ||
122 | /* 20 */__le32 i_ino; | ||
123 | |||
124 | /* 24 */__le32 i_uid; | ||
125 | /* 28 */__le32 i_gid; | ||
126 | /* 32 */__le64 i_ctime; | ||
127 | /* 40 */__le32 i_ctime_nsec; | ||
128 | /* 44 */__le32 i_nlink; | ||
129 | /* 48 */__u8 i_reserved2[16]; | ||
130 | } __packed; /* 64 bytes */ | ||
131 | |||
132 | #define EROFS_MAX_SHARED_XATTRS (128) | ||
133 | /* h_shared_count between 129 ... 255 are special # */ | ||
134 | #define EROFS_SHARED_XATTR_EXTENT (255) | ||
135 | |||
136 | /* | ||
137 | * inline xattrs (n == i_xattr_icount): | ||
138 | * erofs_xattr_ibody_header(1) + (n - 1) * 4 bytes | ||
139 | * 12 bytes / \ | ||
140 | * / \ | ||
141 | * /-----------------------\ | ||
142 | * | erofs_xattr_entries+ | | ||
143 | * +-----------------------+ | ||
144 | * inline xattrs must start with erofs_xattr_ibody_header; | ||
145 | * for a read-only fs, there is no need to introduce h_refcount | ||
146 | */ | ||
147 | struct erofs_xattr_ibody_header { | ||
148 | __le32 h_reserved; | ||
149 | __u8 h_shared_count; | ||
150 | __u8 h_reserved2[7]; | ||
151 | __le32 h_shared_xattrs[0]; /* shared xattr id array */ | ||
152 | } __packed; | ||
153 | |||
154 | /* Name indexes */ | ||
155 | #define EROFS_XATTR_INDEX_USER 1 | ||
156 | #define EROFS_XATTR_INDEX_POSIX_ACL_ACCESS 2 | ||
157 | #define EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT 3 | ||
158 | #define EROFS_XATTR_INDEX_TRUSTED 4 | ||
159 | #define EROFS_XATTR_INDEX_LUSTRE 5 | ||
160 | #define EROFS_XATTR_INDEX_SECURITY 6 | ||
161 | |||
162 | /* xattr entry (for both inline & shared xattrs) */ | ||
163 | struct erofs_xattr_entry { | ||
164 | __u8 e_name_len; /* length of name */ | ||
165 | __u8 e_name_index; /* attribute name index */ | ||
166 | __le16 e_value_size; /* size of attribute value */ | ||
167 | /* followed by e_name and e_value */ | ||
168 | char e_name[0]; /* attribute name */ | ||
169 | } __packed; | ||
170 | |||
171 | #define ondisk_xattr_ibody_size(count) ({\ | ||
172 | u32 __count = le16_to_cpu(count); \ | ||
173 | ((__count) == 0) ? 0 : \ | ||
174 | sizeof(struct erofs_xattr_ibody_header) + \ | ||
175 | sizeof(__u32) * ((__count) - 1); }) | ||
176 | |||
177 | #define EROFS_XATTR_ALIGN(size) round_up(size, sizeof(struct erofs_xattr_entry)) | ||
178 | #define EROFS_XATTR_ENTRY_SIZE(entry) EROFS_XATTR_ALIGN( \ | ||
179 | sizeof(struct erofs_xattr_entry) + \ | ||
180 | (entry)->e_name_len + le16_to_cpu((entry)->e_value_size)) | ||
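The inline xattr area size therefore comes straight from i_xattr_icount, and each entry is padded to a 4-byte boundary. A worked example, assuming a hypothetical inode with i_xattr_icount = 3 and a single inline entry "user.foo" = "bar" (only "foo" is stored; the "user." prefix becomes the name index):

#include <stdio.h>

/* mirrors the on-disk formulas above (host-endian values for brevity) */
#define IBODY_HEADER_SIZE 12			/* sizeof(erofs_xattr_ibody_header) */
#define ENTRY_HEADER_SIZE 4			/* sizeof(erofs_xattr_entry) */
#define XATTR_ALIGN(sz)   (((sz) + 3) & ~3)	/* round up to 4 bytes */

int main(void)
{
	unsigned int icount = 3;		/* hypothetical i_xattr_icount */
	unsigned int ibody = icount ?
		IBODY_HEADER_SIZE + 4 * (icount - 1) : 0;

	/* one inline entry: stored name "foo" (index USER), value "bar" */
	unsigned int name_len = 3, value_size = 3;
	unsigned int entry = XATTR_ALIGN(ENTRY_HEADER_SIZE + name_len + value_size);

	printf("xattr_isize = %u bytes\n", ibody);	/* 12 + 4*2 = 20 */
	printf("entry size  = %u bytes\n", entry);	/* align(4+3+3) = 12 */
	return 0;
}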
181 | |||
182 | /* available compression algorithm types */ | ||
183 | enum { | ||
184 | Z_EROFS_COMPRESSION_LZ4, | ||
185 | Z_EROFS_COMPRESSION_MAX | ||
186 | }; | ||
187 | |||
188 | /* | ||
189 | * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on) | ||
190 | * e.g. for 4k logical cluster size, 4B if compacted 2B is off; | ||
191 | * (4B) + 2B + (4B) if compacted 2B is on. | ||
192 | */ | ||
193 | #define Z_EROFS_ADVISE_COMPACTED_2B_BIT 0 | ||
194 | |||
195 | #define Z_EROFS_ADVISE_COMPACTED_2B (1 << Z_EROFS_ADVISE_COMPACTED_2B_BIT) | ||
196 | |||
197 | struct z_erofs_map_header { | ||
198 | __le32 h_reserved1; | ||
199 | __le16 h_advise; | ||
200 | /* | ||
201 | * bit 0-3 : algorithm type of head 1 (logical cluster type 01); | ||
202 | * bit 4-7 : algorithm type of head 2 (logical cluster type 11). | ||
203 | */ | ||
204 | __u8 h_algorithmtype; | ||
205 | /* | ||
206 | * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096; | ||
207 | * bit 3-4 : (physical - logical) cluster bits of head 1: | ||
208 | * For example, if logical clustersize = 4096, 1 for 8192. | ||
209 | * bit 5-7 : (physical - logical) cluster bits of head 2. | ||
210 | */ | ||
211 | __u8 h_clusterbits; | ||
212 | }; | ||
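The two packed fields above encode the algorithm types and cluster geometry in small bit ranges, so decoding them is plain masking and shifting. A minimal decode sketch with hypothetical header values, assuming LZ4 for head 1 and a 4KiB logical cluster:

#include <stdio.h>

int main(void)
{
	/* hypothetical on-disk values */
	unsigned char h_algorithmtype = 0x00;	/* bits 0-3: head 1 = LZ4 (0) */
	unsigned char h_clusterbits = 0x00;	/* bits 0-2: lclusterbits - 12 */

	unsigned int alg_head1 = h_algorithmtype & 0x0f;
	unsigned int alg_head2 = h_algorithmtype >> 4;
	unsigned int lclusterbits = 12 + (h_clusterbits & 7);

	printf("head1 alg=%u head2 alg=%u logical cluster=%u bytes\n",
	       alg_head1, alg_head2, 1U << lclusterbits);	/* 0, 0, 4096 */
	return 0;
}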
213 | |||
214 | #define Z_EROFS_VLE_LEGACY_HEADER_PADDING 8 | ||
215 | |||
216 | /* | ||
217 | * Z_EROFS Variable-sized Logical Extent cluster type: | ||
218 | * 0 - literal (uncompressed) cluster | ||
219 | * 1 - compressed cluster (for the head logical cluster) | ||
220 | * 2 - compressed cluster (for the other logical clusters) | ||
221 | * | ||
222 | * In detail, | ||
223 | * 0 - literal (uncompressed) cluster, | ||
224 | * di_advise = 0 | ||
225 | * di_clusterofs = the literal data offset of the cluster | ||
226 | * di_blkaddr = the blkaddr of the literal cluster | ||
227 | * | ||
228 | * 1 - compressed cluster (for the head logical cluster) | ||
229 | * di_advise = 1 | ||
230 | * di_clusterofs = the decompressed data offset of the cluster | ||
231 | * di_blkaddr = the blkaddr of the compressed cluster | ||
232 | * | ||
233 | * 2 - compressed cluster (for the other logical clusters) | ||
234 | * di_advise = 2 | ||
235 | * di_clusterofs = | ||
236 | * the decompressed data offset in its own head cluster | ||
237 | * di_u.delta[0] = distance to its corresponding head cluster | ||
238 | * di_u.delta[1] = distance to its corresponding tail cluster | ||
239 | * (di_advise could be 0, 1 or 2) | ||
240 | */ | ||
241 | enum { | ||
242 | Z_EROFS_VLE_CLUSTER_TYPE_PLAIN, | ||
243 | Z_EROFS_VLE_CLUSTER_TYPE_HEAD, | ||
244 | Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD, | ||
245 | Z_EROFS_VLE_CLUSTER_TYPE_RESERVED, | ||
246 | Z_EROFS_VLE_CLUSTER_TYPE_MAX | ||
247 | }; | ||
248 | |||
249 | #define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS 2 | ||
250 | #define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT 0 | ||
251 | |||
252 | struct z_erofs_vle_decompressed_index { | ||
253 | __le16 di_advise; | ||
254 | /* where to decompress in the head cluster */ | ||
255 | __le16 di_clusterofs; | ||
256 | |||
257 | union { | ||
258 | /* for the head cluster */ | ||
259 | __le32 blkaddr; | ||
260 | /* | ||
261 | * for the other clusters | ||
262 | * e.g. for a 4k page-sized cluster, maximum 4K*64K = 256M | ||
263 | * [0] - pointing to the head cluster | ||
264 | * [1] - pointing to the tail cluster | ||
265 | */ | ||
266 | __le16 delta[2]; | ||
267 | } di_u __packed; /* 8 bytes */ | ||
268 | } __packed; | ||
269 | |||
270 | #define Z_EROFS_VLE_LEGACY_INDEX_ALIGN(size) \ | ||
271 | (round_up(size, sizeof(struct z_erofs_vle_decompressed_index)) + \ | ||
272 | sizeof(struct z_erofs_map_header) + Z_EROFS_VLE_LEGACY_HEADER_PADDING) | ||
273 | |||
274 | /* dirents are sorted in alphabetical order, thus we can do binary search */ | ||
275 | struct erofs_dirent { | ||
276 | __le64 nid; /* 0, node number */ | ||
277 | __le16 nameoff; /* 8, start offset of file name */ | ||
278 | __u8 file_type; /* 10, file type */ | ||
279 | __u8 reserved; /* 11, reserved */ | ||
280 | } __packed; | ||
281 | |||
282 | /* | ||
283 | * EROFS file types should match generic FT_* types and | ||
284 | * there seems no need to add BUILD_BUG_ONs since a potential | ||
285 | * mismatch would break other filesystems as well... | ||
286 | */ | ||
287 | |||
288 | #define EROFS_NAME_LEN 255 | ||
289 | |||
290 | /* check the EROFS on-disk layout strictly at compile time */ | ||
291 | static inline void erofs_check_ondisk_layout_definitions(void) | ||
292 | { | ||
293 | BUILD_BUG_ON(sizeof(struct erofs_super_block) != 128); | ||
294 | BUILD_BUG_ON(sizeof(struct erofs_inode_v1) != 32); | ||
295 | BUILD_BUG_ON(sizeof(struct erofs_inode_v2) != 64); | ||
296 | BUILD_BUG_ON(sizeof(struct erofs_xattr_ibody_header) != 12); | ||
297 | BUILD_BUG_ON(sizeof(struct erofs_xattr_entry) != 4); | ||
298 | BUILD_BUG_ON(sizeof(struct z_erofs_map_header) != 8); | ||
299 | BUILD_BUG_ON(sizeof(struct z_erofs_vle_decompressed_index) != 8); | ||
300 | BUILD_BUG_ON(sizeof(struct erofs_dirent) != 12); | ||
301 | |||
302 | BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) < | ||
303 | Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1); | ||
304 | } | ||
305 | |||
306 | #endif | ||
307 | |||
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c new file mode 100644 index 000000000000..80f4fe919ee7 --- /dev/null +++ b/fs/erofs/inode.c | |||
@@ -0,0 +1,332 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* | ||
3 | * Copyright (C) 2017-2018 HUAWEI, Inc. | ||
4 | * http://www.huawei.com/ | ||
5 | * Created by Gao Xiang <gaoxiang25@huawei.com> | ||
6 | */ | ||
7 | #include "xattr.h" | ||
8 | |||
9 | #include <trace/events/erofs.h> | ||
10 | |||
11 | /* no locking */ | ||
12 | static int read_inode(struct inode *inode, void *data) | ||
13 | { | ||
14 | struct erofs_vnode *vi = EROFS_V(inode); | ||
15 | struct erofs_inode_v1 *v1 = data; | ||
16 | const unsigned int advise = le16_to_cpu(v1->i_advise); | ||
17 | erofs_blk_t nblks = 0; | ||
18 | |||
19 | vi->datamode = __inode_data_mapping(advise); | ||
20 | |||
21 | if (unlikely(vi->datamode >= EROFS_INODE_LAYOUT_MAX)) { | ||
22 | errln("unsupported data mapping %u of nid %llu", | ||
23 | vi->datamode, vi->nid); | ||
24 | DBG_BUGON(1); | ||
25 | return -EOPNOTSUPP; | ||
26 | } | ||
27 | |||
28 | if (__inode_version(advise) == EROFS_INODE_LAYOUT_V2) { | ||
29 | struct erofs_inode_v2 *v2 = data; | ||
30 | |||
31 | vi->inode_isize = sizeof(struct erofs_inode_v2); | ||
32 | vi->xattr_isize = ondisk_xattr_ibody_size(v2->i_xattr_icount); | ||
33 | |||
34 | inode->i_mode = le16_to_cpu(v2->i_mode); | ||
35 | if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || | ||
36 | S_ISLNK(inode->i_mode)) | ||
37 | vi->raw_blkaddr = le32_to_cpu(v2->i_u.raw_blkaddr); | ||
38 | else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) | ||
39 | inode->i_rdev = | ||
40 | new_decode_dev(le32_to_cpu(v2->i_u.rdev)); | ||
41 | else if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) | ||
42 | inode->i_rdev = 0; | ||
43 | else | ||
44 | goto bogusimode; | ||
45 | |||
46 | i_uid_write(inode, le32_to_cpu(v2->i_uid)); | ||
47 | i_gid_write(inode, le32_to_cpu(v2->i_gid)); | ||
48 | set_nlink(inode, le32_to_cpu(v2->i_nlink)); | ||
49 | |||
50 | /* ns timestamp */ | ||
51 | inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = | ||
52 | le64_to_cpu(v2->i_ctime); | ||
53 | inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = | ||
54 | le32_to_cpu(v2->i_ctime_nsec); | ||
55 | |||
56 | inode->i_size = le64_to_cpu(v2->i_size); | ||
57 | |||
58 | /* total blocks for compressed files */ | ||
59 | if (is_inode_layout_compression(inode)) | ||
60 | nblks = le32_to_cpu(v2->i_u.compressed_blocks); | ||
61 | } else if (__inode_version(advise) == EROFS_INODE_LAYOUT_V1) { | ||
62 | struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); | ||
63 | |||
64 | vi->inode_isize = sizeof(struct erofs_inode_v1); | ||
65 | vi->xattr_isize = ondisk_xattr_ibody_size(v1->i_xattr_icount); | ||
66 | |||
67 | inode->i_mode = le16_to_cpu(v1->i_mode); | ||
68 | if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || | ||
69 | S_ISLNK(inode->i_mode)) | ||
70 | vi->raw_blkaddr = le32_to_cpu(v1->i_u.raw_blkaddr); | ||
71 | else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) | ||
72 | inode->i_rdev = | ||
73 | new_decode_dev(le32_to_cpu(v1->i_u.rdev)); | ||
74 | else if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) | ||
75 | inode->i_rdev = 0; | ||
76 | else | ||
77 | goto bogusimode; | ||
78 | |||
79 | i_uid_write(inode, le16_to_cpu(v1->i_uid)); | ||
80 | i_gid_write(inode, le16_to_cpu(v1->i_gid)); | ||
81 | set_nlink(inode, le16_to_cpu(v1->i_nlink)); | ||
82 | |||
83 | /* use build time to derive all file time */ | ||
84 | inode->i_mtime.tv_sec = inode->i_ctime.tv_sec = | ||
85 | sbi->build_time; | ||
86 | inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec = | ||
87 | sbi->build_time_nsec; | ||
88 | |||
89 | inode->i_size = le32_to_cpu(v1->i_size); | ||
90 | if (is_inode_layout_compression(inode)) | ||
91 | nblks = le32_to_cpu(v1->i_u.compressed_blocks); | ||
92 | } else { | ||
93 | errln("unsupported on-disk inode version %u of nid %llu", | ||
94 | __inode_version(advise), vi->nid); | ||
95 | DBG_BUGON(1); | ||
96 | return -EOPNOTSUPP; | ||
97 | } | ||
98 | |||
99 | if (!nblks) | ||
100 | /* measure inode.i_blocks as generic filesystems do */ | ||
101 | inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9; | ||
102 | else | ||
103 | inode->i_blocks = nblks << LOG_SECTORS_PER_BLOCK; | ||
104 | return 0; | ||
105 | |||
106 | bogusimode: | ||
107 | errln("bogus i_mode (%o) @ nid %llu", inode->i_mode, vi->nid); | ||
108 | DBG_BUGON(1); | ||
109 | return -EFSCORRUPTED; | ||
110 | } | ||
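The i_blocks accounting at the end of read_inode() is expressed in 512-byte sectors, as generic filesystems expect: uncompressed inodes round i_size up to the block size, while compressed inodes use the on-disk compressed block count. A quick check of both branches with made-up numbers, assuming 4KiB blocks:

#include <stdio.h>

#define EROFS_BLKSIZ 4096ULL
#define LOG_SECTORS_PER_BLOCK 3		/* 4096 / 512 = 8 = 1 << 3 */

int main(void)
{
	unsigned long long i_size = 10000;	/* hypothetical uncompressed file */
	unsigned long long nblks = 5;		/* hypothetical compressed_blocks */

	/* uncompressed: round the file size up to blocks, count 512B sectors */
	unsigned long long blocks_plain =
		((i_size + EROFS_BLKSIZ - 1) / EROFS_BLKSIZ * EROFS_BLKSIZ) >> 9;

	/* compressed: on-disk blocks converted to 512B sectors */
	unsigned long long blocks_compressed = nblks << LOG_SECTORS_PER_BLOCK;

	printf("plain: %llu sectors, compressed: %llu sectors\n",
	       blocks_plain, blocks_compressed);	/* 24 and 40 */
	return 0;
}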
111 | |||
112 | /* | ||
113 | * try_lock can be required since locking order is: | ||
114 | * file data(fs_inode) | ||
115 | * meta(bd_inode) | ||
116 | * but the majority of the callers are "iget"; | ||
117 | * in that case we are pretty sure there is no deadlock since | ||
118 | * no data operations exist. However, I tend to use | ||
119 | * try_lock since it takes little overhead and | ||
120 | * will succeed immediately. | ||
121 | */ | ||
122 | static int fill_inline_data(struct inode *inode, void *data, | ||
123 | unsigned int m_pofs) | ||
124 | { | ||
125 | struct erofs_vnode *vi = EROFS_V(inode); | ||
126 | struct erofs_sb_info *sbi = EROFS_I_SB(inode); | ||
127 | |||
128 | /* should be inode inline C */ | ||
129 | if (!is_inode_flat_inline(inode)) | ||
130 | return 0; | ||
131 | |||
132 | /* fast symlink (following ext4) */ | ||
133 | if (S_ISLNK(inode->i_mode) && inode->i_size < PAGE_SIZE) { | ||
134 | char *lnk = erofs_kmalloc(sbi, inode->i_size + 1, GFP_KERNEL); | ||
135 | |||
136 | if (unlikely(!lnk)) | ||
137 | return -ENOMEM; | ||
138 | |||
139 | m_pofs += vi->inode_isize + vi->xattr_isize; | ||
140 | |||
141 | /* inline symlink data shouldn't cross the page boundary either */ | ||
142 | if (unlikely(m_pofs + inode->i_size > PAGE_SIZE)) { | ||
143 | kfree(lnk); | ||
144 | errln("inline data cross block boundary @ nid %llu", | ||
145 | vi->nid); | ||
146 | DBG_BUGON(1); | ||
147 | return -EFSCORRUPTED; | ||
148 | } | ||
149 | |||
150 | /* get in-page inline data */ | ||
151 | memcpy(lnk, data + m_pofs, inode->i_size); | ||
152 | lnk[inode->i_size] = '\0'; | ||
153 | |||
154 | inode->i_link = lnk; | ||
155 | set_inode_fast_symlink(inode); | ||
156 | } | ||
157 | return 0; | ||
158 | } | ||
159 | |||
160 | static int fill_inode(struct inode *inode, int isdir) | ||
161 | { | ||
162 | struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); | ||
163 | struct erofs_vnode *vi = EROFS_V(inode); | ||
164 | struct page *page; | ||
165 | void *data; | ||
166 | int err; | ||
167 | erofs_blk_t blkaddr; | ||
168 | unsigned int ofs; | ||
169 | erofs_off_t inode_loc; | ||
170 | |||
171 | trace_erofs_fill_inode(inode, isdir); | ||
172 | inode_loc = iloc(sbi, vi->nid); | ||
173 | blkaddr = erofs_blknr(inode_loc); | ||
174 | ofs = erofs_blkoff(inode_loc); | ||
175 | |||
176 | debugln("%s, reading inode nid %llu at %u of blkaddr %u", | ||
177 | __func__, vi->nid, ofs, blkaddr); | ||
178 | |||
179 | page = erofs_get_meta_page(inode->i_sb, blkaddr, isdir); | ||
180 | |||
181 | if (IS_ERR(page)) { | ||
182 | errln("failed to get inode (nid: %llu) page, err %ld", | ||
183 | vi->nid, PTR_ERR(page)); | ||
184 | return PTR_ERR(page); | ||
185 | } | ||
186 | |||
187 | DBG_BUGON(!PageUptodate(page)); | ||
188 | data = page_address(page); | ||
189 | |||
190 | err = read_inode(inode, data + ofs); | ||
191 | if (!err) { | ||
192 | /* setup the new inode */ | ||
193 | if (S_ISREG(inode->i_mode)) { | ||
194 | inode->i_op = &erofs_generic_iops; | ||
195 | inode->i_fop = &generic_ro_fops; | ||
196 | } else if (S_ISDIR(inode->i_mode)) { | ||
197 | inode->i_op = &erofs_dir_iops; | ||
198 | inode->i_fop = &erofs_dir_fops; | ||
199 | } else if (S_ISLNK(inode->i_mode)) { | ||
200 | /* by default, page_get_link is used for symlink */ | ||
201 | inode->i_op = &erofs_symlink_iops; | ||
202 | inode_nohighmem(inode); | ||
203 | } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || | ||
204 | S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { | ||
205 | inode->i_op = &erofs_generic_iops; | ||
206 | init_special_inode(inode, inode->i_mode, inode->i_rdev); | ||
207 | goto out_unlock; | ||
208 | } else { | ||
209 | err = -EFSCORRUPTED; | ||
210 | goto out_unlock; | ||
211 | } | ||
212 | |||
213 | if (is_inode_layout_compression(inode)) { | ||
214 | err = z_erofs_fill_inode(inode); | ||
215 | goto out_unlock; | ||
216 | } | ||
217 | |||
218 | inode->i_mapping->a_ops = &erofs_raw_access_aops; | ||
219 | |||
220 | /* fill last page if inline data is available */ | ||
221 | err = fill_inline_data(inode, data, ofs); | ||
222 | } | ||
223 | |||
224 | out_unlock: | ||
225 | unlock_page(page); | ||
226 | put_page(page); | ||
227 | return err; | ||
228 | } | ||
229 | |||
230 | /* | ||
231 | * erofs nid is 64 bits, but i_ino is 'unsigned long', therefore | ||
232 | * extra work is needed on 32-bit platforms to find the right inode. | ||
233 | */ | ||
234 | #if BITS_PER_LONG == 32 | ||
235 | static int erofs_ilookup_test_actor(struct inode *inode, void *opaque) | ||
236 | { | ||
237 | const erofs_nid_t nid = *(erofs_nid_t *)opaque; | ||
238 | |||
239 | return EROFS_V(inode)->nid == nid; | ||
240 | } | ||
241 | |||
242 | static int erofs_iget_set_actor(struct inode *inode, void *opaque) | ||
243 | { | ||
244 | const erofs_nid_t nid = *(erofs_nid_t *)opaque; | ||
245 | |||
246 | inode->i_ino = erofs_inode_hash(nid); | ||
247 | return 0; | ||
248 | } | ||
249 | #endif | ||
250 | |||
251 | static inline struct inode *erofs_iget_locked(struct super_block *sb, | ||
252 | erofs_nid_t nid) | ||
253 | { | ||
254 | const unsigned long hashval = erofs_inode_hash(nid); | ||
255 | |||
256 | #if BITS_PER_LONG >= 64 | ||
257 | /* it is safe to use iget_locked for >= 64-bit platform */ | ||
258 | return iget_locked(sb, hashval); | ||
259 | #else | ||
260 | return iget5_locked(sb, hashval, erofs_ilookup_test_actor, | ||
261 | erofs_iget_set_actor, &nid); | ||
262 | #endif | ||
263 | } | ||
264 | |||
265 | struct inode *erofs_iget(struct super_block *sb, | ||
266 | erofs_nid_t nid, | ||
267 | bool isdir) | ||
268 | { | ||
269 | struct inode *inode = erofs_iget_locked(sb, nid); | ||
270 | |||
271 | if (unlikely(!inode)) | ||
272 | return ERR_PTR(-ENOMEM); | ||
273 | |||
274 | if (inode->i_state & I_NEW) { | ||
275 | int err; | ||
276 | struct erofs_vnode *vi = EROFS_V(inode); | ||
277 | |||
278 | vi->nid = nid; | ||
279 | |||
280 | err = fill_inode(inode, isdir); | ||
281 | if (likely(!err)) | ||
282 | unlock_new_inode(inode); | ||
283 | else { | ||
284 | iget_failed(inode); | ||
285 | inode = ERR_PTR(err); | ||
286 | } | ||
287 | } | ||
288 | return inode; | ||
289 | } | ||
290 | |||
291 | int erofs_getattr(const struct path *path, struct kstat *stat, | ||
292 | u32 request_mask, unsigned int query_flags) | ||
293 | { | ||
294 | struct inode *const inode = d_inode(path->dentry); | ||
295 | |||
296 | if (is_inode_layout_compression(inode)) | ||
297 | stat->attributes |= STATX_ATTR_COMPRESSED; | ||
298 | |||
299 | stat->attributes |= STATX_ATTR_IMMUTABLE; | ||
300 | stat->attributes_mask |= (STATX_ATTR_COMPRESSED | | ||
301 | STATX_ATTR_IMMUTABLE); | ||
302 | |||
303 | generic_fillattr(inode, stat); | ||
304 | return 0; | ||
305 | } | ||
306 | |||
307 | const struct inode_operations erofs_generic_iops = { | ||
308 | .getattr = erofs_getattr, | ||
309 | #ifdef CONFIG_EROFS_FS_XATTR | ||
310 | .listxattr = erofs_listxattr, | ||
311 | #endif | ||
312 | .get_acl = erofs_get_acl, | ||
313 | }; | ||
314 | |||
315 | const struct inode_operations erofs_symlink_iops = { | ||
316 | .get_link = page_get_link, | ||
317 | .getattr = erofs_getattr, | ||
318 | #ifdef CONFIG_EROFS_FS_XATTR | ||
319 | .listxattr = erofs_listxattr, | ||
320 | #endif | ||
321 | .get_acl = erofs_get_acl, | ||
322 | }; | ||
323 | |||
324 | const struct inode_operations erofs_fast_symlink_iops = { | ||
325 | .get_link = simple_get_link, | ||
326 | .getattr = erofs_getattr, | ||
327 | #ifdef CONFIG_EROFS_FS_XATTR | ||
328 | .listxattr = erofs_listxattr, | ||
329 | #endif | ||
330 | .get_acl = erofs_get_acl, | ||
331 | }; | ||
332 | |||
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h new file mode 100644 index 000000000000..620b73fcc416 --- /dev/null +++ b/fs/erofs/internal.h | |||
@@ -0,0 +1,553 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0-only */ | ||
2 | /* | ||
3 | * Copyright (C) 2017-2018 HUAWEI, Inc. | ||
4 | * http://www.huawei.com/ | ||
5 | * Created by Gao Xiang <gaoxiang25@huawei.com> | ||
6 | */ | ||
7 | #ifndef __EROFS_INTERNAL_H | ||
8 | #define __EROFS_INTERNAL_H | ||
9 | |||
10 | #include <linux/fs.h> | ||
11 | #include <linux/dcache.h> | ||
12 | #include <linux/mm.h> | ||
13 | #include <linux/pagemap.h> | ||
14 | #include <linux/bio.h> | ||
15 | #include <linux/buffer_head.h> | ||
16 | #include <linux/magic.h> | ||
17 | #include <linux/slab.h> | ||
18 | #include <linux/vmalloc.h> | ||
19 | #include "erofs_fs.h" | ||
20 | |||
21 | /* redefine pr_fmt "erofs: " */ | ||
22 | #undef pr_fmt | ||
23 | #define pr_fmt(fmt) "erofs: " fmt | ||
24 | |||
25 | #define errln(x, ...) pr_err(x "\n", ##__VA_ARGS__) | ||
26 | #define infoln(x, ...) pr_info(x "\n", ##__VA_ARGS__) | ||
27 | #ifdef CONFIG_EROFS_FS_DEBUG | ||
28 | #define debugln(x, ...) pr_debug(x "\n", ##__VA_ARGS__) | ||
29 | #define DBG_BUGON BUG_ON | ||
30 | #else | ||
31 | #define debugln(x, ...) ((void)0) | ||
32 | #define DBG_BUGON(x) ((void)(x)) | ||
33 | #endif /* !CONFIG_EROFS_FS_DEBUG */ | ||
34 | |||
35 | enum { | ||
36 | FAULT_KMALLOC, | ||
37 | FAULT_READ_IO, | ||
38 | FAULT_MAX, | ||
39 | }; | ||
40 | |||
41 | #ifdef CONFIG_EROFS_FAULT_INJECTION | ||
42 | extern const char *erofs_fault_name[FAULT_MAX]; | ||
43 | #define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type))) | ||
44 | |||
45 | struct erofs_fault_info { | ||
46 | atomic_t inject_ops; | ||
47 | unsigned int inject_rate; | ||
48 | unsigned int inject_type; | ||
49 | }; | ||
50 | #endif /* CONFIG_EROFS_FAULT_INJECTION */ | ||
51 | |||
52 | /* use EROFS_SUPER_MAGIC_V1 to represent the whole filesystem */ | ||
53 | #define EROFS_SUPER_MAGIC EROFS_SUPER_MAGIC_V1 | ||
54 | |||
55 | typedef u64 erofs_nid_t; | ||
56 | typedef u64 erofs_off_t; | ||
57 | /* data type for filesystem-wide blocks number */ | ||
58 | typedef u32 erofs_blk_t; | ||
59 | |||
60 | struct erofs_sb_info { | ||
61 | #ifdef CONFIG_EROFS_FS_ZIP | ||
62 | /* list for all registered superblocks, mainly for shrinker */ | ||
63 | struct list_head list; | ||
64 | struct mutex umount_mutex; | ||
65 | |||
66 | /* the dedicated workstation for compression */ | ||
67 | struct radix_tree_root workstn_tree; | ||
68 | |||
69 | /* threshold for synchronous decompression */ | ||
70 | unsigned int max_sync_decompress_pages; | ||
71 | |||
72 | unsigned int shrinker_run_no; | ||
73 | |||
74 | /* current strategy of how to use managed cache */ | ||
75 | unsigned char cache_strategy; | ||
76 | |||
77 | /* pseudo inode to manage cached pages */ | ||
78 | struct inode *managed_cache; | ||
79 | #endif /* CONFIG_EROFS_FS_ZIP */ | ||
80 | u32 blocks; | ||
81 | u32 meta_blkaddr; | ||
82 | #ifdef CONFIG_EROFS_FS_XATTR | ||
83 | u32 xattr_blkaddr; | ||
84 | #endif | ||
85 | |||
86 | /* inode slot unit size in bit shift */ | ||
87 | unsigned char islotbits; | ||
88 | |||
89 | u32 build_time_nsec; | ||
90 | u64 build_time; | ||
91 | |||
92 | /* what we really care about is nid, rather than ino.. */ | ||
93 | erofs_nid_t root_nid; | ||
94 | /* used for statfs, f_files - f_favail */ | ||
95 | u64 inos; | ||
96 | |||
97 | u8 uuid[16]; /* 128-bit uuid for volume */ | ||
98 | u8 volume_name[16]; /* volume name */ | ||
99 | u32 requirements; | ||
100 | |||
101 | unsigned int mount_opt; | ||
102 | |||
103 | #ifdef CONFIG_EROFS_FAULT_INJECTION | ||
104 | struct erofs_fault_info fault_info; /* For fault injection */ | ||
105 | #endif | ||
106 | }; | ||
107 | |||
108 | #ifdef CONFIG_EROFS_FAULT_INJECTION | ||
109 | #define erofs_show_injection_info(type) \ | ||
110 | infoln("inject %s in %s of %pS", erofs_fault_name[type], \ | ||
111 | __func__, __builtin_return_address(0)) | ||
112 | |||
113 | static inline bool time_to_inject(struct erofs_sb_info *sbi, int type) | ||
114 | { | ||
115 | struct erofs_fault_info *ffi = &sbi->fault_info; | ||
116 | |||
117 | if (!ffi->inject_rate) | ||
118 | return false; | ||
119 | |||
120 | if (!IS_FAULT_SET(ffi, type)) | ||
121 | return false; | ||
122 | |||
123 | atomic_inc(&ffi->inject_ops); | ||
124 | if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { | ||
125 | atomic_set(&ffi->inject_ops, 0); | ||
126 | return true; | ||
127 | } | ||
128 | return false; | ||
129 | } | ||
130 | #else | ||
131 | static inline bool time_to_inject(struct erofs_sb_info *sbi, int type) | ||
132 | { | ||
133 | return false; | ||
134 | } | ||
135 | |||
136 | static inline void erofs_show_injection_info(int type) | ||
137 | { | ||
138 | } | ||
139 | #endif /* !CONFIG_EROFS_FAULT_INJECTION */ | ||
140 | |||
141 | static inline void *erofs_kmalloc(struct erofs_sb_info *sbi, | ||
142 | size_t size, gfp_t flags) | ||
143 | { | ||
144 | if (time_to_inject(sbi, FAULT_KMALLOC)) { | ||
145 | erofs_show_injection_info(FAULT_KMALLOC); | ||
146 | return NULL; | ||
147 | } | ||
148 | return kmalloc(size, flags); | ||
149 | } | ||
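A minimal, single-threaded userspace model of the rate-based injection used by erofs_kmalloc() above (illustrative only: the names, the rate value of 3, and the plain counter standing in for the atomic one are all assumptions of this sketch, and the per-type IS_FAULT_SET check is omitted). Every inject_rate-th allocation attempt is forced to fail, which is how an artificial -ENOMEM gets produced:

    #include <stdio.h>
    #include <stdlib.h>

    static unsigned int inject_ops, inject_rate = 3;       /* hypothetical rate */

    static int time_to_inject_demo(void)
    {
            if (!inject_rate)
                    return 0;
            if (++inject_ops >= inject_rate) {
                    inject_ops = 0;         /* reset and inject this time */
                    return 1;
            }
            return 0;
    }

    static void *demo_kmalloc(size_t size)
    {
            /* NULL plays the role of the injected allocation failure */
            return time_to_inject_demo() ? NULL : malloc(size);
    }

    int main(void)
    {
            for (int i = 0; i < 6; i++) {
                    void *p = demo_kmalloc(16);

                    printf("alloc %d -> %s\n", i, p ? "ok" : "injected failure");
                    free(p);
            }
            return 0;
    }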
150 | |||
151 | #define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info) | ||
152 | #define EROFS_I_SB(inode) ((struct erofs_sb_info *)(inode)->i_sb->s_fs_info) | ||
153 | |||
154 | /* Mount flags set via mount options or defaults */ | ||
155 | #define EROFS_MOUNT_XATTR_USER 0x00000010 | ||
156 | #define EROFS_MOUNT_POSIX_ACL 0x00000020 | ||
157 | #define EROFS_MOUNT_FAULT_INJECTION 0x00000040 | ||
158 | |||
159 | #define clear_opt(sbi, option) ((sbi)->mount_opt &= ~EROFS_MOUNT_##option) | ||
160 | #define set_opt(sbi, option) ((sbi)->mount_opt |= EROFS_MOUNT_##option) | ||
161 | #define test_opt(sbi, option) ((sbi)->mount_opt & EROFS_MOUNT_##option) | ||
162 | |||
163 | #ifdef CONFIG_EROFS_FS_ZIP | ||
164 | enum { | ||
165 | EROFS_ZIP_CACHE_DISABLED, | ||
166 | EROFS_ZIP_CACHE_READAHEAD, | ||
167 | EROFS_ZIP_CACHE_READAROUND | ||
168 | }; | ||
169 | |||
170 | #define EROFS_LOCKED_MAGIC (INT_MIN | 0xE0F510CCL) | ||
171 | |||
172 | /* basic unit of the workstation of a super_block */ | ||
173 | struct erofs_workgroup { | ||
174 | /* the workgroup index in the workstation */ | ||
175 | pgoff_t index; | ||
176 | |||
177 | /* overall workgroup reference count */ | ||
178 | atomic_t refcount; | ||
179 | }; | ||
180 | |||
181 | #if defined(CONFIG_SMP) | ||
182 | static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp, | ||
183 | int val) | ||
184 | { | ||
185 | preempt_disable(); | ||
186 | if (val != atomic_cmpxchg(&grp->refcount, val, EROFS_LOCKED_MAGIC)) { | ||
187 | preempt_enable(); | ||
188 | return false; | ||
189 | } | ||
190 | return true; | ||
191 | } | ||
192 | |||
193 | static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp, | ||
194 | int orig_val) | ||
195 | { | ||
196 | /* | ||
197 | * other observers should notice all modifications | ||
198 | * in the freezing period. | ||
199 | */ | ||
200 | smp_mb(); | ||
201 | atomic_set(&grp->refcount, orig_val); | ||
202 | preempt_enable(); | ||
203 | } | ||
204 | |||
205 | static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) | ||
206 | { | ||
207 | return atomic_cond_read_relaxed(&grp->refcount, | ||
208 | VAL != EROFS_LOCKED_MAGIC); | ||
209 | } | ||
210 | #else | ||
211 | static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp, | ||
212 | int val) | ||
213 | { | ||
214 | preempt_disable(); | ||
215 | /* no need to spin on UP platforms, let's just disable preemption. */ | ||
216 | if (val != atomic_read(&grp->refcount)) { | ||
217 | preempt_enable(); | ||
218 | return false; | ||
219 | } | ||
220 | return true; | ||
221 | } | ||
222 | |||
223 | static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp, | ||
224 | int orig_val) | ||
225 | { | ||
226 | preempt_enable(); | ||
227 | } | ||
228 | |||
229 | static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) | ||
230 | { | ||
231 | int v = atomic_read(&grp->refcount); | ||
232 | |||
233 | /* a workgroup is never frozen on uniprocessor systems */ | ||
234 | DBG_BUGON(v == EROFS_LOCKED_MAGIC); | ||
235 | return v; | ||
236 | } | ||
237 | #endif /* !CONFIG_SMP */ | ||
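The SMP freeze helpers above implement a tiny lock on top of the reference count: the count is atomically swapped to the EROFS_LOCKED_MAGIC sentinel only if it still holds the expected value, and waiters spin until the sentinel disappears. A stripped-down userspace model of that idea (the sentinel value, the function names and the lack of preemption control are all artifacts of this sketch):

    #include <stdatomic.h>
    #include <stdio.h>

    #define LOCKED_MAGIC (-12345)   /* stand-in for EROFS_LOCKED_MAGIC */

    static atomic_int refcount = 1;

    static int try_freeze(int expected)
    {
            /* succeed only if refcount still equals 'expected' */
            return atomic_compare_exchange_strong(&refcount, &expected,
                                                  LOCKED_MAGIC);
    }

    static void unfreeze(int orig)
    {
            /* publish modifications, then drop the sentinel */
            atomic_store(&refcount, orig);
    }

    int main(void)
    {
            if (try_freeze(1)) {
                    /* exclusive section: safe to tear the object down */
                    unfreeze(1);
                    puts("froze and unfroze the refcount");
            }
            return 0;
    }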
238 | |||
239 | /* hard limit of pages per compressed cluster */ | ||
240 | #define Z_EROFS_CLUSTER_MAX_PAGES (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT) | ||
241 | #define EROFS_PCPUBUF_NR_PAGES Z_EROFS_CLUSTER_MAX_PAGES | ||
242 | #else | ||
243 | #define EROFS_PCPUBUF_NR_PAGES 0 | ||
244 | #endif /* !CONFIG_EROFS_FS_ZIP */ | ||
245 | |||
246 | /* we strictly follow PAGE_SIZE and do not use buffer heads yet */ | ||
247 | #define LOG_BLOCK_SIZE PAGE_SHIFT | ||
248 | |||
249 | #undef LOG_SECTORS_PER_BLOCK | ||
250 | #define LOG_SECTORS_PER_BLOCK (PAGE_SHIFT - 9) | ||
251 | |||
252 | #undef SECTORS_PER_BLOCK | ||
253 | #define SECTORS_PER_BLOCK (1 << LOG_SECTORS_PER_BLOCK) | ||
254 | |||
255 | #define EROFS_BLKSIZ (1 << LOG_BLOCK_SIZE) | ||
256 | |||
257 | #if (EROFS_BLKSIZ % 4096 || !EROFS_BLKSIZ) | ||
258 | #error erofs cannot be used in this platform | ||
259 | #endif | ||
260 | |||
261 | #define EROFS_IO_MAX_RETRIES_NOFAIL 5 | ||
262 | |||
263 | #define ROOT_NID(sb) ((sb)->root_nid) | ||
264 | |||
265 | #define erofs_blknr(addr) ((addr) / EROFS_BLKSIZ) | ||
266 | #define erofs_blkoff(addr) ((addr) % EROFS_BLKSIZ) | ||
267 | #define blknr_to_addr(nr) ((erofs_off_t)(nr) * EROFS_BLKSIZ) | ||
268 | |||
269 | static inline erofs_off_t iloc(struct erofs_sb_info *sbi, erofs_nid_t nid) | ||
270 | { | ||
271 | return blknr_to_addr(sbi->meta_blkaddr) + (nid << sbi->islotbits); | ||
272 | } | ||
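As a quick illustration of the arithmetic in iloc() together with the erofs_blknr()/erofs_blkoff() macros (all values below are hypothetical; 4KiB blocks and the 32-byte inode slot implied by islotbits == 5 are assumed):

    #include <stdio.h>
    #include <stdint.h>

    #define BLKSIZ 4096u

    int main(void)
    {
            uint64_t meta_blkaddr = 2;      /* hypothetical meta_blkaddr */
            unsigned int islotbits = 5;     /* 32-byte inode slots */
            uint64_t nid = 37;              /* hypothetical nid */
            uint64_t off = meta_blkaddr * BLKSIZ + (nid << islotbits);

            /* prints "nid 37 -> blk 2, blkoff 1184" */
            printf("nid %llu -> blk %llu, blkoff %llu\n",
                   (unsigned long long)nid,
                   (unsigned long long)(off / BLKSIZ),
                   (unsigned long long)(off % BLKSIZ));
            return 0;
    }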
273 | |||
274 | /* atomic flag definitions */ | ||
275 | #define EROFS_V_EA_INITED_BIT 0 | ||
276 | #define EROFS_V_Z_INITED_BIT 1 | ||
277 | |||
278 | /* bitlock definitions (arranged in reverse order) */ | ||
279 | #define EROFS_V_BL_XATTR_BIT (BITS_PER_LONG - 1) | ||
280 | #define EROFS_V_BL_Z_BIT (BITS_PER_LONG - 2) | ||
281 | |||
282 | struct erofs_vnode { | ||
283 | erofs_nid_t nid; | ||
284 | |||
285 | /* atomic flags (including bitlocks) */ | ||
286 | unsigned long flags; | ||
287 | |||
288 | unsigned char datamode; | ||
289 | unsigned char inode_isize; | ||
290 | unsigned short xattr_isize; | ||
291 | |||
292 | unsigned int xattr_shared_count; | ||
293 | unsigned int *xattr_shared_xattrs; | ||
294 | |||
295 | union { | ||
296 | erofs_blk_t raw_blkaddr; | ||
297 | #ifdef CONFIG_EROFS_FS_ZIP | ||
298 | struct { | ||
299 | unsigned short z_advise; | ||
300 | unsigned char z_algorithmtype[2]; | ||
301 | unsigned char z_logical_clusterbits; | ||
302 | unsigned char z_physical_clusterbits[2]; | ||
303 | }; | ||
304 | #endif /* CONFIG_EROFS_FS_ZIP */ | ||
305 | }; | ||
306 | /* the corresponding vfs inode */ | ||
307 | struct inode vfs_inode; | ||
308 | }; | ||
309 | |||
310 | #define EROFS_V(ptr) \ | ||
311 | container_of(ptr, struct erofs_vnode, vfs_inode) | ||
312 | |||
313 | #define __inode_advise(x, bit, bits) \ | ||
314 | (((x) >> (bit)) & ((1 << (bits)) - 1)) | ||
315 | |||
316 | #define __inode_version(advise) \ | ||
317 | __inode_advise(advise, EROFS_I_VERSION_BIT, \ | ||
318 | EROFS_I_VERSION_BITS) | ||
319 | |||
320 | #define __inode_data_mapping(advise) \ | ||
321 | __inode_advise(advise, EROFS_I_DATA_MAPPING_BIT,\ | ||
322 | EROFS_I_DATA_MAPPING_BITS) | ||
323 | |||
324 | static inline unsigned long inode_datablocks(struct inode *inode) | ||
325 | { | ||
326 | /* since i_size cannot be changed */ | ||
327 | return DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ); | ||
328 | } | ||
329 | |||
330 | static inline bool is_inode_layout_compression(struct inode *inode) | ||
331 | { | ||
332 | return erofs_inode_is_data_compressed(EROFS_V(inode)->datamode); | ||
333 | } | ||
334 | |||
335 | static inline bool is_inode_flat_inline(struct inode *inode) | ||
336 | { | ||
337 | return EROFS_V(inode)->datamode == EROFS_INODE_FLAT_INLINE; | ||
338 | } | ||
339 | |||
340 | extern const struct super_operations erofs_sops; | ||
341 | |||
342 | extern const struct address_space_operations erofs_raw_access_aops; | ||
343 | #ifdef CONFIG_EROFS_FS_ZIP | ||
344 | extern const struct address_space_operations z_erofs_vle_normalaccess_aops; | ||
345 | #endif | ||
346 | |||
347 | /* | ||
348 | * Logical to physical block mapping, used by erofs_map_blocks() | ||
349 | * | ||
350 | * Different from other file systems, it is used in 2 access modes: | ||
351 | * | ||
352 | * 1) RAW access mode: | ||
353 | * | ||
354 | * Users pass a valid (m_lblk, m_lofs -- usually 0) pair, | ||
355 | * and get the valid m_pblk, m_pofs and the longest m_len(in bytes). | ||
356 | * | ||
357 | * Note that m_lblk in the RAW access mode refers to the number of | ||
358 | * the compressed ondisk block rather than the uncompressed | ||
359 | * in-memory block for the compressed file. | ||
360 | * | ||
361 | * m_pofs equals m_lofs except for the inline data page. | ||
362 | * | ||
363 | * 2) Normal access mode: | ||
364 | * | ||
365 | * If the inode is not compressed, it has no difference with | ||
366 | * the RAW access mode. However, if the inode is compressed, | ||
367 | * users should pass a valid (m_lblk, m_lofs) pair, and get | ||
368 | * the needed m_pblk, m_pofs, m_len to get the compressed data | ||
369 | * and the updated m_lblk, m_lofs which indicates the start | ||
370 | * of the corresponding uncompressed data in the file. | ||
371 | */ | ||
372 | enum { | ||
373 | BH_Zipped = BH_PrivateStart, | ||
374 | BH_FullMapped, | ||
375 | }; | ||
376 | |||
377 | /* Has a disk mapping */ | ||
378 | #define EROFS_MAP_MAPPED (1 << BH_Mapped) | ||
379 | /* Located in metadata (could be copied from bd_inode) */ | ||
380 | #define EROFS_MAP_META (1 << BH_Meta) | ||
381 | /* The extent has been compressed */ | ||
382 | #define EROFS_MAP_ZIPPED (1 << BH_Zipped) | ||
383 | /* The length of extent is full */ | ||
384 | #define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped) | ||
385 | |||
386 | struct erofs_map_blocks { | ||
387 | erofs_off_t m_pa, m_la; | ||
388 | u64 m_plen, m_llen; | ||
389 | |||
390 | unsigned int m_flags; | ||
391 | |||
392 | struct page *mpage; | ||
393 | }; | ||
394 | |||
395 | /* Flags used by erofs_map_blocks() */ | ||
396 | #define EROFS_GET_BLOCKS_RAW 0x0001 | ||
397 | |||
398 | /* zmap.c */ | ||
399 | #ifdef CONFIG_EROFS_FS_ZIP | ||
400 | int z_erofs_fill_inode(struct inode *inode); | ||
401 | int z_erofs_map_blocks_iter(struct inode *inode, | ||
402 | struct erofs_map_blocks *map, | ||
403 | int flags); | ||
404 | #else | ||
405 | static inline int z_erofs_fill_inode(struct inode *inode) { return -EOPNOTSUPP; } | ||
406 | static inline int z_erofs_map_blocks_iter(struct inode *inode, | ||
407 | struct erofs_map_blocks *map, | ||
408 | int flags) | ||
409 | { | ||
410 | return -EOPNOTSUPP; | ||
411 | } | ||
412 | #endif /* !CONFIG_EROFS_FS_ZIP */ | ||
413 | |||
414 | /* data.c */ | ||
415 | static inline struct bio *erofs_grab_bio(struct super_block *sb, | ||
416 | erofs_blk_t blkaddr, | ||
417 | unsigned int nr_pages, | ||
418 | void *bi_private, bio_end_io_t endio, | ||
419 | bool nofail) | ||
420 | { | ||
421 | const gfp_t gfp = GFP_NOIO; | ||
422 | struct bio *bio; | ||
423 | |||
424 | do { | ||
425 | if (nr_pages == 1) { | ||
426 | bio = bio_alloc(gfp | (nofail ? __GFP_NOFAIL : 0), 1); | ||
427 | if (unlikely(!bio)) { | ||
428 | DBG_BUGON(nofail); | ||
429 | return ERR_PTR(-ENOMEM); | ||
430 | } | ||
431 | break; | ||
432 | } | ||
433 | bio = bio_alloc(gfp, nr_pages); | ||
434 | nr_pages /= 2; | ||
435 | } while (unlikely(!bio)); | ||
436 | |||
437 | bio->bi_end_io = endio; | ||
438 | bio_set_dev(bio, sb->s_bdev); | ||
439 | bio->bi_iter.bi_sector = (sector_t)blkaddr << LOG_SECTORS_PER_BLOCK; | ||
440 | bio->bi_private = bi_private; | ||
441 | return bio; | ||
442 | } | ||
443 | |||
444 | static inline void __submit_bio(struct bio *bio, unsigned int op, | ||
445 | unsigned int op_flags) | ||
446 | { | ||
447 | bio_set_op_attrs(bio, op, op_flags); | ||
448 | submit_bio(bio); | ||
449 | } | ||
450 | |||
451 | struct page *__erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr, | ||
452 | bool prio, bool nofail); | ||
453 | |||
454 | static inline struct page *erofs_get_meta_page(struct super_block *sb, | ||
455 | erofs_blk_t blkaddr, bool prio) | ||
456 | { | ||
457 | return __erofs_get_meta_page(sb, blkaddr, prio, false); | ||
458 | } | ||
459 | |||
460 | int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int); | ||
461 | |||
462 | static inline struct page *erofs_get_inline_page(struct inode *inode, | ||
463 | erofs_blk_t blkaddr) | ||
464 | { | ||
465 | return erofs_get_meta_page(inode->i_sb, blkaddr, | ||
466 | S_ISDIR(inode->i_mode)); | ||
467 | } | ||
468 | |||
469 | /* inode.c */ | ||
470 | static inline unsigned long erofs_inode_hash(erofs_nid_t nid) | ||
471 | { | ||
472 | #if BITS_PER_LONG == 32 | ||
473 | return (nid >> 32) ^ (nid & 0xffffffff); | ||
474 | #else | ||
475 | return nid; | ||
476 | #endif | ||
477 | } | ||
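A tiny userspace check of the 32-bit folding above, using a made-up nid value:

    #include <stdio.h>
    #include <stdint.h>

    int main(void)
    {
            uint64_t nid = 0x123456789aULL;         /* hypothetical nid */
            uint32_t hash = (uint32_t)(nid >> 32) ^ (uint32_t)nid;

            printf("0x%08x\n", hash);               /* 0x12 ^ 0x3456789a = 0x34567888 */
            return 0;
    }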
478 | |||
479 | extern const struct inode_operations erofs_generic_iops; | ||
480 | extern const struct inode_operations erofs_symlink_iops; | ||
481 | extern const struct inode_operations erofs_fast_symlink_iops; | ||
482 | |||
483 | static inline void set_inode_fast_symlink(struct inode *inode) | ||
484 | { | ||
485 | inode->i_op = &erofs_fast_symlink_iops; | ||
486 | } | ||
487 | |||
488 | static inline bool is_inode_fast_symlink(struct inode *inode) | ||
489 | { | ||
490 | return inode->i_op == &erofs_fast_symlink_iops; | ||
491 | } | ||
492 | |||
493 | struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid, bool dir); | ||
494 | int erofs_getattr(const struct path *path, struct kstat *stat, | ||
495 | u32 request_mask, unsigned int query_flags); | ||
496 | |||
497 | /* namei.c */ | ||
498 | extern const struct inode_operations erofs_dir_iops; | ||
499 | |||
500 | int erofs_namei(struct inode *dir, struct qstr *name, | ||
501 | erofs_nid_t *nid, unsigned int *d_type); | ||
502 | |||
503 | /* dir.c */ | ||
504 | extern const struct file_operations erofs_dir_fops; | ||
505 | |||
506 | /* utils.c / zdata.c */ | ||
507 | struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp, bool nofail); | ||
508 | |||
509 | #if (EROFS_PCPUBUF_NR_PAGES > 0) | ||
510 | void *erofs_get_pcpubuf(unsigned int pagenr); | ||
511 | #define erofs_put_pcpubuf(buf) do { \ | ||
512 | (void)&(buf); \ | ||
513 | preempt_enable(); \ | ||
514 | } while (0) | ||
515 | #else | ||
516 | static inline void *erofs_get_pcpubuf(unsigned int pagenr) | ||
517 | { | ||
518 | return ERR_PTR(-EOPNOTSUPP); | ||
519 | } | ||
520 | |||
521 | #define erofs_put_pcpubuf(buf) do {} while (0) | ||
522 | #endif | ||
523 | |||
524 | #ifdef CONFIG_EROFS_FS_ZIP | ||
525 | int erofs_workgroup_put(struct erofs_workgroup *grp); | ||
526 | struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, | ||
527 | pgoff_t index, bool *tag); | ||
528 | int erofs_register_workgroup(struct super_block *sb, | ||
529 | struct erofs_workgroup *grp, bool tag); | ||
530 | void erofs_workgroup_free_rcu(struct erofs_workgroup *grp); | ||
531 | void erofs_shrinker_register(struct super_block *sb); | ||
532 | void erofs_shrinker_unregister(struct super_block *sb); | ||
533 | int __init erofs_init_shrinker(void); | ||
534 | void erofs_exit_shrinker(void); | ||
535 | int __init z_erofs_init_zip_subsystem(void); | ||
536 | void z_erofs_exit_zip_subsystem(void); | ||
537 | int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, | ||
538 | struct erofs_workgroup *egrp); | ||
539 | int erofs_try_to_free_cached_page(struct address_space *mapping, | ||
540 | struct page *page); | ||
541 | #else | ||
542 | static inline void erofs_shrinker_register(struct super_block *sb) {} | ||
543 | static inline void erofs_shrinker_unregister(struct super_block *sb) {} | ||
544 | static inline int erofs_init_shrinker(void) { return 0; } | ||
545 | static inline void erofs_exit_shrinker(void) {} | ||
546 | static inline int z_erofs_init_zip_subsystem(void) { return 0; } | ||
547 | static inline void z_erofs_exit_zip_subsystem(void) {} | ||
548 | #endif /* !CONFIG_EROFS_FS_ZIP */ | ||
549 | |||
550 | #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ | ||
551 | |||
552 | #endif /* __EROFS_INTERNAL_H */ | ||
553 | |||
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c new file mode 100644 index 000000000000..8832b5d95d91 --- /dev/null +++ b/fs/erofs/namei.c | |||
@@ -0,0 +1,251 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* | ||
3 | * Copyright (C) 2017-2018 HUAWEI, Inc. | ||
4 | * http://www.huawei.com/ | ||
5 | * Created by Gao Xiang <gaoxiang25@huawei.com> | ||
6 | */ | ||
7 | #include "xattr.h" | ||
8 | |||
9 | #include <trace/events/erofs.h> | ||
10 | |||
11 | struct erofs_qstr { | ||
12 | const unsigned char *name; | ||
13 | const unsigned char *end; | ||
14 | }; | ||
15 | |||
16 | /* relies on qn->end being accurate and on qn having a trailing '\0' */ | ||
17 | static inline int dirnamecmp(const struct erofs_qstr *qn, | ||
18 | const struct erofs_qstr *qd, | ||
19 | unsigned int *matched) | ||
20 | { | ||
21 | unsigned int i = *matched; | ||
22 | |||
23 | /* | ||
24 | * on-disk error: only BUG_ON in debugging mode; | ||
25 | * otherwise, return 1 to simply skip the invalid name | ||
26 | * and continue (for the sake of lookup performance). | ||
27 | */ | ||
28 | DBG_BUGON(qd->name > qd->end); | ||
29 | |||
30 | /* qd may not have a trailing '\0' */ | ||
31 | /* however, accessing bytes below qd->end is always safe */ | ||
32 | while (qd->name + i < qd->end && qd->name[i] != '\0') { | ||
33 | if (qn->name[i] != qd->name[i]) { | ||
34 | *matched = i; | ||
35 | return qn->name[i] > qd->name[i] ? 1 : -1; | ||
36 | } | ||
37 | ++i; | ||
38 | } | ||
39 | *matched = i; | ||
40 | /* See comments in __d_alloc on the terminating NUL character */ | ||
41 | return qn->name[i] == '\0' ? 0 : 1; | ||
42 | } | ||
43 | |||
44 | #define nameoff_from_disk(off, sz) (le16_to_cpu(off) & ((sz) - 1)) | ||
45 | |||
46 | static struct erofs_dirent *find_target_dirent(struct erofs_qstr *name, | ||
47 | u8 *data, | ||
48 | unsigned int dirblksize, | ||
49 | const int ndirents) | ||
50 | { | ||
51 | int head, back; | ||
52 | unsigned int startprfx, endprfx; | ||
53 | struct erofs_dirent *const de = (struct erofs_dirent *)data; | ||
54 | |||
55 | /* since the 1st dirent has been evaluated previously */ | ||
56 | head = 1; | ||
57 | back = ndirents - 1; | ||
58 | startprfx = endprfx = 0; | ||
59 | |||
60 | while (head <= back) { | ||
61 | const int mid = head + (back - head) / 2; | ||
62 | const int nameoff = nameoff_from_disk(de[mid].nameoff, | ||
63 | dirblksize); | ||
64 | unsigned int matched = min(startprfx, endprfx); | ||
65 | struct erofs_qstr dname = { | ||
66 | .name = data + nameoff, | ||
67 | .end = unlikely(mid >= ndirents - 1) ? | ||
68 | data + dirblksize : | ||
69 | data + nameoff_from_disk(de[mid + 1].nameoff, | ||
70 | dirblksize) | ||
71 | }; | ||
72 | |||
73 | /* string comparison without already matched prefix */ | ||
74 | int ret = dirnamecmp(name, &dname, &matched); | ||
75 | |||
76 | if (unlikely(!ret)) { | ||
77 | return de + mid; | ||
78 | } else if (ret > 0) { | ||
79 | head = mid + 1; | ||
80 | startprfx = matched; | ||
81 | } else { | ||
82 | back = mid - 1; | ||
83 | endprfx = matched; | ||
84 | } | ||
85 | } | ||
86 | |||
87 | return ERR_PTR(-ENOENT); | ||
88 | } | ||
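find_target_dirent() above (and find_target_block_classic() below) uses a classic trick: because the entries are sorted, every candidate between the current bounds shares at least min(startprfx, endprfx) leading bytes with the target, so each comparison can resume from that offset instead of byte 0. A standalone, simplified userspace model of the idea (plain NUL-terminated strings instead of the on-disk dirent layout; all names here are made up):

    #include <stdio.h>

    static int cmp_from(const char *q, const char *d, unsigned int *matched)
    {
            unsigned int i = *matched;

            while (d[i] && q[i] == d[i])
                    i++;
            *matched = i;
            if (q[i] == d[i])
                    return 0;
            return (unsigned char)q[i] > (unsigned char)d[i] ? 1 : -1;
    }

    int main(void)
    {
            const char *names[] = { "bin", "boot", "data", "dev", "etc", "lib" };
            const char *target = "dev";
            int head = 0, back = 5;
            unsigned int startprfx = 0, endprfx = 0;

            while (head <= back) {
                    int mid = head + (back - head) / 2;
                    unsigned int matched = startprfx < endprfx ? startprfx : endprfx;
                    int ret = cmp_from(target, names[mid], &matched);

                    if (!ret) {
                            printf("found %s at index %d\n", target, mid);
                            return 0;
                    }
                    if (ret > 0) {
                            head = mid + 1;
                            startprfx = matched;
                    } else {
                            back = mid - 1;
                            endprfx = matched;
                    }
            }
            puts("not found");
            return 0;
    }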
89 | |||
90 | static struct page *find_target_block_classic(struct inode *dir, | ||
91 | struct erofs_qstr *name, | ||
92 | int *_ndirents) | ||
93 | { | ||
94 | unsigned int startprfx, endprfx; | ||
95 | int head, back; | ||
96 | struct address_space *const mapping = dir->i_mapping; | ||
97 | struct page *candidate = ERR_PTR(-ENOENT); | ||
98 | |||
99 | startprfx = endprfx = 0; | ||
100 | head = 0; | ||
101 | back = inode_datablocks(dir) - 1; | ||
102 | |||
103 | while (head <= back) { | ||
104 | const int mid = head + (back - head) / 2; | ||
105 | struct page *page = read_mapping_page(mapping, mid, NULL); | ||
106 | |||
107 | if (!IS_ERR(page)) { | ||
108 | struct erofs_dirent *de = kmap_atomic(page); | ||
109 | const int nameoff = nameoff_from_disk(de->nameoff, | ||
110 | EROFS_BLKSIZ); | ||
111 | const int ndirents = nameoff / sizeof(*de); | ||
112 | int diff; | ||
113 | unsigned int matched; | ||
114 | struct erofs_qstr dname; | ||
115 | |||
116 | if (unlikely(!ndirents)) { | ||
117 | kunmap_atomic(de); | ||
118 | put_page(page); | ||
119 | errln("corrupted dir block %d @ nid %llu", | ||
120 | mid, EROFS_V(dir)->nid); | ||
121 | DBG_BUGON(1); | ||
122 | page = ERR_PTR(-EFSCORRUPTED); | ||
123 | goto out; | ||
124 | } | ||
125 | |||
126 | matched = min(startprfx, endprfx); | ||
127 | |||
128 | dname.name = (u8 *)de + nameoff; | ||
129 | if (ndirents == 1) | ||
130 | dname.end = (u8 *)de + EROFS_BLKSIZ; | ||
131 | else | ||
132 | dname.end = (u8 *)de + | ||
133 | nameoff_from_disk(de[1].nameoff, | ||
134 | EROFS_BLKSIZ); | ||
135 | |||
136 | /* string comparison without already matched prefix */ | ||
137 | diff = dirnamecmp(name, &dname, &matched); | ||
138 | kunmap_atomic(de); | ||
139 | |||
140 | if (unlikely(!diff)) { | ||
141 | *_ndirents = 0; | ||
142 | goto out; | ||
143 | } else if (diff > 0) { | ||
144 | head = mid + 1; | ||
145 | startprfx = matched; | ||
146 | |||
147 | if (!IS_ERR(candidate)) | ||
148 | put_page(candidate); | ||
149 | candidate = page; | ||
150 | *_ndirents = ndirents; | ||
151 | } else { | ||
152 | put_page(page); | ||
153 | |||
154 | back = mid - 1; | ||
155 | endprfx = matched; | ||
156 | } | ||
157 | continue; | ||
158 | } | ||
159 | out: /* release the candidate if it is valid */ | ||
160 | if (!IS_ERR(candidate)) | ||
161 | put_page(candidate); | ||
162 | return page; | ||
163 | } | ||
164 | return candidate; | ||
165 | } | ||
166 | |||
167 | int erofs_namei(struct inode *dir, | ||
168 | struct qstr *name, | ||
169 | erofs_nid_t *nid, unsigned int *d_type) | ||
170 | { | ||
171 | int ndirents; | ||
172 | struct page *page; | ||
173 | void *data; | ||
174 | struct erofs_dirent *de; | ||
175 | struct erofs_qstr qn; | ||
176 | |||
177 | if (unlikely(!dir->i_size)) | ||
178 | return -ENOENT; | ||
179 | |||
180 | qn.name = name->name; | ||
181 | qn.end = name->name + name->len; | ||
182 | |||
183 | ndirents = 0; | ||
184 | page = find_target_block_classic(dir, &qn, &ndirents); | ||
185 | |||
186 | if (IS_ERR(page)) | ||
187 | return PTR_ERR(page); | ||
188 | |||
189 | data = kmap_atomic(page); | ||
190 | /* the target page has been mapped */ | ||
191 | if (ndirents) | ||
192 | de = find_target_dirent(&qn, data, EROFS_BLKSIZ, ndirents); | ||
193 | else | ||
194 | de = (struct erofs_dirent *)data; | ||
195 | |||
196 | if (!IS_ERR(de)) { | ||
197 | *nid = le64_to_cpu(de->nid); | ||
198 | *d_type = de->file_type; | ||
199 | } | ||
200 | |||
201 | kunmap_atomic(data); | ||
202 | put_page(page); | ||
203 | |||
204 | return PTR_ERR_OR_ZERO(de); | ||
205 | } | ||
206 | |||
207 | /* NOTE: i_mutex is already held by vfs */ | ||
208 | static struct dentry *erofs_lookup(struct inode *dir, | ||
209 | struct dentry *dentry, | ||
210 | unsigned int flags) | ||
211 | { | ||
212 | int err; | ||
213 | erofs_nid_t nid; | ||
214 | unsigned int d_type; | ||
215 | struct inode *inode; | ||
216 | |||
217 | DBG_BUGON(!d_really_is_negative(dentry)); | ||
218 | /* dentry must be unhashed in lookup, so there is no need to worry about it */ | ||
219 | DBG_BUGON(!d_unhashed(dentry)); | ||
220 | |||
221 | trace_erofs_lookup(dir, dentry, flags); | ||
222 | |||
223 | /* file name exceeds fs limit */ | ||
224 | if (unlikely(dentry->d_name.len > EROFS_NAME_LEN)) | ||
225 | return ERR_PTR(-ENAMETOOLONG); | ||
226 | |||
227 | /* false uninitialized warnings on gcc 4.8.x */ | ||
228 | err = erofs_namei(dir, &dentry->d_name, &nid, &d_type); | ||
229 | |||
230 | if (err == -ENOENT) { | ||
231 | /* negative dentry */ | ||
232 | inode = NULL; | ||
233 | } else if (unlikely(err)) { | ||
234 | inode = ERR_PTR(err); | ||
235 | } else { | ||
236 | debugln("%s, %s (nid %llu) found, d_type %u", __func__, | ||
237 | dentry->d_name.name, nid, d_type); | ||
238 | inode = erofs_iget(dir->i_sb, nid, d_type == FT_DIR); | ||
239 | } | ||
240 | return d_splice_alias(inode, dentry); | ||
241 | } | ||
242 | |||
243 | const struct inode_operations erofs_dir_iops = { | ||
244 | .lookup = erofs_lookup, | ||
245 | .getattr = erofs_getattr, | ||
246 | #ifdef CONFIG_EROFS_FS_XATTR | ||
247 | .listxattr = erofs_listxattr, | ||
248 | #endif | ||
249 | .get_acl = erofs_get_acl, | ||
250 | }; | ||
251 | |||
diff --git a/fs/erofs/super.c b/fs/erofs/super.c new file mode 100644 index 000000000000..6d3a9bcb8daa --- /dev/null +++ b/fs/erofs/super.c | |||
@@ -0,0 +1,669 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* | ||
3 | * Copyright (C) 2017-2018 HUAWEI, Inc. | ||
4 | * http://www.huawei.com/ | ||
5 | * Created by Gao Xiang <gaoxiang25@huawei.com> | ||
6 | */ | ||
7 | #include <linux/module.h> | ||
8 | #include <linux/buffer_head.h> | ||
9 | #include <linux/statfs.h> | ||
10 | #include <linux/parser.h> | ||
11 | #include <linux/seq_file.h> | ||
12 | #include "xattr.h" | ||
13 | |||
14 | #define CREATE_TRACE_POINTS | ||
15 | #include <trace/events/erofs.h> | ||
16 | |||
17 | static struct kmem_cache *erofs_inode_cachep __read_mostly; | ||
18 | |||
19 | static void init_once(void *ptr) | ||
20 | { | ||
21 | struct erofs_vnode *vi = ptr; | ||
22 | |||
23 | inode_init_once(&vi->vfs_inode); | ||
24 | } | ||
25 | |||
26 | static int __init erofs_init_inode_cache(void) | ||
27 | { | ||
28 | erofs_inode_cachep = kmem_cache_create("erofs_inode", | ||
29 | sizeof(struct erofs_vnode), 0, | ||
30 | SLAB_RECLAIM_ACCOUNT, | ||
31 | init_once); | ||
32 | |||
33 | return erofs_inode_cachep ? 0 : -ENOMEM; | ||
34 | } | ||
35 | |||
36 | static void erofs_exit_inode_cache(void) | ||
37 | { | ||
38 | kmem_cache_destroy(erofs_inode_cachep); | ||
39 | } | ||
40 | |||
41 | static struct inode *alloc_inode(struct super_block *sb) | ||
42 | { | ||
43 | struct erofs_vnode *vi = | ||
44 | kmem_cache_alloc(erofs_inode_cachep, GFP_KERNEL); | ||
45 | |||
46 | if (!vi) | ||
47 | return NULL; | ||
48 | |||
49 | /* zero out everything except vfs_inode */ | ||
50 | memset(vi, 0, offsetof(struct erofs_vnode, vfs_inode)); | ||
51 | return &vi->vfs_inode; | ||
52 | } | ||
53 | |||
54 | static void free_inode(struct inode *inode) | ||
55 | { | ||
56 | struct erofs_vnode *vi = EROFS_V(inode); | ||
57 | |||
58 | /* be careful of the RCU symlink path (see ext4_inode_info->i_data)! */ | ||
59 | if (is_inode_fast_symlink(inode)) | ||
60 | kfree(inode->i_link); | ||
61 | |||
62 | kfree(vi->xattr_shared_xattrs); | ||
63 | |||
64 | kmem_cache_free(erofs_inode_cachep, vi); | ||
65 | } | ||
66 | |||
67 | static bool check_layout_compatibility(struct super_block *sb, | ||
68 | struct erofs_super_block *layout) | ||
69 | { | ||
70 | const unsigned int requirements = le32_to_cpu(layout->requirements); | ||
71 | |||
72 | EROFS_SB(sb)->requirements = requirements; | ||
73 | |||
74 | /* check if current kernel meets all mandatory requirements */ | ||
75 | if (requirements & (~EROFS_ALL_REQUIREMENTS)) { | ||
76 | errln("unidentified requirements %x, please upgrade kernel version", | ||
77 | requirements & ~EROFS_ALL_REQUIREMENTS); | ||
78 | return false; | ||
79 | } | ||
80 | return true; | ||
81 | } | ||
82 | |||
83 | static int superblock_read(struct super_block *sb) | ||
84 | { | ||
85 | struct erofs_sb_info *sbi; | ||
86 | struct buffer_head *bh; | ||
87 | struct erofs_super_block *layout; | ||
88 | unsigned int blkszbits; | ||
89 | int ret; | ||
90 | |||
91 | bh = sb_bread(sb, 0); | ||
92 | |||
93 | if (!bh) { | ||
94 | errln("cannot read erofs superblock"); | ||
95 | return -EIO; | ||
96 | } | ||
97 | |||
98 | sbi = EROFS_SB(sb); | ||
99 | layout = (struct erofs_super_block *)((u8 *)bh->b_data | ||
100 | + EROFS_SUPER_OFFSET); | ||
101 | |||
102 | ret = -EINVAL; | ||
103 | if (le32_to_cpu(layout->magic) != EROFS_SUPER_MAGIC_V1) { | ||
104 | errln("cannot find valid erofs superblock"); | ||
105 | goto out; | ||
106 | } | ||
107 | |||
108 | blkszbits = layout->blkszbits; | ||
109 | /* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */ | ||
110 | if (unlikely(blkszbits != LOG_BLOCK_SIZE)) { | ||
111 | errln("blksize %u isn't supported on this platform", | ||
112 | 1 << blkszbits); | ||
113 | goto out; | ||
114 | } | ||
115 | |||
116 | if (!check_layout_compatibility(sb, layout)) | ||
117 | goto out; | ||
118 | |||
119 | sbi->blocks = le32_to_cpu(layout->blocks); | ||
120 | sbi->meta_blkaddr = le32_to_cpu(layout->meta_blkaddr); | ||
121 | #ifdef CONFIG_EROFS_FS_XATTR | ||
122 | sbi->xattr_blkaddr = le32_to_cpu(layout->xattr_blkaddr); | ||
123 | #endif | ||
124 | sbi->islotbits = ffs(sizeof(struct erofs_inode_v1)) - 1; | ||
125 | sbi->root_nid = le16_to_cpu(layout->root_nid); | ||
126 | sbi->inos = le64_to_cpu(layout->inos); | ||
127 | |||
128 | sbi->build_time = le64_to_cpu(layout->build_time); | ||
129 | sbi->build_time_nsec = le32_to_cpu(layout->build_time_nsec); | ||
130 | |||
131 | memcpy(&sb->s_uuid, layout->uuid, sizeof(layout->uuid)); | ||
132 | |||
133 | ret = strscpy(sbi->volume_name, layout->volume_name, | ||
134 | sizeof(layout->volume_name)); | ||
135 | if (ret < 0) { /* -E2BIG */ | ||
136 | errln("bad volume name without NIL terminator"); | ||
137 | ret = -EFSCORRUPTED; | ||
138 | goto out; | ||
139 | } | ||
140 | ret = 0; | ||
141 | out: | ||
142 | brelse(bh); | ||
143 | return ret; | ||
144 | } | ||
145 | |||
146 | #ifdef CONFIG_EROFS_FAULT_INJECTION | ||
147 | const char *erofs_fault_name[FAULT_MAX] = { | ||
148 | [FAULT_KMALLOC] = "kmalloc", | ||
149 | [FAULT_READ_IO] = "read IO error", | ||
150 | }; | ||
151 | |||
152 | static void __erofs_build_fault_attr(struct erofs_sb_info *sbi, | ||
153 | unsigned int rate) | ||
154 | { | ||
155 | struct erofs_fault_info *ffi = &sbi->fault_info; | ||
156 | |||
157 | if (rate) { | ||
158 | atomic_set(&ffi->inject_ops, 0); | ||
159 | ffi->inject_rate = rate; | ||
160 | ffi->inject_type = (1 << FAULT_MAX) - 1; | ||
161 | } else { | ||
162 | memset(ffi, 0, sizeof(struct erofs_fault_info)); | ||
163 | } | ||
164 | |||
165 | set_opt(sbi, FAULT_INJECTION); | ||
166 | } | ||
167 | |||
168 | static int erofs_build_fault_attr(struct erofs_sb_info *sbi, | ||
169 | substring_t *args) | ||
170 | { | ||
171 | int rate = 0; | ||
172 | |||
173 | if (args->from && match_int(args, &rate)) | ||
174 | return -EINVAL; | ||
175 | |||
176 | __erofs_build_fault_attr(sbi, rate); | ||
177 | return 0; | ||
178 | } | ||
179 | |||
180 | static unsigned int erofs_get_fault_rate(struct erofs_sb_info *sbi) | ||
181 | { | ||
182 | return sbi->fault_info.inject_rate; | ||
183 | } | ||
184 | #else | ||
185 | static void __erofs_build_fault_attr(struct erofs_sb_info *sbi, | ||
186 | unsigned int rate) | ||
187 | { | ||
188 | } | ||
189 | |||
190 | static int erofs_build_fault_attr(struct erofs_sb_info *sbi, | ||
191 | substring_t *args) | ||
192 | { | ||
193 | infoln("fault_injection options not supported"); | ||
194 | return 0; | ||
195 | } | ||
196 | |||
197 | static unsigned int erofs_get_fault_rate(struct erofs_sb_info *sbi) | ||
198 | { | ||
199 | return 0; | ||
200 | } | ||
201 | #endif | ||
202 | |||
203 | #ifdef CONFIG_EROFS_FS_ZIP | ||
204 | static int erofs_build_cache_strategy(struct erofs_sb_info *sbi, | ||
205 | substring_t *args) | ||
206 | { | ||
207 | const char *cs = match_strdup(args); | ||
208 | int err = 0; | ||
209 | |||
210 | if (!cs) { | ||
211 | errln("Not enough memory to store cache strategy"); | ||
212 | return -ENOMEM; | ||
213 | } | ||
214 | |||
215 | if (!strcmp(cs, "disabled")) { | ||
216 | sbi->cache_strategy = EROFS_ZIP_CACHE_DISABLED; | ||
217 | } else if (!strcmp(cs, "readahead")) { | ||
218 | sbi->cache_strategy = EROFS_ZIP_CACHE_READAHEAD; | ||
219 | } else if (!strcmp(cs, "readaround")) { | ||
220 | sbi->cache_strategy = EROFS_ZIP_CACHE_READAROUND; | ||
221 | } else { | ||
222 | errln("Unrecognized cache strategy \"%s\"", cs); | ||
223 | err = -EINVAL; | ||
224 | } | ||
225 | kfree(cs); | ||
226 | return err; | ||
227 | } | ||
228 | #else | ||
229 | static int erofs_build_cache_strategy(struct erofs_sb_info *sbi, | ||
230 | substring_t *args) | ||
231 | { | ||
232 | infoln("EROFS compression is disabled, so cache strategy is ignored"); | ||
233 | return 0; | ||
234 | } | ||
235 | #endif | ||
236 | |||
237 | /* set up default EROFS parameters */ | ||
238 | static void default_options(struct erofs_sb_info *sbi) | ||
239 | { | ||
240 | #ifdef CONFIG_EROFS_FS_ZIP | ||
241 | sbi->cache_strategy = EROFS_ZIP_CACHE_READAROUND; | ||
242 | sbi->max_sync_decompress_pages = 3; | ||
243 | #endif | ||
244 | #ifdef CONFIG_EROFS_FS_XATTR | ||
245 | set_opt(sbi, XATTR_USER); | ||
246 | #endif | ||
247 | #ifdef CONFIG_EROFS_FS_POSIX_ACL | ||
248 | set_opt(sbi, POSIX_ACL); | ||
249 | #endif | ||
250 | } | ||
251 | |||
252 | enum { | ||
253 | Opt_user_xattr, | ||
254 | Opt_nouser_xattr, | ||
255 | Opt_acl, | ||
256 | Opt_noacl, | ||
257 | Opt_fault_injection, | ||
258 | Opt_cache_strategy, | ||
259 | Opt_err | ||
260 | }; | ||
261 | |||
262 | static match_table_t erofs_tokens = { | ||
263 | {Opt_user_xattr, "user_xattr"}, | ||
264 | {Opt_nouser_xattr, "nouser_xattr"}, | ||
265 | {Opt_acl, "acl"}, | ||
266 | {Opt_noacl, "noacl"}, | ||
267 | {Opt_fault_injection, "fault_injection=%u"}, | ||
268 | {Opt_cache_strategy, "cache_strategy=%s"}, | ||
269 | {Opt_err, NULL} | ||
270 | }; | ||
271 | |||
272 | static int parse_options(struct super_block *sb, char *options) | ||
273 | { | ||
274 | substring_t args[MAX_OPT_ARGS]; | ||
275 | char *p; | ||
276 | int err; | ||
277 | |||
278 | if (!options) | ||
279 | return 0; | ||
280 | |||
281 | while ((p = strsep(&options, ","))) { | ||
282 | int token; | ||
283 | |||
284 | if (!*p) | ||
285 | continue; | ||
286 | |||
287 | args[0].to = args[0].from = NULL; | ||
288 | token = match_token(p, erofs_tokens, args); | ||
289 | |||
290 | switch (token) { | ||
291 | #ifdef CONFIG_EROFS_FS_XATTR | ||
292 | case Opt_user_xattr: | ||
293 | set_opt(EROFS_SB(sb), XATTR_USER); | ||
294 | break; | ||
295 | case Opt_nouser_xattr: | ||
296 | clear_opt(EROFS_SB(sb), XATTR_USER); | ||
297 | break; | ||
298 | #else | ||
299 | case Opt_user_xattr: | ||
300 | infoln("user_xattr options not supported"); | ||
301 | break; | ||
302 | case Opt_nouser_xattr: | ||
303 | infoln("nouser_xattr options not supported"); | ||
304 | break; | ||
305 | #endif | ||
306 | #ifdef CONFIG_EROFS_FS_POSIX_ACL | ||
307 | case Opt_acl: | ||
308 | set_opt(EROFS_SB(sb), POSIX_ACL); | ||
309 | break; | ||
310 | case Opt_noacl: | ||
311 | clear_opt(EROFS_SB(sb), POSIX_ACL); | ||
312 | break; | ||
313 | #else | ||
314 | case Opt_acl: | ||
315 | infoln("acl options not supported"); | ||
316 | break; | ||
317 | case Opt_noacl: | ||
318 | infoln("noacl options not supported"); | ||
319 | break; | ||
320 | #endif | ||
321 | case Opt_fault_injection: | ||
322 | err = erofs_build_fault_attr(EROFS_SB(sb), args); | ||
323 | if (err) | ||
324 | return err; | ||
325 | break; | ||
326 | case Opt_cache_strategy: | ||
327 | err = erofs_build_cache_strategy(EROFS_SB(sb), args); | ||
328 | if (err) | ||
329 | return err; | ||
330 | break; | ||
331 | default: | ||
332 | errln("Unrecognized mount option \"%s\" or missing value", p); | ||
333 | return -EINVAL; | ||
334 | } | ||
335 | } | ||
336 | return 0; | ||
337 | } | ||
338 | |||
339 | #ifdef CONFIG_EROFS_FS_ZIP | ||
340 | static const struct address_space_operations managed_cache_aops; | ||
341 | |||
342 | static int managed_cache_releasepage(struct page *page, gfp_t gfp_mask) | ||
343 | { | ||
344 | int ret = 1; /* 0 - busy */ | ||
345 | struct address_space *const mapping = page->mapping; | ||
346 | |||
347 | DBG_BUGON(!PageLocked(page)); | ||
348 | DBG_BUGON(mapping->a_ops != &managed_cache_aops); | ||
349 | |||
350 | if (PagePrivate(page)) | ||
351 | ret = erofs_try_to_free_cached_page(mapping, page); | ||
352 | |||
353 | return ret; | ||
354 | } | ||
355 | |||
356 | static void managed_cache_invalidatepage(struct page *page, | ||
357 | unsigned int offset, | ||
358 | unsigned int length) | ||
359 | { | ||
360 | const unsigned int stop = length + offset; | ||
361 | |||
362 | DBG_BUGON(!PageLocked(page)); | ||
363 | |||
364 | /* Check for potential overflow in debug mode */ | ||
365 | DBG_BUGON(stop > PAGE_SIZE || stop < length); | ||
366 | |||
367 | if (offset == 0 && stop == PAGE_SIZE) | ||
368 | while (!managed_cache_releasepage(page, GFP_NOFS)) | ||
369 | cond_resched(); | ||
370 | } | ||
371 | |||
372 | static const struct address_space_operations managed_cache_aops = { | ||
373 | .releasepage = managed_cache_releasepage, | ||
374 | .invalidatepage = managed_cache_invalidatepage, | ||
375 | }; | ||
376 | |||
377 | static int erofs_init_managed_cache(struct super_block *sb) | ||
378 | { | ||
379 | struct erofs_sb_info *const sbi = EROFS_SB(sb); | ||
380 | struct inode *const inode = new_inode(sb); | ||
381 | |||
382 | if (unlikely(!inode)) | ||
383 | return -ENOMEM; | ||
384 | |||
385 | set_nlink(inode, 1); | ||
386 | inode->i_size = OFFSET_MAX; | ||
387 | |||
388 | inode->i_mapping->a_ops = &managed_cache_aops; | ||
389 | mapping_set_gfp_mask(inode->i_mapping, | ||
390 | GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE); | ||
391 | sbi->managed_cache = inode; | ||
392 | return 0; | ||
393 | } | ||
394 | #else | ||
395 | static int erofs_init_managed_cache(struct super_block *sb) { return 0; } | ||
396 | #endif | ||
397 | |||
398 | static int erofs_fill_super(struct super_block *sb, void *data, int silent) | ||
399 | { | ||
400 | struct inode *inode; | ||
401 | struct erofs_sb_info *sbi; | ||
402 | int err; | ||
403 | |||
404 | infoln("fill_super, device -> %s", sb->s_id); | ||
405 | infoln("options -> %s", (char *)data); | ||
406 | |||
407 | sb->s_magic = EROFS_SUPER_MAGIC; | ||
408 | |||
409 | if (unlikely(!sb_set_blocksize(sb, EROFS_BLKSIZ))) { | ||
410 | errln("failed to set erofs blksize"); | ||
411 | return -EINVAL; | ||
412 | } | ||
413 | |||
414 | sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); | ||
415 | if (unlikely(!sbi)) | ||
416 | return -ENOMEM; | ||
417 | |||
418 | sb->s_fs_info = sbi; | ||
419 | err = superblock_read(sb); | ||
420 | if (err) | ||
421 | return err; | ||
422 | |||
423 | sb->s_flags |= SB_RDONLY | SB_NOATIME; | ||
424 | sb->s_maxbytes = MAX_LFS_FILESIZE; | ||
425 | sb->s_time_gran = 1; | ||
426 | |||
427 | sb->s_op = &erofs_sops; | ||
428 | |||
429 | #ifdef CONFIG_EROFS_FS_XATTR | ||
430 | sb->s_xattr = erofs_xattr_handlers; | ||
431 | #endif | ||
432 | /* set erofs default mount options */ | ||
433 | default_options(sbi); | ||
434 | |||
435 | err = parse_options(sb, data); | ||
436 | if (unlikely(err)) | ||
437 | return err; | ||
438 | |||
439 | if (!silent) | ||
440 | infoln("root inode @ nid %llu", ROOT_NID(sbi)); | ||
441 | |||
442 | if (test_opt(sbi, POSIX_ACL)) | ||
443 | sb->s_flags |= SB_POSIXACL; | ||
444 | else | ||
445 | sb->s_flags &= ~SB_POSIXACL; | ||
446 | |||
447 | #ifdef CONFIG_EROFS_FS_ZIP | ||
448 | INIT_RADIX_TREE(&sbi->workstn_tree, GFP_ATOMIC); | ||
449 | #endif | ||
450 | |||
451 | /* get the root inode */ | ||
452 | inode = erofs_iget(sb, ROOT_NID(sbi), true); | ||
453 | if (IS_ERR(inode)) | ||
454 | return PTR_ERR(inode); | ||
455 | |||
456 | if (unlikely(!S_ISDIR(inode->i_mode))) { | ||
457 | errln("rootino(nid %llu) is not a directory(i_mode %o)", | ||
458 | ROOT_NID(sbi), inode->i_mode); | ||
459 | iput(inode); | ||
460 | return -EINVAL; | ||
461 | } | ||
462 | |||
463 | sb->s_root = d_make_root(inode); | ||
464 | if (unlikely(!sb->s_root)) | ||
465 | return -ENOMEM; | ||
466 | |||
467 | erofs_shrinker_register(sb); | ||
468 | /* sb->s_umount is already locked, SB_ACTIVE and SB_BORN are not set */ | ||
469 | err = erofs_init_managed_cache(sb); | ||
470 | if (unlikely(err)) | ||
471 | return err; | ||
472 | |||
473 | if (!silent) | ||
474 | infoln("mounted on %s with opts: %s.", sb->s_id, (char *)data); | ||
475 | return 0; | ||
476 | } | ||
477 | |||
478 | static struct dentry *erofs_mount(struct file_system_type *fs_type, int flags, | ||
479 | const char *dev_name, void *data) | ||
480 | { | ||
481 | return mount_bdev(fs_type, flags, dev_name, data, erofs_fill_super); | ||
482 | } | ||
483 | |||
484 | /* | ||
485 | * could be triggered after deactivate_locked_super() | ||
486 | * is called, so this covers both umount and a failed initialization. | ||
487 | */ | ||
488 | static void erofs_kill_sb(struct super_block *sb) | ||
489 | { | ||
490 | struct erofs_sb_info *sbi; | ||
491 | |||
492 | WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC); | ||
493 | infoln("unmounting for %s", sb->s_id); | ||
494 | |||
495 | kill_block_super(sb); | ||
496 | |||
497 | sbi = EROFS_SB(sb); | ||
498 | if (!sbi) | ||
499 | return; | ||
500 | kfree(sbi); | ||
501 | sb->s_fs_info = NULL; | ||
502 | } | ||
503 | |||
504 | /* called when ->s_root is non-NULL */ | ||
505 | static void erofs_put_super(struct super_block *sb) | ||
506 | { | ||
507 | struct erofs_sb_info *const sbi = EROFS_SB(sb); | ||
508 | |||
509 | DBG_BUGON(!sbi); | ||
510 | |||
511 | erofs_shrinker_unregister(sb); | ||
512 | #ifdef CONFIG_EROFS_FS_ZIP | ||
513 | iput(sbi->managed_cache); | ||
514 | sbi->managed_cache = NULL; | ||
515 | #endif | ||
516 | } | ||
517 | |||
518 | static struct file_system_type erofs_fs_type = { | ||
519 | .owner = THIS_MODULE, | ||
520 | .name = "erofs", | ||
521 | .mount = erofs_mount, | ||
522 | .kill_sb = erofs_kill_sb, | ||
523 | .fs_flags = FS_REQUIRES_DEV, | ||
524 | }; | ||
525 | MODULE_ALIAS_FS("erofs"); | ||
526 | |||
527 | static int __init erofs_module_init(void) | ||
528 | { | ||
529 | int err; | ||
530 | |||
531 | erofs_check_ondisk_layout_definitions(); | ||
532 | infoln("initializing erofs " EROFS_VERSION); | ||
533 | |||
534 | err = erofs_init_inode_cache(); | ||
535 | if (err) | ||
536 | goto icache_err; | ||
537 | |||
538 | err = erofs_init_shrinker(); | ||
539 | if (err) | ||
540 | goto shrinker_err; | ||
541 | |||
542 | err = z_erofs_init_zip_subsystem(); | ||
543 | if (err) | ||
544 | goto zip_err; | ||
545 | |||
546 | err = register_filesystem(&erofs_fs_type); | ||
547 | if (err) | ||
548 | goto fs_err; | ||
549 | |||
550 | infoln("successfully to initialize erofs"); | ||
551 | return 0; | ||
552 | |||
553 | fs_err: | ||
554 | z_erofs_exit_zip_subsystem(); | ||
555 | zip_err: | ||
556 | erofs_exit_shrinker(); | ||
557 | shrinker_err: | ||
558 | erofs_exit_inode_cache(); | ||
559 | icache_err: | ||
560 | return err; | ||
561 | } | ||
562 | |||
563 | static void __exit erofs_module_exit(void) | ||
564 | { | ||
565 | unregister_filesystem(&erofs_fs_type); | ||
566 | z_erofs_exit_zip_subsystem(); | ||
567 | erofs_exit_shrinker(); | ||
568 | erofs_exit_inode_cache(); | ||
569 | infoln("successfully finalize erofs"); | ||
570 | } | ||
571 | |||
572 | /* get filesystem statistics */ | ||
573 | static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf) | ||
574 | { | ||
575 | struct super_block *sb = dentry->d_sb; | ||
576 | struct erofs_sb_info *sbi = EROFS_SB(sb); | ||
577 | u64 id = huge_encode_dev(sb->s_bdev->bd_dev); | ||
578 | |||
579 | buf->f_type = sb->s_magic; | ||
580 | buf->f_bsize = EROFS_BLKSIZ; | ||
581 | buf->f_blocks = sbi->blocks; | ||
582 | buf->f_bfree = buf->f_bavail = 0; | ||
583 | |||
584 | buf->f_files = ULLONG_MAX; | ||
585 | buf->f_ffree = ULLONG_MAX - sbi->inos; | ||
586 | |||
587 | buf->f_namelen = EROFS_NAME_LEN; | ||
588 | |||
589 | buf->f_fsid.val[0] = (u32)id; | ||
590 | buf->f_fsid.val[1] = (u32)(id >> 32); | ||
591 | return 0; | ||
592 | } | ||
593 | |||
594 | static int erofs_show_options(struct seq_file *seq, struct dentry *root) | ||
595 | { | ||
596 | struct erofs_sb_info *sbi __maybe_unused = EROFS_SB(root->d_sb); | ||
597 | |||
598 | #ifdef CONFIG_EROFS_FS_XATTR | ||
599 | if (test_opt(sbi, XATTR_USER)) | ||
600 | seq_puts(seq, ",user_xattr"); | ||
601 | else | ||
602 | seq_puts(seq, ",nouser_xattr"); | ||
603 | #endif | ||
604 | #ifdef CONFIG_EROFS_FS_POSIX_ACL | ||
605 | if (test_opt(sbi, POSIX_ACL)) | ||
606 | seq_puts(seq, ",acl"); | ||
607 | else | ||
608 | seq_puts(seq, ",noacl"); | ||
609 | #endif | ||
610 | if (test_opt(sbi, FAULT_INJECTION)) | ||
611 | seq_printf(seq, ",fault_injection=%u", | ||
612 | erofs_get_fault_rate(sbi)); | ||
613 | #ifdef CONFIG_EROFS_FS_ZIP | ||
614 | if (sbi->cache_strategy == EROFS_ZIP_CACHE_DISABLED) { | ||
615 | seq_puts(seq, ",cache_strategy=disabled"); | ||
616 | } else if (sbi->cache_strategy == EROFS_ZIP_CACHE_READAHEAD) { | ||
617 | seq_puts(seq, ",cache_strategy=readahead"); | ||
618 | } else if (sbi->cache_strategy == EROFS_ZIP_CACHE_READAROUND) { | ||
619 | seq_puts(seq, ",cache_strategy=readaround"); | ||
620 | } else { | ||
621 | seq_puts(seq, ",cache_strategy=(unknown)"); | ||
622 | DBG_BUGON(1); | ||
623 | } | ||
624 | #endif | ||
625 | return 0; | ||
626 | } | ||
627 | |||
628 | static int erofs_remount(struct super_block *sb, int *flags, char *data) | ||
629 | { | ||
630 | struct erofs_sb_info *sbi = EROFS_SB(sb); | ||
631 | unsigned int org_mnt_opt = sbi->mount_opt; | ||
632 | unsigned int org_inject_rate = erofs_get_fault_rate(sbi); | ||
633 | int err; | ||
634 | |||
635 | DBG_BUGON(!sb_rdonly(sb)); | ||
636 | err = parse_options(sb, data); | ||
637 | if (err) | ||
638 | goto out; | ||
639 | |||
640 | if (test_opt(sbi, POSIX_ACL)) | ||
641 | sb->s_flags |= SB_POSIXACL; | ||
642 | else | ||
643 | sb->s_flags &= ~SB_POSIXACL; | ||
644 | |||
645 | *flags |= SB_RDONLY; | ||
646 | return 0; | ||
647 | out: | ||
648 | __erofs_build_fault_attr(sbi, org_inject_rate); | ||
649 | sbi->mount_opt = org_mnt_opt; | ||
650 | |||
651 | return err; | ||
652 | } | ||
653 | |||
654 | const struct super_operations erofs_sops = { | ||
655 | .put_super = erofs_put_super, | ||
656 | .alloc_inode = alloc_inode, | ||
657 | .free_inode = free_inode, | ||
658 | .statfs = erofs_statfs, | ||
659 | .show_options = erofs_show_options, | ||
660 | .remount_fs = erofs_remount, | ||
661 | }; | ||
662 | |||
663 | module_init(erofs_module_init); | ||
664 | module_exit(erofs_module_exit); | ||
665 | |||
666 | MODULE_DESCRIPTION("Enhanced ROM File System"); | ||
667 | MODULE_AUTHOR("Gao Xiang, Chao Yu, Miao Xie, CONSUMER BG, HUAWEI Inc."); | ||
668 | MODULE_LICENSE("GPL"); | ||
669 | |||
diff --git a/fs/erofs/tagptr.h b/fs/erofs/tagptr.h new file mode 100644 index 000000000000..a72897c86744 --- /dev/null +++ b/fs/erofs/tagptr.h | |||
@@ -0,0 +1,110 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0-only */ | ||
2 | /* | ||
3 | * A tagged pointer implementation | ||
4 | * | ||
5 | * Copyright (C) 2018 Gao Xiang <gaoxiang25@huawei.com> | ||
6 | */ | ||
7 | #ifndef __EROFS_FS_TAGPTR_H | ||
8 | #define __EROFS_FS_TAGPTR_H | ||
9 | |||
10 | #include <linux/types.h> | ||
11 | #include <linux/build_bug.h> | ||
12 | |||
13 | /* | ||
14 | * the names of the tagged pointer types are tagptr{1, 2, 3...}_t; | ||
15 | * avoid directly using the internal structs __tagptr{1, 2, 3...} | ||
16 | */ | ||
17 | #define __MAKE_TAGPTR(n) \ | ||
18 | typedef struct __tagptr##n { \ | ||
19 | uintptr_t v; \ | ||
20 | } tagptr##n##_t; | ||
21 | |||
22 | __MAKE_TAGPTR(1) | ||
23 | __MAKE_TAGPTR(2) | ||
24 | __MAKE_TAGPTR(3) | ||
25 | __MAKE_TAGPTR(4) | ||
26 | |||
27 | #undef __MAKE_TAGPTR | ||
28 | |||
29 | extern void __compiletime_error("bad tagptr tags") | ||
30 | __bad_tagptr_tags(void); | ||
31 | |||
32 | extern void __compiletime_error("bad tagptr type") | ||
33 | __bad_tagptr_type(void); | ||
34 | |||
35 | /* fix the broken usage of "#define tagptr2_t tagptr3_t" by users */ | ||
36 | #define __tagptr_mask_1(ptr, n) \ | ||
37 | __builtin_types_compatible_p(typeof(ptr), struct __tagptr##n) ? \ | ||
38 | (1UL << (n)) - 1 : | ||
39 | |||
40 | #define __tagptr_mask(ptr) (\ | ||
41 | __tagptr_mask_1(ptr, 1) ( \ | ||
42 | __tagptr_mask_1(ptr, 2) ( \ | ||
43 | __tagptr_mask_1(ptr, 3) ( \ | ||
44 | __tagptr_mask_1(ptr, 4) ( \ | ||
45 | __bad_tagptr_type(), 0))))) | ||
46 | |||
47 | /* generate a tagged pointer from a raw value */ | ||
48 | #define tagptr_init(type, val) \ | ||
49 | ((typeof(type)){ .v = (uintptr_t)(val) }) | ||
50 | |||
51 | /* | ||
52 | * directly cast a tagged pointer to the native pointer type, which | ||
53 | * could be used for backward compatibility of existing code. | ||
54 | */ | ||
55 | #define tagptr_cast_ptr(tptr) ((void *)(tptr).v) | ||
56 | |||
57 | /* encode tagged pointers */ | ||
58 | #define tagptr_fold(type, ptr, _tags) ({ \ | ||
59 | const typeof(_tags) tags = (_tags); \ | ||
60 | if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(type))) \ | ||
61 | __bad_tagptr_tags(); \ | ||
62 | tagptr_init(type, (uintptr_t)(ptr) | tags); }) | ||
63 | |||
64 | /* decode tagged pointers */ | ||
65 | #define tagptr_unfold_ptr(tptr) \ | ||
66 | ((void *)((tptr).v & ~__tagptr_mask(tptr))) | ||
67 | |||
68 | #define tagptr_unfold_tags(tptr) \ | ||
69 | ((tptr).v & __tagptr_mask(tptr)) | ||
70 | |||
71 | /* operations for the tagged pointer */ | ||
72 | #define tagptr_eq(_tptr1, _tptr2) ({ \ | ||
73 | typeof(_tptr1) tptr1 = (_tptr1); \ | ||
74 | typeof(_tptr2) tptr2 = (_tptr2); \ | ||
75 | (void)(&tptr1 == &tptr2); \ | ||
76 | (tptr1).v == (tptr2).v; }) | ||
77 | |||
78 | /* lock-free CAS operation */ | ||
79 | #define tagptr_cmpxchg(_ptptr, _o, _n) ({ \ | ||
80 | typeof(_ptptr) ptptr = (_ptptr); \ | ||
81 | typeof(_o) o = (_o); \ | ||
82 | typeof(_n) n = (_n); \ | ||
83 | (void)(&o == &n); \ | ||
84 | (void)(&o == ptptr); \ | ||
85 | tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); }) | ||
86 | |||
87 | /* wrap WRITE_ONCE if atomic update is needed */ | ||
88 | #define tagptr_replace_tags(_ptptr, tags) ({ \ | ||
89 | typeof(_ptptr) ptptr = (_ptptr); \ | ||
90 | *ptptr = tagptr_fold(*ptptr, tagptr_unfold_ptr(*ptptr), tags); \ | ||
91 | *ptptr; }) | ||
92 | |||
93 | #define tagptr_set_tags(_ptptr, _tags) ({ \ | ||
94 | typeof(_ptptr) ptptr = (_ptptr); \ | ||
95 | const typeof(_tags) tags = (_tags); \ | ||
96 | if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \ | ||
97 | __bad_tagptr_tags(); \ | ||
98 | ptptr->v |= tags; \ | ||
99 | *ptptr; }) | ||
100 | |||
101 | #define tagptr_clear_tags(_ptptr, _tags) ({ \ | ||
102 | typeof(_ptptr) ptptr = (_ptptr); \ | ||
103 | const typeof(_tags) tags = (_tags); \ | ||
104 | if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \ | ||
105 | __bad_tagptr_tags(); \ | ||
106 | ptptr->v &= ~tags; \ | ||
107 | *ptptr; }) | ||
108 | |||
109 | #endif /* __EROFS_FS_TAGPTR_H */ | ||
110 | |||
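/*
 * Editor's sketch (not part of the original patch): how the helpers above
 * fit together. The 1-bit tag mirrors the compressed_page_t usage in
 * zdata.c; the function and variable names are illustrative only.
 */
static inline void tagptr_usage_demo(void *obj)
{
	/* pack a pointer and a 1-bit tag into one word */
	tagptr1_t t = tagptr_fold(tagptr1_t, obj, 1);

	/* unpack them again; the tag lives in the low bit */
	void *p = tagptr_unfold_ptr(t);
	unsigned long tag = tagptr_unfold_tags(t);

	/* tagptr_fold(tagptr1_t, obj, 2) would trip __bad_tagptr_tags() */
	(void)p;
	(void)tag;
}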
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c new file mode 100644 index 000000000000..1dd041aa0f5a --- /dev/null +++ b/fs/erofs/utils.c | |||
@@ -0,0 +1,333 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* | ||
3 | * Copyright (C) 2018 HUAWEI, Inc. | ||
4 | * http://www.huawei.com/ | ||
5 | * Created by Gao Xiang <gaoxiang25@huawei.com> | ||
6 | */ | ||
7 | #include "internal.h" | ||
8 | #include <linux/pagevec.h> | ||
9 | |||
10 | struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp, bool nofail) | ||
11 | { | ||
12 | struct page *page; | ||
13 | |||
14 | if (!list_empty(pool)) { | ||
15 | page = lru_to_page(pool); | ||
16 | DBG_BUGON(page_ref_count(page) != 1); | ||
17 | list_del(&page->lru); | ||
18 | } else { | ||
19 | page = alloc_pages(gfp | (nofail ? __GFP_NOFAIL : 0), 0); | ||
20 | } | ||
21 | return page; | ||
22 | } | ||
23 | |||
24 | #if (EROFS_PCPUBUF_NR_PAGES > 0) | ||
25 | static struct { | ||
26 | u8 data[PAGE_SIZE * EROFS_PCPUBUF_NR_PAGES]; | ||
27 | } ____cacheline_aligned_in_smp erofs_pcpubuf[NR_CPUS]; | ||
28 | |||
29 | void *erofs_get_pcpubuf(unsigned int pagenr) | ||
30 | { | ||
31 | preempt_disable(); | ||
32 | return &erofs_pcpubuf[smp_processor_id()].data[pagenr * PAGE_SIZE]; | ||
33 | } | ||
34 | #endif | ||
35 | |||
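/*
 * Editor's sketch (not part of the original patch): the per-CPU buffer is
 * meant as a short-lived scratch area used with preemption disabled. The
 * matching "put" side simply re-enables preemption (the real tree wraps
 * this in a helper in internal.h); the function name below is illustrative.
 */
#if (EROFS_PCPUBUF_NR_PAGES > 0)
static void erofs_pcpubuf_usage_demo(const void *src, unsigned int len)
{
	/* len must not exceed EROFS_PCPUBUF_NR_PAGES * PAGE_SIZE */
	void *scratch = erofs_get_pcpubuf(0);	/* disables preemption */

	memcpy(scratch, src, len);		/* ...use the scratch buffer... */
	preempt_enable();			/* pairs with erofs_get_pcpubuf() */
}
#endif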
36 | #ifdef CONFIG_EROFS_FS_ZIP | ||
37 | /* global shrink count (for all mounted EROFS instances) */ | ||
38 | static atomic_long_t erofs_global_shrink_cnt; | ||
39 | |||
40 | #define __erofs_workgroup_get(grp) atomic_inc(&(grp)->refcount) | ||
41 | #define __erofs_workgroup_put(grp) atomic_dec(&(grp)->refcount) | ||
42 | |||
43 | static int erofs_workgroup_get(struct erofs_workgroup *grp) | ||
44 | { | ||
45 | int o; | ||
46 | |||
47 | repeat: | ||
48 | o = erofs_wait_on_workgroup_freezed(grp); | ||
49 | if (unlikely(o <= 0)) | ||
50 | return -1; | ||
51 | |||
52 | if (unlikely(atomic_cmpxchg(&grp->refcount, o, o + 1) != o)) | ||
53 | goto repeat; | ||
54 | |||
55 | /* decrease the global shrink count (paired with erofs_workgroup_put) */ | ||
56 | if (unlikely(o == 1)) | ||
57 | atomic_long_dec(&erofs_global_shrink_cnt); | ||
58 | return 0; | ||
59 | } | ||
60 | |||
61 | struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, | ||
62 | pgoff_t index, bool *tag) | ||
63 | { | ||
64 | struct erofs_sb_info *sbi = EROFS_SB(sb); | ||
65 | struct erofs_workgroup *grp; | ||
66 | |||
67 | repeat: | ||
68 | rcu_read_lock(); | ||
69 | grp = radix_tree_lookup(&sbi->workstn_tree, index); | ||
70 | if (grp) { | ||
71 | *tag = xa_pointer_tag(grp); | ||
72 | grp = xa_untag_pointer(grp); | ||
73 | |||
74 | if (erofs_workgroup_get(grp)) { | ||
75 | /* prefer to relax rcu read side */ | ||
76 | rcu_read_unlock(); | ||
77 | goto repeat; | ||
78 | } | ||
79 | |||
80 | DBG_BUGON(index != grp->index); | ||
81 | } | ||
82 | rcu_read_unlock(); | ||
83 | return grp; | ||
84 | } | ||
85 | |||
86 | int erofs_register_workgroup(struct super_block *sb, | ||
87 | struct erofs_workgroup *grp, | ||
88 | bool tag) | ||
89 | { | ||
90 | struct erofs_sb_info *sbi; | ||
91 | int err; | ||
92 | |||
93 | /* grp shouldn't be broken or used before */ | ||
94 | if (unlikely(atomic_read(&grp->refcount) != 1)) { | ||
95 | DBG_BUGON(1); | ||
96 | return -EINVAL; | ||
97 | } | ||
98 | |||
99 | err = radix_tree_preload(GFP_NOFS); | ||
100 | if (err) | ||
101 | return err; | ||
102 | |||
103 | sbi = EROFS_SB(sb); | ||
104 | xa_lock(&sbi->workstn_tree); | ||
105 | |||
106 | grp = xa_tag_pointer(grp, tag); | ||
107 | |||
108 | /* | ||
109 | * Bump up reference count before making this workgroup | ||
110 | * visible to other users in order to avoid potential UAF | ||
111 | * without being serialized by workstn_lock. | ||
112 | */ | ||
113 | __erofs_workgroup_get(grp); | ||
114 | |||
115 | err = radix_tree_insert(&sbi->workstn_tree, grp->index, grp); | ||
116 | if (unlikely(err)) | ||
117 | /* | ||
118 | * it's safe to decrease since the workgroup isn't visible | ||
119 | * and refcount >= 2 (cannot be frozen). | ||
120 | */ | ||
121 | __erofs_workgroup_put(grp); | ||
122 | |||
123 | xa_unlock(&sbi->workstn_tree); | ||
124 | radix_tree_preload_end(); | ||
125 | return err; | ||
126 | } | ||
127 | |||
128 | static void __erofs_workgroup_free(struct erofs_workgroup *grp) | ||
129 | { | ||
130 | atomic_long_dec(&erofs_global_shrink_cnt); | ||
131 | erofs_workgroup_free_rcu(grp); | ||
132 | } | ||
133 | |||
134 | int erofs_workgroup_put(struct erofs_workgroup *grp) | ||
135 | { | ||
136 | int count = atomic_dec_return(&grp->refcount); | ||
137 | |||
138 | if (count == 1) | ||
139 | atomic_long_inc(&erofs_global_shrink_cnt); | ||
140 | else if (!count) | ||
141 | __erofs_workgroup_free(grp); | ||
142 | return count; | ||
143 | } | ||
144 | |||
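/*
 * Editor's sketch (not part of the original patch): the refcount protocol
 * above boils down to "1 == only tracked by the workstation tree (hence
 * shrinkable), >= 2 == actively used, 0 == about to be freed". A typical
 * lookup/use/put sequence, mirroring what zdata.c does, looks like this;
 * the function name is illustrative only.
 */
static void erofs_workgroup_usage_demo(struct super_block *sb, pgoff_t index)
{
	bool tag;
	struct erofs_workgroup *grp;

	grp = erofs_find_workgroup(sb, index, &tag);	/* takes a reference */
	if (!grp)
		return;

	/* ...the workgroup is safe to use here... */

	erofs_workgroup_put(grp);	/* may make it shrinkable again */
}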
145 | static void erofs_workgroup_unfreeze_final(struct erofs_workgroup *grp) | ||
146 | { | ||
147 | erofs_workgroup_unfreeze(grp, 0); | ||
148 | __erofs_workgroup_free(grp); | ||
149 | } | ||
150 | |||
151 | static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, | ||
152 | struct erofs_workgroup *grp, | ||
153 | bool cleanup) | ||
154 | { | ||
155 | /* | ||
156 | * If managed cache is on, refcount of workgroups | ||
157 | * themselves could be < 0 (frozen). In other words, | ||
158 | * there is no guarantee that all refcounts are > 0. | ||
159 | */ | ||
160 | if (!erofs_workgroup_try_to_freeze(grp, 1)) | ||
161 | return false; | ||
162 | |||
163 | /* | ||
164 | * Note that all cached pages should be unattached | ||
165 | * before deleted from the radix tree. Otherwise some | ||
166 | * cached pages could be still attached to the orphan | ||
167 | * old workgroup when the new one is available in the tree. | ||
168 | */ | ||
169 | if (erofs_try_to_free_all_cached_pages(sbi, grp)) { | ||
170 | erofs_workgroup_unfreeze(grp, 1); | ||
171 | return false; | ||
172 | } | ||
173 | |||
174 | /* | ||
175 | * It's impossible to fail after the workgroup is frozen, | ||
176 | * but in order to avoid some race conditions, add a | ||
177 | * DBG_BUGON to observe this in advance. | ||
178 | */ | ||
179 | DBG_BUGON(xa_untag_pointer(radix_tree_delete(&sbi->workstn_tree, | ||
180 | grp->index)) != grp); | ||
181 | |||
182 | /* | ||
183 | * If managed cache is on, last refcount should indicate | ||
184 | * the related workstation. | ||
185 | */ | ||
186 | erofs_workgroup_unfreeze_final(grp); | ||
187 | return true; | ||
188 | } | ||
189 | |||
190 | static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi, | ||
191 | unsigned long nr_shrink, | ||
192 | bool cleanup) | ||
193 | { | ||
194 | pgoff_t first_index = 0; | ||
195 | void *batch[PAGEVEC_SIZE]; | ||
196 | unsigned int freed = 0; | ||
197 | |||
198 | int i, found; | ||
199 | repeat: | ||
200 | xa_lock(&sbi->workstn_tree); | ||
201 | |||
202 | found = radix_tree_gang_lookup(&sbi->workstn_tree, | ||
203 | batch, first_index, PAGEVEC_SIZE); | ||
204 | |||
205 | for (i = 0; i < found; ++i) { | ||
206 | struct erofs_workgroup *grp = xa_untag_pointer(batch[i]); | ||
207 | |||
208 | first_index = grp->index + 1; | ||
209 | |||
210 | /* try to shrink each valid workgroup */ | ||
211 | if (!erofs_try_to_release_workgroup(sbi, grp, cleanup)) | ||
212 | continue; | ||
213 | |||
214 | ++freed; | ||
215 | if (unlikely(!--nr_shrink)) | ||
216 | break; | ||
217 | } | ||
218 | xa_unlock(&sbi->workstn_tree); | ||
219 | |||
220 | if (i && nr_shrink) | ||
221 | goto repeat; | ||
222 | return freed; | ||
223 | } | ||
224 | |||
225 | /* protected by 'erofs_sb_list_lock' */ | ||
226 | static unsigned int shrinker_run_no; | ||
227 | |||
228 | /* protects the mounted 'erofs_sb_list' */ | ||
229 | static DEFINE_SPINLOCK(erofs_sb_list_lock); | ||
230 | static LIST_HEAD(erofs_sb_list); | ||
231 | |||
232 | void erofs_shrinker_register(struct super_block *sb) | ||
233 | { | ||
234 | struct erofs_sb_info *sbi = EROFS_SB(sb); | ||
235 | |||
236 | mutex_init(&sbi->umount_mutex); | ||
237 | |||
238 | spin_lock(&erofs_sb_list_lock); | ||
239 | list_add(&sbi->list, &erofs_sb_list); | ||
240 | spin_unlock(&erofs_sb_list_lock); | ||
241 | } | ||
242 | |||
243 | void erofs_shrinker_unregister(struct super_block *sb) | ||
244 | { | ||
245 | struct erofs_sb_info *const sbi = EROFS_SB(sb); | ||
246 | |||
247 | mutex_lock(&sbi->umount_mutex); | ||
248 | erofs_shrink_workstation(sbi, ~0UL, true); | ||
249 | |||
250 | spin_lock(&erofs_sb_list_lock); | ||
251 | list_del(&sbi->list); | ||
252 | spin_unlock(&erofs_sb_list_lock); | ||
253 | mutex_unlock(&sbi->umount_mutex); | ||
254 | } | ||
255 | |||
256 | static unsigned long erofs_shrink_count(struct shrinker *shrink, | ||
257 | struct shrink_control *sc) | ||
258 | { | ||
259 | return atomic_long_read(&erofs_global_shrink_cnt); | ||
260 | } | ||
261 | |||
262 | static unsigned long erofs_shrink_scan(struct shrinker *shrink, | ||
263 | struct shrink_control *sc) | ||
264 | { | ||
265 | struct erofs_sb_info *sbi; | ||
266 | struct list_head *p; | ||
267 | |||
268 | unsigned long nr = sc->nr_to_scan; | ||
269 | unsigned int run_no; | ||
270 | unsigned long freed = 0; | ||
271 | |||
272 | spin_lock(&erofs_sb_list_lock); | ||
273 | do { | ||
274 | run_no = ++shrinker_run_no; | ||
275 | } while (run_no == 0); | ||
276 | |||
277 | /* Iterate over all mounted superblocks and try to shrink them */ | ||
278 | p = erofs_sb_list.next; | ||
279 | while (p != &erofs_sb_list) { | ||
280 | sbi = list_entry(p, struct erofs_sb_info, list); | ||
281 | |||
282 | /* | ||
283 | * We move the ones we do to the end of the list, so we stop | ||
284 | * when we see one we have already done. | ||
285 | */ | ||
286 | if (sbi->shrinker_run_no == run_no) | ||
287 | break; | ||
288 | |||
289 | if (!mutex_trylock(&sbi->umount_mutex)) { | ||
290 | p = p->next; | ||
291 | continue; | ||
292 | } | ||
293 | |||
294 | spin_unlock(&erofs_sb_list_lock); | ||
295 | sbi->shrinker_run_no = run_no; | ||
296 | |||
297 | freed += erofs_shrink_workstation(sbi, nr, false); | ||
298 | |||
299 | spin_lock(&erofs_sb_list_lock); | ||
300 | /* Get the next list element before we move this one */ | ||
301 | p = p->next; | ||
302 | |||
303 | /* | ||
304 | * Move this one to the end of the list to provide some | ||
305 | * fairness. | ||
306 | */ | ||
307 | list_move_tail(&sbi->list, &erofs_sb_list); | ||
308 | mutex_unlock(&sbi->umount_mutex); | ||
309 | |||
310 | if (freed >= nr) | ||
311 | break; | ||
312 | } | ||
313 | spin_unlock(&erofs_sb_list_lock); | ||
314 | return freed; | ||
315 | } | ||
316 | |||
317 | static struct shrinker erofs_shrinker_info = { | ||
318 | .scan_objects = erofs_shrink_scan, | ||
319 | .count_objects = erofs_shrink_count, | ||
320 | .seeks = DEFAULT_SEEKS, | ||
321 | }; | ||
322 | |||
323 | int __init erofs_init_shrinker(void) | ||
324 | { | ||
325 | return register_shrinker(&erofs_shrinker_info); | ||
326 | } | ||
327 | |||
328 | void erofs_exit_shrinker(void) | ||
329 | { | ||
330 | unregister_shrinker(&erofs_shrinker_info); | ||
331 | } | ||
332 | #endif /* !CONFIG_EROFS_FS_ZIP */ | ||
333 | |||
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c new file mode 100644 index 000000000000..a8286998a079 --- /dev/null +++ b/fs/erofs/xattr.c | |||
@@ -0,0 +1,703 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* | ||
3 | * Copyright (C) 2017-2018 HUAWEI, Inc. | ||
4 | * http://www.huawei.com/ | ||
5 | * Created by Gao Xiang <gaoxiang25@huawei.com> | ||
6 | */ | ||
7 | #include <linux/security.h> | ||
8 | #include "xattr.h" | ||
9 | |||
10 | struct xattr_iter { | ||
11 | struct super_block *sb; | ||
12 | struct page *page; | ||
13 | void *kaddr; | ||
14 | |||
15 | erofs_blk_t blkaddr; | ||
16 | unsigned int ofs; | ||
17 | }; | ||
18 | |||
19 | static inline void xattr_iter_end(struct xattr_iter *it, bool atomic) | ||
20 | { | ||
21 | /* the only user of kunmap() is 'init_inode_xattrs' */ | ||
22 | if (unlikely(!atomic)) | ||
23 | kunmap(it->page); | ||
24 | else | ||
25 | kunmap_atomic(it->kaddr); | ||
26 | |||
27 | unlock_page(it->page); | ||
28 | put_page(it->page); | ||
29 | } | ||
30 | |||
31 | static inline void xattr_iter_end_final(struct xattr_iter *it) | ||
32 | { | ||
33 | if (!it->page) | ||
34 | return; | ||
35 | |||
36 | xattr_iter_end(it, true); | ||
37 | } | ||
38 | |||
39 | static int init_inode_xattrs(struct inode *inode) | ||
40 | { | ||
41 | struct erofs_vnode *const vi = EROFS_V(inode); | ||
42 | struct xattr_iter it; | ||
43 | unsigned int i; | ||
44 | struct erofs_xattr_ibody_header *ih; | ||
45 | struct super_block *sb; | ||
46 | struct erofs_sb_info *sbi; | ||
47 | bool atomic_map; | ||
48 | int ret = 0; | ||
49 | |||
50 | /* the most common case is that xattrs of this inode are already initialized. */ | ||
51 | if (test_bit(EROFS_V_EA_INITED_BIT, &vi->flags)) | ||
52 | return 0; | ||
53 | |||
54 | if (wait_on_bit_lock(&vi->flags, EROFS_V_BL_XATTR_BIT, TASK_KILLABLE)) | ||
55 | return -ERESTARTSYS; | ||
56 | |||
57 | /* someone has initialized xattrs for us? */ | ||
58 | if (test_bit(EROFS_V_EA_INITED_BIT, &vi->flags)) | ||
59 | goto out_unlock; | ||
60 | |||
61 | /* | ||
62 | * bypass all xattr operations if ->xattr_isize is not greater than | ||
63 | * sizeof(struct erofs_xattr_ibody_header), in detail: | ||
64 | * 1) it is not large enough to contain erofs_xattr_ibody_header, so | ||
65 | * ->xattr_isize should be 0 (it means no xattrs); | ||
66 | * 2) it is just large enough to contain erofs_xattr_ibody_header, which | ||
67 | * is undefined on-disk right now (maybe used later with a new sb feature). | ||
68 | */ | ||
69 | if (vi->xattr_isize == sizeof(struct erofs_xattr_ibody_header)) { | ||
70 | errln("xattr_isize %d of nid %llu is not supported yet", | ||
71 | vi->xattr_isize, vi->nid); | ||
72 | ret = -EOPNOTSUPP; | ||
73 | goto out_unlock; | ||
74 | } else if (vi->xattr_isize < sizeof(struct erofs_xattr_ibody_header)) { | ||
75 | if (unlikely(vi->xattr_isize)) { | ||
76 | errln("bogus xattr ibody @ nid %llu", vi->nid); | ||
77 | DBG_BUGON(1); | ||
78 | ret = -EFSCORRUPTED; | ||
79 | goto out_unlock; /* xattr ondisk layout error */ | ||
80 | } | ||
81 | ret = -ENOATTR; | ||
82 | goto out_unlock; | ||
83 | } | ||
84 | |||
85 | sb = inode->i_sb; | ||
86 | sbi = EROFS_SB(sb); | ||
87 | it.blkaddr = erofs_blknr(iloc(sbi, vi->nid) + vi->inode_isize); | ||
88 | it.ofs = erofs_blkoff(iloc(sbi, vi->nid) + vi->inode_isize); | ||
89 | |||
90 | it.page = erofs_get_inline_page(inode, it.blkaddr); | ||
91 | if (IS_ERR(it.page)) { | ||
92 | ret = PTR_ERR(it.page); | ||
93 | goto out_unlock; | ||
94 | } | ||
95 | |||
96 | /* read in shared xattr array (non-atomic, see kmalloc below) */ | ||
97 | it.kaddr = kmap(it.page); | ||
98 | atomic_map = false; | ||
99 | |||
100 | ih = (struct erofs_xattr_ibody_header *)(it.kaddr + it.ofs); | ||
101 | |||
102 | vi->xattr_shared_count = ih->h_shared_count; | ||
103 | vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count, | ||
104 | sizeof(uint), GFP_KERNEL); | ||
105 | if (!vi->xattr_shared_xattrs) { | ||
106 | xattr_iter_end(&it, atomic_map); | ||
107 | ret = -ENOMEM; | ||
108 | goto out_unlock; | ||
109 | } | ||
110 | |||
111 | /* let's skip ibody header */ | ||
112 | it.ofs += sizeof(struct erofs_xattr_ibody_header); | ||
113 | |||
114 | for (i = 0; i < vi->xattr_shared_count; ++i) { | ||
115 | if (unlikely(it.ofs >= EROFS_BLKSIZ)) { | ||
116 | /* cannot be unaligned */ | ||
117 | DBG_BUGON(it.ofs != EROFS_BLKSIZ); | ||
118 | xattr_iter_end(&it, atomic_map); | ||
119 | |||
120 | it.page = erofs_get_meta_page(sb, ++it.blkaddr, | ||
121 | S_ISDIR(inode->i_mode)); | ||
122 | if (IS_ERR(it.page)) { | ||
123 | kfree(vi->xattr_shared_xattrs); | ||
124 | vi->xattr_shared_xattrs = NULL; | ||
125 | ret = PTR_ERR(it.page); | ||
126 | goto out_unlock; | ||
127 | } | ||
128 | |||
129 | it.kaddr = kmap_atomic(it.page); | ||
130 | atomic_map = true; | ||
131 | it.ofs = 0; | ||
132 | } | ||
133 | vi->xattr_shared_xattrs[i] = | ||
134 | le32_to_cpu(*(__le32 *)(it.kaddr + it.ofs)); | ||
135 | it.ofs += sizeof(__le32); | ||
136 | } | ||
137 | xattr_iter_end(&it, atomic_map); | ||
138 | |||
139 | set_bit(EROFS_V_EA_INITED_BIT, &vi->flags); | ||
140 | |||
141 | out_unlock: | ||
142 | clear_and_wake_up_bit(EROFS_V_BL_XATTR_BIT, &vi->flags); | ||
143 | return ret; | ||
144 | } | ||
145 | |||
146 | /* | ||
147 | * the general idea for these return values is | ||
148 | * if 0 is returned, go on processing the current xattr; | ||
149 | * if 1 (> 0) is returned, skip this round to process the next xattr; | ||
150 | * if -err (< 0) is returned, an error (maybe ENOATTR) occurred | ||
151 | * and needs to be handled | ||
152 | */ | ||
153 | struct xattr_iter_handlers { | ||
154 | int (*entry)(struct xattr_iter *_it, struct erofs_xattr_entry *entry); | ||
155 | int (*name)(struct xattr_iter *_it, unsigned int processed, char *buf, | ||
156 | unsigned int len); | ||
157 | int (*alloc_buffer)(struct xattr_iter *_it, unsigned int value_sz); | ||
158 | void (*value)(struct xattr_iter *_it, unsigned int processed, char *buf, | ||
159 | unsigned int len); | ||
160 | }; | ||
161 | |||
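/*
 * Editor's sketch (not part of the original patch): a minimal ->entry
 * handler that follows the convention above -- return 0 to keep parsing
 * this xattr, 1 (> 0) to skip to the next one, or -errno to abort. It
 * only accepts "user." entries and is purely illustrative.
 */
static int demo_entry_handler(struct xattr_iter *_it,
			      struct erofs_xattr_entry *entry)
{
	if (entry->e_name_index != EROFS_XATTR_INDEX_USER)
		return 1;		/* not interesting, skip this xattr */
	if (!entry->e_name_len)
		return -EFSCORRUPTED;	/* malformed entry, abort iteration */
	return 0;			/* go on: ->name()/->value() follow */
}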
162 | static inline int xattr_iter_fixup(struct xattr_iter *it) | ||
163 | { | ||
164 | if (it->ofs < EROFS_BLKSIZ) | ||
165 | return 0; | ||
166 | |||
167 | xattr_iter_end(it, true); | ||
168 | |||
169 | it->blkaddr += erofs_blknr(it->ofs); | ||
170 | |||
171 | it->page = erofs_get_meta_page(it->sb, it->blkaddr, false); | ||
172 | if (IS_ERR(it->page)) { | ||
173 | int err = PTR_ERR(it->page); | ||
174 | |||
175 | it->page = NULL; | ||
176 | return err; | ||
177 | } | ||
178 | |||
179 | it->kaddr = kmap_atomic(it->page); | ||
180 | it->ofs = erofs_blkoff(it->ofs); | ||
181 | return 0; | ||
182 | } | ||
183 | |||
184 | static int inline_xattr_iter_begin(struct xattr_iter *it, | ||
185 | struct inode *inode) | ||
186 | { | ||
187 | struct erofs_vnode *const vi = EROFS_V(inode); | ||
188 | struct erofs_sb_info *const sbi = EROFS_SB(inode->i_sb); | ||
189 | unsigned int xattr_header_sz, inline_xattr_ofs; | ||
190 | |||
191 | xattr_header_sz = inlinexattr_header_size(inode); | ||
192 | if (unlikely(xattr_header_sz >= vi->xattr_isize)) { | ||
193 | DBG_BUGON(xattr_header_sz > vi->xattr_isize); | ||
194 | return -ENOATTR; | ||
195 | } | ||
196 | |||
197 | inline_xattr_ofs = vi->inode_isize + xattr_header_sz; | ||
198 | |||
199 | it->blkaddr = erofs_blknr(iloc(sbi, vi->nid) + inline_xattr_ofs); | ||
200 | it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs); | ||
201 | |||
202 | it->page = erofs_get_inline_page(inode, it->blkaddr); | ||
203 | if (IS_ERR(it->page)) | ||
204 | return PTR_ERR(it->page); | ||
205 | |||
206 | it->kaddr = kmap_atomic(it->page); | ||
207 | return vi->xattr_isize - xattr_header_sz; | ||
208 | } | ||
209 | |||
210 | /* | ||
211 | * Regardless of success or failure, `xattr_foreach' will end up with | ||
212 | * `ofs' pointing to the next xattr item rather than an arbitrary position. | ||
213 | */ | ||
214 | static int xattr_foreach(struct xattr_iter *it, | ||
215 | const struct xattr_iter_handlers *op, | ||
216 | unsigned int *tlimit) | ||
217 | { | ||
218 | struct erofs_xattr_entry entry; | ||
219 | unsigned int value_sz, processed, slice; | ||
220 | int err; | ||
221 | |||
222 | /* 0. fixup blkaddr, ofs, ipage */ | ||
223 | err = xattr_iter_fixup(it); | ||
224 | if (err) | ||
225 | return err; | ||
226 | |||
227 | /* | ||
228 | * 1. read the xattr entry into memory; since entries are | ||
229 | * EROFS_XATTR_ALIGN-aligned, the whole entry is | ||
230 | * guaranteed to reside within a single page | ||
231 | */ | ||
232 | entry = *(struct erofs_xattr_entry *)(it->kaddr + it->ofs); | ||
233 | if (tlimit) { | ||
234 | unsigned int entry_sz = EROFS_XATTR_ENTRY_SIZE(&entry); | ||
235 | |||
236 | /* xattr on-disk corruption: xattr entry beyond xattr_isize */ | ||
237 | if (unlikely(*tlimit < entry_sz)) { | ||
238 | DBG_BUGON(1); | ||
239 | return -EFSCORRUPTED; | ||
240 | } | ||
241 | *tlimit -= entry_sz; | ||
242 | } | ||
243 | |||
244 | it->ofs += sizeof(struct erofs_xattr_entry); | ||
245 | value_sz = le16_to_cpu(entry.e_value_size); | ||
246 | |||
247 | /* handle entry */ | ||
248 | err = op->entry(it, &entry); | ||
249 | if (err) { | ||
250 | it->ofs += entry.e_name_len + value_sz; | ||
251 | goto out; | ||
252 | } | ||
253 | |||
254 | /* 2. handle xattr name (ofs will finally be at the end of name) */ | ||
255 | processed = 0; | ||
256 | |||
257 | while (processed < entry.e_name_len) { | ||
258 | if (it->ofs >= EROFS_BLKSIZ) { | ||
259 | DBG_BUGON(it->ofs > EROFS_BLKSIZ); | ||
260 | |||
261 | err = xattr_iter_fixup(it); | ||
262 | if (err) | ||
263 | goto out; | ||
264 | it->ofs = 0; | ||
265 | } | ||
266 | |||
267 | slice = min_t(unsigned int, PAGE_SIZE - it->ofs, | ||
268 | entry.e_name_len - processed); | ||
269 | |||
270 | /* handle name */ | ||
271 | err = op->name(it, processed, it->kaddr + it->ofs, slice); | ||
272 | if (err) { | ||
273 | it->ofs += entry.e_name_len - processed + value_sz; | ||
274 | goto out; | ||
275 | } | ||
276 | |||
277 | it->ofs += slice; | ||
278 | processed += slice; | ||
279 | } | ||
280 | |||
281 | /* 3. handle xattr value */ | ||
282 | processed = 0; | ||
283 | |||
284 | if (op->alloc_buffer) { | ||
285 | err = op->alloc_buffer(it, value_sz); | ||
286 | if (err) { | ||
287 | it->ofs += value_sz; | ||
288 | goto out; | ||
289 | } | ||
290 | } | ||
291 | |||
292 | while (processed < value_sz) { | ||
293 | if (it->ofs >= EROFS_BLKSIZ) { | ||
294 | DBG_BUGON(it->ofs > EROFS_BLKSIZ); | ||
295 | |||
296 | err = xattr_iter_fixup(it); | ||
297 | if (err) | ||
298 | goto out; | ||
299 | it->ofs = 0; | ||
300 | } | ||
301 | |||
302 | slice = min_t(unsigned int, PAGE_SIZE - it->ofs, | ||
303 | value_sz - processed); | ||
304 | op->value(it, processed, it->kaddr + it->ofs, slice); | ||
305 | it->ofs += slice; | ||
306 | processed += slice; | ||
307 | } | ||
308 | |||
309 | out: | ||
310 | /* xattrs should be 4-byte aligned (on-disk constraint) */ | ||
311 | it->ofs = EROFS_XATTR_ALIGN(it->ofs); | ||
312 | return err < 0 ? err : 0; | ||
313 | } | ||
314 | |||
315 | struct getxattr_iter { | ||
316 | struct xattr_iter it; | ||
317 | |||
318 | char *buffer; | ||
319 | int buffer_size, index; | ||
320 | struct qstr name; | ||
321 | }; | ||
322 | |||
323 | static int xattr_entrymatch(struct xattr_iter *_it, | ||
324 | struct erofs_xattr_entry *entry) | ||
325 | { | ||
326 | struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); | ||
327 | |||
328 | return (it->index != entry->e_name_index || | ||
329 | it->name.len != entry->e_name_len) ? -ENOATTR : 0; | ||
330 | } | ||
331 | |||
332 | static int xattr_namematch(struct xattr_iter *_it, | ||
333 | unsigned int processed, char *buf, unsigned int len) | ||
334 | { | ||
335 | struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); | ||
336 | |||
337 | return memcmp(buf, it->name.name + processed, len) ? -ENOATTR : 0; | ||
338 | } | ||
339 | |||
340 | static int xattr_checkbuffer(struct xattr_iter *_it, | ||
341 | unsigned int value_sz) | ||
342 | { | ||
343 | struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); | ||
344 | int err = it->buffer_size < value_sz ? -ERANGE : 0; | ||
345 | |||
346 | it->buffer_size = value_sz; | ||
347 | return !it->buffer ? 1 : err; | ||
348 | } | ||
349 | |||
350 | static void xattr_copyvalue(struct xattr_iter *_it, | ||
351 | unsigned int processed, | ||
352 | char *buf, unsigned int len) | ||
353 | { | ||
354 | struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); | ||
355 | |||
356 | memcpy(it->buffer + processed, buf, len); | ||
357 | } | ||
358 | |||
359 | static const struct xattr_iter_handlers find_xattr_handlers = { | ||
360 | .entry = xattr_entrymatch, | ||
361 | .name = xattr_namematch, | ||
362 | .alloc_buffer = xattr_checkbuffer, | ||
363 | .value = xattr_copyvalue | ||
364 | }; | ||
365 | |||
366 | static int inline_getxattr(struct inode *inode, struct getxattr_iter *it) | ||
367 | { | ||
368 | int ret; | ||
369 | unsigned int remaining; | ||
370 | |||
371 | ret = inline_xattr_iter_begin(&it->it, inode); | ||
372 | if (ret < 0) | ||
373 | return ret; | ||
374 | |||
375 | remaining = ret; | ||
376 | while (remaining) { | ||
377 | ret = xattr_foreach(&it->it, &find_xattr_handlers, &remaining); | ||
378 | if (ret != -ENOATTR) | ||
379 | break; | ||
380 | } | ||
381 | xattr_iter_end_final(&it->it); | ||
382 | |||
383 | return ret ? ret : it->buffer_size; | ||
384 | } | ||
385 | |||
386 | static int shared_getxattr(struct inode *inode, struct getxattr_iter *it) | ||
387 | { | ||
388 | struct erofs_vnode *const vi = EROFS_V(inode); | ||
389 | struct super_block *const sb = inode->i_sb; | ||
390 | struct erofs_sb_info *const sbi = EROFS_SB(sb); | ||
391 | unsigned int i; | ||
392 | int ret = -ENOATTR; | ||
393 | |||
394 | for (i = 0; i < vi->xattr_shared_count; ++i) { | ||
395 | erofs_blk_t blkaddr = | ||
396 | xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]); | ||
397 | |||
398 | it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); | ||
399 | |||
400 | if (!i || blkaddr != it->it.blkaddr) { | ||
401 | if (i) | ||
402 | xattr_iter_end(&it->it, true); | ||
403 | |||
404 | it->it.page = erofs_get_meta_page(sb, blkaddr, false); | ||
405 | if (IS_ERR(it->it.page)) | ||
406 | return PTR_ERR(it->it.page); | ||
407 | |||
408 | it->it.kaddr = kmap_atomic(it->it.page); | ||
409 | it->it.blkaddr = blkaddr; | ||
410 | } | ||
411 | |||
412 | ret = xattr_foreach(&it->it, &find_xattr_handlers, NULL); | ||
413 | if (ret != -ENOATTR) | ||
414 | break; | ||
415 | } | ||
416 | if (vi->xattr_shared_count) | ||
417 | xattr_iter_end_final(&it->it); | ||
418 | |||
419 | return ret ? ret : it->buffer_size; | ||
420 | } | ||
421 | |||
422 | static bool erofs_xattr_user_list(struct dentry *dentry) | ||
423 | { | ||
424 | return test_opt(EROFS_SB(dentry->d_sb), XATTR_USER); | ||
425 | } | ||
426 | |||
427 | static bool erofs_xattr_trusted_list(struct dentry *dentry) | ||
428 | { | ||
429 | return capable(CAP_SYS_ADMIN); | ||
430 | } | ||
431 | |||
432 | int erofs_getxattr(struct inode *inode, int index, | ||
433 | const char *name, | ||
434 | void *buffer, size_t buffer_size) | ||
435 | { | ||
436 | int ret; | ||
437 | struct getxattr_iter it; | ||
438 | |||
439 | if (unlikely(!name)) | ||
440 | return -EINVAL; | ||
441 | |||
442 | ret = init_inode_xattrs(inode); | ||
443 | if (ret) | ||
444 | return ret; | ||
445 | |||
446 | it.index = index; | ||
447 | |||
448 | it.name.len = strlen(name); | ||
449 | if (it.name.len > EROFS_NAME_LEN) | ||
450 | return -ERANGE; | ||
451 | it.name.name = name; | ||
452 | |||
453 | it.buffer = buffer; | ||
454 | it.buffer_size = buffer_size; | ||
455 | |||
456 | it.it.sb = inode->i_sb; | ||
457 | ret = inline_getxattr(inode, &it); | ||
458 | if (ret == -ENOATTR) | ||
459 | ret = shared_getxattr(inode, &it); | ||
460 | return ret; | ||
461 | } | ||
462 | |||
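/*
 * Editor's sketch (not part of the original patch): the usual calling
 * pattern for erofs_getxattr() is a size query with a NULL buffer followed
 * by the actual fetch, just like erofs_get_acl() below does for ACL
 * xattrs. The function name here is illustrative only.
 */
static int demo_read_user_xattr(struct inode *inode, const char *name,
				void **out)
{
	void *value;
	int sz = erofs_getxattr(inode, EROFS_XATTR_INDEX_USER, name, NULL, 0);

	if (sz <= 0)
		return sz;		/* 0, -ENODATA, -ERANGE, ... */

	value = kmalloc(sz, GFP_KERNEL);
	if (!value)
		return -ENOMEM;

	sz = erofs_getxattr(inode, EROFS_XATTR_INDEX_USER, name, value, sz);
	if (sz < 0) {
		kfree(value);
		return sz;
	}
	*out = value;
	return sz;			/* number of value bytes */
}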
463 | static int erofs_xattr_generic_get(const struct xattr_handler *handler, | ||
464 | struct dentry *unused, struct inode *inode, | ||
465 | const char *name, void *buffer, size_t size) | ||
466 | { | ||
467 | struct erofs_sb_info *const sbi = EROFS_I_SB(inode); | ||
468 | |||
469 | switch (handler->flags) { | ||
470 | case EROFS_XATTR_INDEX_USER: | ||
471 | if (!test_opt(sbi, XATTR_USER)) | ||
472 | return -EOPNOTSUPP; | ||
473 | break; | ||
474 | case EROFS_XATTR_INDEX_TRUSTED: | ||
475 | if (!capable(CAP_SYS_ADMIN)) | ||
476 | return -EPERM; | ||
477 | break; | ||
478 | case EROFS_XATTR_INDEX_SECURITY: | ||
479 | break; | ||
480 | default: | ||
481 | return -EINVAL; | ||
482 | } | ||
483 | |||
484 | return erofs_getxattr(inode, handler->flags, name, buffer, size); | ||
485 | } | ||
486 | |||
487 | const struct xattr_handler erofs_xattr_user_handler = { | ||
488 | .prefix = XATTR_USER_PREFIX, | ||
489 | .flags = EROFS_XATTR_INDEX_USER, | ||
490 | .list = erofs_xattr_user_list, | ||
491 | .get = erofs_xattr_generic_get, | ||
492 | }; | ||
493 | |||
494 | const struct xattr_handler erofs_xattr_trusted_handler = { | ||
495 | .prefix = XATTR_TRUSTED_PREFIX, | ||
496 | .flags = EROFS_XATTR_INDEX_TRUSTED, | ||
497 | .list = erofs_xattr_trusted_list, | ||
498 | .get = erofs_xattr_generic_get, | ||
499 | }; | ||
500 | |||
501 | #ifdef CONFIG_EROFS_FS_SECURITY | ||
502 | const struct xattr_handler __maybe_unused erofs_xattr_security_handler = { | ||
503 | .prefix = XATTR_SECURITY_PREFIX, | ||
504 | .flags = EROFS_XATTR_INDEX_SECURITY, | ||
505 | .get = erofs_xattr_generic_get, | ||
506 | }; | ||
507 | #endif | ||
508 | |||
509 | const struct xattr_handler *erofs_xattr_handlers[] = { | ||
510 | &erofs_xattr_user_handler, | ||
511 | #ifdef CONFIG_EROFS_FS_POSIX_ACL | ||
512 | &posix_acl_access_xattr_handler, | ||
513 | &posix_acl_default_xattr_handler, | ||
514 | #endif | ||
515 | &erofs_xattr_trusted_handler, | ||
516 | #ifdef CONFIG_EROFS_FS_SECURITY | ||
517 | &erofs_xattr_security_handler, | ||
518 | #endif | ||
519 | NULL, | ||
520 | }; | ||
521 | |||
522 | struct listxattr_iter { | ||
523 | struct xattr_iter it; | ||
524 | |||
525 | struct dentry *dentry; | ||
526 | char *buffer; | ||
527 | int buffer_size, buffer_ofs; | ||
528 | }; | ||
529 | |||
530 | static int xattr_entrylist(struct xattr_iter *_it, | ||
531 | struct erofs_xattr_entry *entry) | ||
532 | { | ||
533 | struct listxattr_iter *it = | ||
534 | container_of(_it, struct listxattr_iter, it); | ||
535 | unsigned int prefix_len; | ||
536 | const char *prefix; | ||
537 | |||
538 | const struct xattr_handler *h = | ||
539 | erofs_xattr_handler(entry->e_name_index); | ||
540 | |||
541 | if (!h || (h->list && !h->list(it->dentry))) | ||
542 | return 1; | ||
543 | |||
544 | prefix = xattr_prefix(h); | ||
545 | prefix_len = strlen(prefix); | ||
546 | |||
547 | if (!it->buffer) { | ||
548 | it->buffer_ofs += prefix_len + entry->e_name_len + 1; | ||
549 | return 1; | ||
550 | } | ||
551 | |||
552 | if (it->buffer_ofs + prefix_len | ||
553 | + entry->e_name_len + 1 > it->buffer_size) | ||
554 | return -ERANGE; | ||
555 | |||
556 | memcpy(it->buffer + it->buffer_ofs, prefix, prefix_len); | ||
557 | it->buffer_ofs += prefix_len; | ||
558 | return 0; | ||
559 | } | ||
560 | |||
561 | static int xattr_namelist(struct xattr_iter *_it, | ||
562 | unsigned int processed, char *buf, unsigned int len) | ||
563 | { | ||
564 | struct listxattr_iter *it = | ||
565 | container_of(_it, struct listxattr_iter, it); | ||
566 | |||
567 | memcpy(it->buffer + it->buffer_ofs, buf, len); | ||
568 | it->buffer_ofs += len; | ||
569 | return 0; | ||
570 | } | ||
571 | |||
572 | static int xattr_skipvalue(struct xattr_iter *_it, | ||
573 | unsigned int value_sz) | ||
574 | { | ||
575 | struct listxattr_iter *it = | ||
576 | container_of(_it, struct listxattr_iter, it); | ||
577 | |||
578 | it->buffer[it->buffer_ofs++] = '\0'; | ||
579 | return 1; | ||
580 | } | ||
581 | |||
582 | static const struct xattr_iter_handlers list_xattr_handlers = { | ||
583 | .entry = xattr_entrylist, | ||
584 | .name = xattr_namelist, | ||
585 | .alloc_buffer = xattr_skipvalue, | ||
586 | .value = NULL | ||
587 | }; | ||
588 | |||
589 | static int inline_listxattr(struct listxattr_iter *it) | ||
590 | { | ||
591 | int ret; | ||
592 | unsigned int remaining; | ||
593 | |||
594 | ret = inline_xattr_iter_begin(&it->it, d_inode(it->dentry)); | ||
595 | if (ret < 0) | ||
596 | return ret; | ||
597 | |||
598 | remaining = ret; | ||
599 | while (remaining) { | ||
600 | ret = xattr_foreach(&it->it, &list_xattr_handlers, &remaining); | ||
601 | if (ret) | ||
602 | break; | ||
603 | } | ||
604 | xattr_iter_end_final(&it->it); | ||
605 | return ret ? ret : it->buffer_ofs; | ||
606 | } | ||
607 | |||
608 | static int shared_listxattr(struct listxattr_iter *it) | ||
609 | { | ||
610 | struct inode *const inode = d_inode(it->dentry); | ||
611 | struct erofs_vnode *const vi = EROFS_V(inode); | ||
612 | struct super_block *const sb = inode->i_sb; | ||
613 | struct erofs_sb_info *const sbi = EROFS_SB(sb); | ||
614 | unsigned int i; | ||
615 | int ret = 0; | ||
616 | |||
617 | for (i = 0; i < vi->xattr_shared_count; ++i) { | ||
618 | erofs_blk_t blkaddr = | ||
619 | xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]); | ||
620 | |||
621 | it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]); | ||
622 | if (!i || blkaddr != it->it.blkaddr) { | ||
623 | if (i) | ||
624 | xattr_iter_end(&it->it, true); | ||
625 | |||
626 | it->it.page = erofs_get_meta_page(sb, blkaddr, false); | ||
627 | if (IS_ERR(it->it.page)) | ||
628 | return PTR_ERR(it->it.page); | ||
629 | |||
630 | it->it.kaddr = kmap_atomic(it->it.page); | ||
631 | it->it.blkaddr = blkaddr; | ||
632 | } | ||
633 | |||
634 | ret = xattr_foreach(&it->it, &list_xattr_handlers, NULL); | ||
635 | if (ret) | ||
636 | break; | ||
637 | } | ||
638 | if (vi->xattr_shared_count) | ||
639 | xattr_iter_end_final(&it->it); | ||
640 | |||
641 | return ret ? ret : it->buffer_ofs; | ||
642 | } | ||
643 | |||
644 | ssize_t erofs_listxattr(struct dentry *dentry, | ||
645 | char *buffer, size_t buffer_size) | ||
646 | { | ||
647 | int ret; | ||
648 | struct listxattr_iter it; | ||
649 | |||
650 | ret = init_inode_xattrs(d_inode(dentry)); | ||
651 | if (ret) | ||
652 | return ret; | ||
653 | |||
654 | it.dentry = dentry; | ||
655 | it.buffer = buffer; | ||
656 | it.buffer_size = buffer_size; | ||
657 | it.buffer_ofs = 0; | ||
658 | |||
659 | it.it.sb = dentry->d_sb; | ||
660 | |||
661 | ret = inline_listxattr(&it); | ||
662 | if (ret < 0 && ret != -ENOATTR) | ||
663 | return ret; | ||
664 | return shared_listxattr(&it); | ||
665 | } | ||
666 | |||
667 | #ifdef CONFIG_EROFS_FS_POSIX_ACL | ||
668 | struct posix_acl *erofs_get_acl(struct inode *inode, int type) | ||
669 | { | ||
670 | struct posix_acl *acl; | ||
671 | int prefix, rc; | ||
672 | char *value = NULL; | ||
673 | |||
674 | switch (type) { | ||
675 | case ACL_TYPE_ACCESS: | ||
676 | prefix = EROFS_XATTR_INDEX_POSIX_ACL_ACCESS; | ||
677 | break; | ||
678 | case ACL_TYPE_DEFAULT: | ||
679 | prefix = EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT; | ||
680 | break; | ||
681 | default: | ||
682 | return ERR_PTR(-EINVAL); | ||
683 | } | ||
684 | |||
685 | rc = erofs_getxattr(inode, prefix, "", NULL, 0); | ||
686 | if (rc > 0) { | ||
687 | value = kmalloc(rc, GFP_KERNEL); | ||
688 | if (!value) | ||
689 | return ERR_PTR(-ENOMEM); | ||
690 | rc = erofs_getxattr(inode, prefix, "", value, rc); | ||
691 | } | ||
692 | |||
693 | if (rc == -ENOATTR) | ||
694 | acl = NULL; | ||
695 | else if (rc < 0) | ||
696 | acl = ERR_PTR(rc); | ||
697 | else | ||
698 | acl = posix_acl_from_xattr(&init_user_ns, value, rc); | ||
699 | kfree(value); | ||
700 | return acl; | ||
701 | } | ||
702 | #endif | ||
703 | |||
diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h new file mode 100644 index 000000000000..c5ca47d814dd --- /dev/null +++ b/fs/erofs/xattr.h | |||
@@ -0,0 +1,92 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0-only */ | ||
2 | /* | ||
3 | * Copyright (C) 2017-2018 HUAWEI, Inc. | ||
4 | * http://www.huawei.com/ | ||
5 | * Created by Gao Xiang <gaoxiang25@huawei.com> | ||
6 | */ | ||
7 | #ifndef __EROFS_XATTR_H | ||
8 | #define __EROFS_XATTR_H | ||
9 | |||
10 | #include "internal.h" | ||
11 | #include <linux/posix_acl_xattr.h> | ||
12 | #include <linux/xattr.h> | ||
13 | |||
14 | /* Attribute not found */ | ||
15 | #define ENOATTR ENODATA | ||
16 | |||
17 | static inline unsigned int inlinexattr_header_size(struct inode *inode) | ||
18 | { | ||
19 | return sizeof(struct erofs_xattr_ibody_header) | ||
20 | + sizeof(u32) * EROFS_V(inode)->xattr_shared_count; | ||
21 | } | ||
22 | |||
23 | static inline erofs_blk_t xattrblock_addr(struct erofs_sb_info *sbi, | ||
24 | unsigned int xattr_id) | ||
25 | { | ||
26 | #ifdef CONFIG_EROFS_FS_XATTR | ||
27 | return sbi->xattr_blkaddr + | ||
28 | xattr_id * sizeof(__u32) / EROFS_BLKSIZ; | ||
29 | #else | ||
30 | return 0; | ||
31 | #endif | ||
32 | } | ||
33 | |||
34 | static inline unsigned int xattrblock_offset(struct erofs_sb_info *sbi, | ||
35 | unsigned int xattr_id) | ||
36 | { | ||
37 | return (xattr_id * sizeof(__u32)) % EROFS_BLKSIZ; | ||
38 | } | ||
39 | |||
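/*
 * Editor's worked example (not part of the original patch): shared xattrs
 * are addressed as 4-byte slots starting at sbi->xattr_blkaddr. Assuming
 * EROFS_BLKSIZ is 4096, shared xattr id 2050 lives at byte offset
 * 2050 * 4 = 8200, i.e.:
 *
 *	xattrblock_addr(sbi, 2050)   == sbi->xattr_blkaddr + 8200 / 4096
 *	                             == sbi->xattr_blkaddr + 2
 *	xattrblock_offset(sbi, 2050) == 8200 % 4096 == 8
 */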
40 | #ifdef CONFIG_EROFS_FS_XATTR | ||
41 | extern const struct xattr_handler erofs_xattr_user_handler; | ||
42 | extern const struct xattr_handler erofs_xattr_trusted_handler; | ||
43 | #ifdef CONFIG_EROFS_FS_SECURITY | ||
44 | extern const struct xattr_handler erofs_xattr_security_handler; | ||
45 | #endif | ||
46 | |||
47 | static inline const struct xattr_handler *erofs_xattr_handler(unsigned int idx) | ||
48 | { | ||
49 | static const struct xattr_handler *xattr_handler_map[] = { | ||
50 | [EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler, | ||
51 | #ifdef CONFIG_EROFS_FS_POSIX_ACL | ||
52 | [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, | ||
53 | [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] = | ||
54 | &posix_acl_default_xattr_handler, | ||
55 | #endif | ||
56 | [EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler, | ||
57 | #ifdef CONFIG_EROFS_FS_SECURITY | ||
58 | [EROFS_XATTR_INDEX_SECURITY] = &erofs_xattr_security_handler, | ||
59 | #endif | ||
60 | }; | ||
61 | |||
62 | return idx && idx < ARRAY_SIZE(xattr_handler_map) ? | ||
63 | xattr_handler_map[idx] : NULL; | ||
64 | } | ||
65 | |||
66 | extern const struct xattr_handler *erofs_xattr_handlers[]; | ||
67 | |||
68 | int erofs_getxattr(struct inode *, int, const char *, void *, size_t); | ||
69 | ssize_t erofs_listxattr(struct dentry *, char *, size_t); | ||
70 | #else | ||
71 | static inline int erofs_getxattr(struct inode *inode, int index, | ||
72 | const char *name, void *buffer, | ||
73 | size_t buffer_size) | ||
74 | { | ||
75 | return -EOPNOTSUPP; | ||
76 | } | ||
77 | |||
78 | static inline ssize_t erofs_listxattr(struct dentry *dentry, | ||
79 | char *buffer, size_t buffer_size) | ||
80 | { | ||
81 | return -EOPNOTSUPP; | ||
82 | } | ||
83 | #endif /* !CONFIG_EROFS_FS_XATTR */ | ||
84 | |||
85 | #ifdef CONFIG_EROFS_FS_POSIX_ACL | ||
86 | struct posix_acl *erofs_get_acl(struct inode *inode, int type); | ||
87 | #else | ||
88 | #define erofs_get_acl (NULL) | ||
89 | #endif | ||
90 | |||
91 | #endif | ||
92 | |||
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c new file mode 100644 index 000000000000..b32ad585237c --- /dev/null +++ b/fs/erofs/zdata.c | |||
@@ -0,0 +1,1432 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* | ||
3 | * Copyright (C) 2018 HUAWEI, Inc. | ||
4 | * http://www.huawei.com/ | ||
5 | * Created by Gao Xiang <gaoxiang25@huawei.com> | ||
6 | */ | ||
7 | #include "zdata.h" | ||
8 | #include "compress.h" | ||
9 | #include <linux/prefetch.h> | ||
10 | |||
11 | #include <trace/events/erofs.h> | ||
12 | |||
13 | /* | ||
14 | * a compressed_pages[] placeholder in order to avoid | ||
15 | * being filled with file pages for in-place decompression. | ||
16 | */ | ||
17 | #define PAGE_UNALLOCATED ((void *)0x5F0E4B1D) | ||
18 | |||
19 | /* how to allocate cached pages for a pcluster */ | ||
20 | enum z_erofs_cache_alloctype { | ||
21 | DONTALLOC, /* don't allocate any cached pages */ | ||
22 | DELAYEDALLOC, /* delayed allocation (at the time of submitting io) */ | ||
23 | }; | ||
24 | |||
25 | /* | ||
26 | * tagged pointer with 1-bit tag for all compressed pages | ||
27 | * tag 0 - the page is just found with an extra page reference | ||
28 | */ | ||
29 | typedef tagptr1_t compressed_page_t; | ||
30 | |||
31 | #define tag_compressed_page_justfound(page) \ | ||
32 | tagptr_fold(compressed_page_t, page, 1) | ||
33 | |||
34 | static struct workqueue_struct *z_erofs_workqueue __read_mostly; | ||
35 | static struct kmem_cache *pcluster_cachep __read_mostly; | ||
36 | |||
37 | void z_erofs_exit_zip_subsystem(void) | ||
38 | { | ||
39 | destroy_workqueue(z_erofs_workqueue); | ||
40 | kmem_cache_destroy(pcluster_cachep); | ||
41 | } | ||
42 | |||
43 | static inline int init_unzip_workqueue(void) | ||
44 | { | ||
45 | const unsigned int onlinecpus = num_possible_cpus(); | ||
46 | const unsigned int flags = WQ_UNBOUND | WQ_HIGHPRI | WQ_CPU_INTENSIVE; | ||
47 | |||
48 | /* | ||
49 | * no need to spawn too many threads; limiting the number of threads | ||
50 | * can minimize scheduling overhead, perhaps per-CPU threads would be better? | ||
51 | */ | ||
52 | z_erofs_workqueue = alloc_workqueue("erofs_unzipd", flags, | ||
53 | onlinecpus + onlinecpus / 4); | ||
54 | return z_erofs_workqueue ? 0 : -ENOMEM; | ||
55 | } | ||
56 | |||
57 | static void init_once(void *ptr) | ||
58 | { | ||
59 | struct z_erofs_pcluster *pcl = ptr; | ||
60 | struct z_erofs_collection *cl = z_erofs_primarycollection(pcl); | ||
61 | unsigned int i; | ||
62 | |||
63 | mutex_init(&cl->lock); | ||
64 | cl->nr_pages = 0; | ||
65 | cl->vcnt = 0; | ||
66 | for (i = 0; i < Z_EROFS_CLUSTER_MAX_PAGES; ++i) | ||
67 | pcl->compressed_pages[i] = NULL; | ||
68 | } | ||
69 | |||
70 | static void init_always(struct z_erofs_pcluster *pcl) | ||
71 | { | ||
72 | struct z_erofs_collection *cl = z_erofs_primarycollection(pcl); | ||
73 | |||
74 | atomic_set(&pcl->obj.refcount, 1); | ||
75 | |||
76 | DBG_BUGON(cl->nr_pages); | ||
77 | DBG_BUGON(cl->vcnt); | ||
78 | } | ||
79 | |||
80 | int __init z_erofs_init_zip_subsystem(void) | ||
81 | { | ||
82 | pcluster_cachep = kmem_cache_create("erofs_compress", | ||
83 | Z_EROFS_WORKGROUP_SIZE, 0, | ||
84 | SLAB_RECLAIM_ACCOUNT, init_once); | ||
85 | if (pcluster_cachep) { | ||
86 | if (!init_unzip_workqueue()) | ||
87 | return 0; | ||
88 | |||
89 | kmem_cache_destroy(pcluster_cachep); | ||
90 | } | ||
91 | return -ENOMEM; | ||
92 | } | ||
93 | |||
94 | enum z_erofs_collectmode { | ||
95 | COLLECT_SECONDARY, | ||
96 | COLLECT_PRIMARY, | ||
97 | /* | ||
98 | * The current collection was the tail of an existing chain; in | ||
99 | * addition, all previously processed chained collections have been | ||
100 | * decided to be hooked up to it. | ||
101 | * A new chain will be created for the remaining collections which are | ||
102 | * not processed yet, so unlike COLLECT_PRIMARY_FOLLOWED, | ||
103 | * the next collection cannot reuse the whole page safely in | ||
104 | * the following scenario: | ||
105 | * ________________________________________________________________ | ||
106 | * | tail (partial) page | head (partial) page | | ||
107 | * | (belongs to the next cl) | (belongs to the current cl) | | ||
108 | * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________| | ||
109 | */ | ||
110 | COLLECT_PRIMARY_HOOKED, | ||
111 | COLLECT_PRIMARY_FOLLOWED_NOINPLACE, | ||
112 | /* | ||
113 | * The current collection has been linked with the owned chain, and | ||
114 | * could also be linked with the remaining collections, which means | ||
115 | * if the processing page is the tail page of the collection, thus | ||
116 | * the current collection can safely use the whole page (since | ||
117 | * the previous collection is under control) for in-place I/O, as | ||
118 | * illustrated below: | ||
119 | * ________________________________________________________________ | ||
120 | * | tail (partial) page | head (partial) page | | ||
121 | * | (of the current cl) | (of the previous collection) | | ||
122 | * | PRIMARY_FOLLOWED or | | | ||
123 | * |_____PRIMARY_HOOKED___|____________PRIMARY_FOLLOWED____________| | ||
124 | * | ||
125 | * [ (*) the above page can be used as inplace I/O. ] | ||
126 | */ | ||
127 | COLLECT_PRIMARY_FOLLOWED, | ||
128 | }; | ||
129 | |||
130 | struct z_erofs_collector { | ||
131 | struct z_erofs_pagevec_ctor vector; | ||
132 | |||
133 | struct z_erofs_pcluster *pcl, *tailpcl; | ||
134 | struct z_erofs_collection *cl; | ||
135 | struct page **compressedpages; | ||
136 | z_erofs_next_pcluster_t owned_head; | ||
137 | |||
138 | enum z_erofs_collectmode mode; | ||
139 | }; | ||
140 | |||
141 | struct z_erofs_decompress_frontend { | ||
142 | struct inode *const inode; | ||
143 | |||
144 | struct z_erofs_collector clt; | ||
145 | struct erofs_map_blocks map; | ||
146 | |||
147 | /* used for applying cache strategy on the fly */ | ||
148 | bool backmost; | ||
149 | erofs_off_t headoffset; | ||
150 | }; | ||
151 | |||
152 | #define COLLECTOR_INIT() { \ | ||
153 | .owned_head = Z_EROFS_PCLUSTER_TAIL, \ | ||
154 | .mode = COLLECT_PRIMARY_FOLLOWED } | ||
155 | |||
156 | #define DECOMPRESS_FRONTEND_INIT(__i) { \ | ||
157 | .inode = __i, .clt = COLLECTOR_INIT(), \ | ||
158 | .backmost = true, } | ||
159 | |||
160 | static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES]; | ||
161 | static DEFINE_MUTEX(z_pagemap_global_lock); | ||
162 | |||
163 | static void preload_compressed_pages(struct z_erofs_collector *clt, | ||
164 | struct address_space *mc, | ||
165 | enum z_erofs_cache_alloctype type, | ||
166 | struct list_head *pagepool) | ||
167 | { | ||
168 | const struct z_erofs_pcluster *pcl = clt->pcl; | ||
169 | const unsigned int clusterpages = BIT(pcl->clusterbits); | ||
170 | struct page **pages = clt->compressedpages; | ||
171 | pgoff_t index = pcl->obj.index + (pages - pcl->compressed_pages); | ||
172 | bool standalone = true; | ||
173 | |||
174 | if (clt->mode < COLLECT_PRIMARY_FOLLOWED) | ||
175 | return; | ||
176 | |||
177 | for (; pages < pcl->compressed_pages + clusterpages; ++pages) { | ||
178 | struct page *page; | ||
179 | compressed_page_t t; | ||
180 | |||
181 | /* the compressed page was loaded before */ | ||
182 | if (READ_ONCE(*pages)) | ||
183 | continue; | ||
184 | |||
185 | page = find_get_page(mc, index); | ||
186 | |||
187 | if (page) { | ||
188 | t = tag_compressed_page_justfound(page); | ||
189 | } else if (type == DELAYEDALLOC) { | ||
190 | t = tagptr_init(compressed_page_t, PAGE_UNALLOCATED); | ||
191 | } else { /* DONTALLOC */ | ||
192 | if (standalone) | ||
193 | clt->compressedpages = pages; | ||
194 | standalone = false; | ||
195 | continue; | ||
196 | } | ||
197 | |||
198 | if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t))) | ||
199 | continue; | ||
200 | |||
201 | if (page) | ||
202 | put_page(page); | ||
203 | } | ||
204 | |||
205 | if (standalone) /* downgrade to PRIMARY_FOLLOWED_NOINPLACE */ | ||
206 | clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; | ||
207 | } | ||
208 | |||
209 | /* called by erofs_shrinker to get rid of all compressed_pages */ | ||
210 | int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, | ||
211 | struct erofs_workgroup *grp) | ||
212 | { | ||
213 | struct z_erofs_pcluster *const pcl = | ||
214 | container_of(grp, struct z_erofs_pcluster, obj); | ||
215 | struct address_space *const mapping = MNGD_MAPPING(sbi); | ||
216 | const unsigned int clusterpages = BIT(pcl->clusterbits); | ||
217 | int i; | ||
218 | |||
219 | /* | ||
220 | * the refcount of the workgroup is now frozen at 1, | ||
221 | * therefore no need to worry about available decompression users. | ||
222 | */ | ||
223 | for (i = 0; i < clusterpages; ++i) { | ||
224 | struct page *page = pcl->compressed_pages[i]; | ||
225 | |||
226 | if (!page) | ||
227 | continue; | ||
228 | |||
229 | /* block other users from reclaiming or migrating the page */ | ||
230 | if (!trylock_page(page)) | ||
231 | return -EBUSY; | ||
232 | |||
233 | if (unlikely(page->mapping != mapping)) | ||
234 | continue; | ||
235 | |||
236 | /* barrier is implied in the following 'unlock_page' */ | ||
237 | WRITE_ONCE(pcl->compressed_pages[i], NULL); | ||
238 | set_page_private(page, 0); | ||
239 | ClearPagePrivate(page); | ||
240 | |||
241 | unlock_page(page); | ||
242 | put_page(page); | ||
243 | } | ||
244 | return 0; | ||
245 | } | ||
246 | |||
247 | int erofs_try_to_free_cached_page(struct address_space *mapping, | ||
248 | struct page *page) | ||
249 | { | ||
250 | struct z_erofs_pcluster *const pcl = (void *)page_private(page); | ||
251 | const unsigned int clusterpages = BIT(pcl->clusterbits); | ||
252 | int ret = 0; /* 0 - busy */ | ||
253 | |||
254 | if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) { | ||
255 | unsigned int i; | ||
256 | |||
257 | for (i = 0; i < clusterpages; ++i) { | ||
258 | if (pcl->compressed_pages[i] == page) { | ||
259 | WRITE_ONCE(pcl->compressed_pages[i], NULL); | ||
260 | ret = 1; | ||
261 | break; | ||
262 | } | ||
263 | } | ||
264 | erofs_workgroup_unfreeze(&pcl->obj, 1); | ||
265 | |||
266 | if (ret) { | ||
267 | ClearPagePrivate(page); | ||
268 | put_page(page); | ||
269 | } | ||
270 | } | ||
271 | return ret; | ||
272 | } | ||
273 | |||
274 | /* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */ | ||
275 | static inline bool try_inplace_io(struct z_erofs_collector *clt, | ||
276 | struct page *page) | ||
277 | { | ||
278 | struct z_erofs_pcluster *const pcl = clt->pcl; | ||
279 | const unsigned int clusterpages = BIT(pcl->clusterbits); | ||
280 | |||
281 | while (clt->compressedpages < pcl->compressed_pages + clusterpages) { | ||
282 | if (!cmpxchg(clt->compressedpages++, NULL, page)) | ||
283 | return true; | ||
284 | } | ||
285 | return false; | ||
286 | } | ||
287 | |||
288 | /* callers must hold the collection lock */ | ||
289 | static int z_erofs_attach_page(struct z_erofs_collector *clt, | ||
290 | struct page *page, | ||
291 | enum z_erofs_page_type type) | ||
292 | { | ||
293 | int ret; | ||
294 | bool occupied; | ||
295 | |||
296 | /* give priority to in-place I/O */ | ||
297 | if (clt->mode >= COLLECT_PRIMARY && | ||
298 | type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && | ||
299 | try_inplace_io(clt, page)) | ||
300 | return 0; | ||
301 | |||
302 | ret = z_erofs_pagevec_enqueue(&clt->vector, | ||
303 | page, type, &occupied); | ||
304 | clt->cl->vcnt += (unsigned int)ret; | ||
305 | |||
306 | return ret ? 0 : -EAGAIN; | ||
307 | } | ||
308 | |||
309 | static enum z_erofs_collectmode | ||
310 | try_to_claim_pcluster(struct z_erofs_pcluster *pcl, | ||
311 | z_erofs_next_pcluster_t *owned_head) | ||
312 | { | ||
313 | /* let's claim the following types of pclusters */ | ||
314 | retry: | ||
315 | if (pcl->next == Z_EROFS_PCLUSTER_NIL) { | ||
316 | /* type 1, nil pcluster */ | ||
317 | if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL, | ||
318 | *owned_head) != Z_EROFS_PCLUSTER_NIL) | ||
319 | goto retry; | ||
320 | |||
321 | *owned_head = &pcl->next; | ||
322 | /* lucky, I am the followee :) */ | ||
323 | return COLLECT_PRIMARY_FOLLOWED; | ||
324 | } else if (pcl->next == Z_EROFS_PCLUSTER_TAIL) { | ||
325 | /* | ||
326 | * type 2, link to the end of an existing open chain, | ||
327 | * be careful that its submission itself is governed | ||
328 | * by the original owned chain. | ||
329 | */ | ||
330 | if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, | ||
331 | *owned_head) != Z_EROFS_PCLUSTER_TAIL) | ||
332 | goto retry; | ||
333 | *owned_head = Z_EROFS_PCLUSTER_TAIL; | ||
334 | return COLLECT_PRIMARY_HOOKED; | ||
335 | } | ||
336 | return COLLECT_PRIMARY; /* :( better luck next time */ | ||
337 | } | ||
338 | |||
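/*
 * Editor's sketch (not part of the original patch): the claiming above is
 * the classic lock-free "read, prepare, cmpxchg, retry on interference"
 * idiom. Stripped of the pcluster specifics it is the same pattern as a
 * lockless list push; the names below are illustrative only.
 */
struct demo_node {
	struct demo_node *next;
};

static void demo_lockless_push(struct demo_node **head, struct demo_node *n)
{
	struct demo_node *old;

	do {
		old = READ_ONCE(*head);	/* snapshot the current head */
		n->next = old;		/* link the new node behind it */
		/* retry if *head changed under us in the meantime */
	} while (cmpxchg(head, old, n) != old);
}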
339 | static struct z_erofs_collection *cllookup(struct z_erofs_collector *clt, | ||
340 | struct inode *inode, | ||
341 | struct erofs_map_blocks *map) | ||
342 | { | ||
343 | struct erofs_workgroup *grp; | ||
344 | struct z_erofs_pcluster *pcl; | ||
345 | struct z_erofs_collection *cl; | ||
346 | unsigned int length; | ||
347 | bool tag; | ||
348 | |||
349 | grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT, &tag); | ||
350 | if (!grp) | ||
351 | return NULL; | ||
352 | |||
353 | pcl = container_of(grp, struct z_erofs_pcluster, obj); | ||
354 | if (clt->owned_head == &pcl->next || pcl == clt->tailpcl) { | ||
355 | DBG_BUGON(1); | ||
356 | erofs_workgroup_put(grp); | ||
357 | return ERR_PTR(-EFSCORRUPTED); | ||
358 | } | ||
359 | |||
360 | cl = z_erofs_primarycollection(pcl); | ||
361 | if (unlikely(cl->pageofs != (map->m_la & ~PAGE_MASK))) { | ||
362 | DBG_BUGON(1); | ||
363 | erofs_workgroup_put(grp); | ||
364 | return ERR_PTR(-EFSCORRUPTED); | ||
365 | } | ||
366 | |||
367 | length = READ_ONCE(pcl->length); | ||
368 | if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) { | ||
369 | if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) { | ||
370 | DBG_BUGON(1); | ||
371 | erofs_workgroup_put(grp); | ||
372 | return ERR_PTR(-EFSCORRUPTED); | ||
373 | } | ||
374 | } else { | ||
375 | unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT; | ||
376 | |||
377 | if (map->m_flags & EROFS_MAP_FULL_MAPPED) | ||
378 | llen |= Z_EROFS_PCLUSTER_FULL_LENGTH; | ||
379 | |||
380 | while (llen > length && | ||
381 | length != cmpxchg_relaxed(&pcl->length, length, llen)) { | ||
382 | cpu_relax(); | ||
383 | length = READ_ONCE(pcl->length); | ||
384 | } | ||
385 | } | ||
386 | mutex_lock(&cl->lock); | ||
387 | /* used to check tail merging loop due to corrupted images */ | ||
388 | if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) | ||
389 | clt->tailpcl = pcl; | ||
390 | clt->mode = try_to_claim_pcluster(pcl, &clt->owned_head); | ||
391 | /* clean tailpcl if the current owned_head is Z_EROFS_PCLUSTER_TAIL */ | ||
392 | if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) | ||
393 | clt->tailpcl = NULL; | ||
394 | clt->pcl = pcl; | ||
395 | clt->cl = cl; | ||
396 | return cl; | ||
397 | } | ||
398 | |||
399 | static struct z_erofs_collection *clregister(struct z_erofs_collector *clt, | ||
400 | struct inode *inode, | ||
401 | struct erofs_map_blocks *map) | ||
402 | { | ||
403 | struct z_erofs_pcluster *pcl; | ||
404 | struct z_erofs_collection *cl; | ||
405 | int err; | ||
406 | |||
407 | /* no available workgroup, let's allocate one */ | ||
408 | pcl = kmem_cache_alloc(pcluster_cachep, GFP_NOFS); | ||
409 | if (unlikely(!pcl)) | ||
410 | return ERR_PTR(-ENOMEM); | ||
411 | |||
412 | init_always(pcl); | ||
413 | pcl->obj.index = map->m_pa >> PAGE_SHIFT; | ||
414 | |||
415 | pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) | | ||
416 | (map->m_flags & EROFS_MAP_FULL_MAPPED ? | ||
417 | Z_EROFS_PCLUSTER_FULL_LENGTH : 0); | ||
418 | |||
419 | if (map->m_flags & EROFS_MAP_ZIPPED) | ||
420 | pcl->algorithmformat = Z_EROFS_COMPRESSION_LZ4; | ||
421 | else | ||
422 | pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; | ||
423 | |||
424 | pcl->clusterbits = EROFS_V(inode)->z_physical_clusterbits[0]; | ||
425 | pcl->clusterbits -= PAGE_SHIFT; | ||
426 | |||
427 | /* new pclusters should be claimed as type 1, primary and followed */ | ||
428 | pcl->next = clt->owned_head; | ||
429 | clt->mode = COLLECT_PRIMARY_FOLLOWED; | ||
430 | |||
431 | cl = z_erofs_primarycollection(pcl); | ||
432 | cl->pageofs = map->m_la & ~PAGE_MASK; | ||
433 | |||
434 | /* | ||
435 | * lock all primary followed pclusters before making them | ||
436 | * visible to others; mutex_trylock *never* fails for a new pcluster. | ||
437 | */ | ||
438 | mutex_trylock(&cl->lock); | ||
439 | |||
440 | err = erofs_register_workgroup(inode->i_sb, &pcl->obj, 0); | ||
441 | if (err) { | ||
442 | mutex_unlock(&cl->lock); | ||
443 | kmem_cache_free(pcluster_cachep, pcl); | ||
444 | return ERR_PTR(-EAGAIN); | ||
445 | } | ||
446 | /* used to check tail merging loop due to corrupted images */ | ||
447 | if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) | ||
448 | clt->tailpcl = pcl; | ||
449 | clt->owned_head = &pcl->next; | ||
450 | clt->pcl = pcl; | ||
451 | clt->cl = cl; | ||
452 | return cl; | ||
453 | } | ||
454 | |||
455 | static int z_erofs_collector_begin(struct z_erofs_collector *clt, | ||
456 | struct inode *inode, | ||
457 | struct erofs_map_blocks *map) | ||
458 | { | ||
459 | struct z_erofs_collection *cl; | ||
460 | |||
461 | DBG_BUGON(clt->cl); | ||
462 | |||
463 | /* must be Z_EROFS_PCLUSTER_TAIL or point to a previous collection */ | ||
464 | DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_NIL); | ||
465 | DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); | ||
466 | |||
467 | if (!PAGE_ALIGNED(map->m_pa)) { | ||
468 | DBG_BUGON(1); | ||
469 | return -EINVAL; | ||
470 | } | ||
471 | |||
472 | repeat: | ||
473 | cl = cllookup(clt, inode, map); | ||
474 | if (!cl) { | ||
475 | cl = clregister(clt, inode, map); | ||
476 | |||
477 | if (unlikely(cl == ERR_PTR(-EAGAIN))) | ||
478 | goto repeat; | ||
479 | } | ||
480 | |||
481 | if (IS_ERR(cl)) | ||
482 | return PTR_ERR(cl); | ||
483 | |||
484 | z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS, | ||
485 | cl->pagevec, cl->vcnt); | ||
486 | |||
487 | clt->compressedpages = clt->pcl->compressed_pages; | ||
488 | if (clt->mode <= COLLECT_PRIMARY) /* cannot do in-place I/O */ | ||
489 | clt->compressedpages += Z_EROFS_CLUSTER_MAX_PAGES; | ||
490 | return 0; | ||
491 | } | ||
492 | |||
493 | /* | ||
494 | * keep in mind that no referenced pclusters will be freed | ||
495 | * until after an RCU grace period. | ||
496 | */ | ||
497 | static void z_erofs_rcu_callback(struct rcu_head *head) | ||
498 | { | ||
499 | struct z_erofs_collection *const cl = | ||
500 | container_of(head, struct z_erofs_collection, rcu); | ||
501 | |||
502 | kmem_cache_free(pcluster_cachep, | ||
503 | container_of(cl, struct z_erofs_pcluster, | ||
504 | primary_collection)); | ||
505 | } | ||
506 | |||
507 | void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) | ||
508 | { | ||
509 | struct z_erofs_pcluster *const pcl = | ||
510 | container_of(grp, struct z_erofs_pcluster, obj); | ||
511 | struct z_erofs_collection *const cl = z_erofs_primarycollection(pcl); | ||
512 | |||
513 | call_rcu(&cl->rcu, z_erofs_rcu_callback); | ||
514 | } | ||
515 | |||
516 | static void z_erofs_collection_put(struct z_erofs_collection *cl) | ||
517 | { | ||
518 | struct z_erofs_pcluster *const pcl = | ||
519 | container_of(cl, struct z_erofs_pcluster, primary_collection); | ||
520 | |||
521 | erofs_workgroup_put(&pcl->obj); | ||
522 | } | ||
523 | |||
524 | static bool z_erofs_collector_end(struct z_erofs_collector *clt) | ||
525 | { | ||
526 | struct z_erofs_collection *cl = clt->cl; | ||
527 | |||
528 | if (!cl) | ||
529 | return false; | ||
530 | |||
531 | z_erofs_pagevec_ctor_exit(&clt->vector, false); | ||
532 | mutex_unlock(&cl->lock); | ||
533 | |||
534 | /* | ||
534 | * once all pending pages are added, don't hold the collection | ||
535 | * reference any longer if the pcluster isn't hosted by ourselves. | ||
537 | */ | ||
538 | if (clt->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE) | ||
539 | z_erofs_collection_put(cl); | ||
540 | |||
541 | clt->cl = NULL; | ||
542 | return true; | ||
543 | } | ||
544 | |||
545 | static inline struct page *__stagingpage_alloc(struct list_head *pagepool, | ||
546 | gfp_t gfp) | ||
547 | { | ||
548 | struct page *page = erofs_allocpage(pagepool, gfp, true); | ||
549 | |||
550 | page->mapping = Z_EROFS_MAPPING_STAGING; | ||
551 | return page; | ||
552 | } | ||
553 | |||
554 | static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe, | ||
555 | unsigned int cachestrategy, | ||
556 | erofs_off_t la) | ||
557 | { | ||
558 | if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED) | ||
559 | return false; | ||
560 | |||
561 | if (fe->backmost) | ||
562 | return true; | ||
563 | |||
564 | return cachestrategy >= EROFS_ZIP_CACHE_READAROUND && | ||
565 | la < fe->headoffset; | ||
566 | } | ||
567 | |||
568 | static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, | ||
569 | struct page *page, | ||
570 | struct list_head *pagepool) | ||
571 | { | ||
572 | struct inode *const inode = fe->inode; | ||
573 | struct erofs_sb_info *const sbi __maybe_unused = EROFS_I_SB(inode); | ||
574 | struct erofs_map_blocks *const map = &fe->map; | ||
575 | struct z_erofs_collector *const clt = &fe->clt; | ||
576 | const loff_t offset = page_offset(page); | ||
577 | bool tight = (clt->mode >= COLLECT_PRIMARY_HOOKED); | ||
578 | |||
579 | enum z_erofs_cache_alloctype cache_strategy; | ||
580 | enum z_erofs_page_type page_type; | ||
581 | unsigned int cur, end, spiltted, index; | ||
582 | int err = 0; | ||
583 | |||
584 | /* register locked file pages as online pages in pack */ | ||
585 | z_erofs_onlinepage_init(page); | ||
586 | |||
587 | spiltted = 0; | ||
588 | end = PAGE_SIZE; | ||
589 | repeat: | ||
590 | cur = end - 1; | ||
591 | |||
592 | /* lucky, within the range of the current map_blocks */ | ||
593 | if (offset + cur >= map->m_la && | ||
594 | offset + cur < map->m_la + map->m_llen) { | ||
595 | /* didn't get a valid collection previously (very rare) */ | ||
596 | if (!clt->cl) | ||
597 | goto restart_now; | ||
598 | goto hitted; | ||
599 | } | ||
600 | |||
601 | /* go ahead to the next map_blocks extent */ | ||
602 | debugln("%s: [out-of-range] pos %llu", __func__, offset + cur); | ||
603 | |||
604 | if (z_erofs_collector_end(clt)) | ||
605 | fe->backmost = false; | ||
606 | |||
607 | map->m_la = offset + cur; | ||
608 | map->m_llen = 0; | ||
609 | err = z_erofs_map_blocks_iter(inode, map, 0); | ||
610 | if (unlikely(err)) | ||
611 | goto err_out; | ||
612 | |||
613 | restart_now: | ||
614 | if (unlikely(!(map->m_flags & EROFS_MAP_MAPPED))) | ||
615 | goto hitted; | ||
616 | |||
617 | err = z_erofs_collector_begin(clt, inode, map); | ||
618 | if (unlikely(err)) | ||
619 | goto err_out; | ||
620 | |||
621 | /* preload all compressed pages (maybe downgrade role if necessary) */ | ||
622 | if (should_alloc_managed_pages(fe, sbi->cache_strategy, map->m_la)) | ||
623 | cache_strategy = DELAYEDALLOC; | ||
624 | else | ||
625 | cache_strategy = DONTALLOC; | ||
626 | |||
627 | preload_compressed_pages(clt, MNGD_MAPPING(sbi), | ||
628 | cache_strategy, pagepool); | ||
629 | |||
630 | tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED); | ||
631 | hitted: | ||
632 | cur = end - min_t(unsigned int, offset + end - map->m_la, end); | ||
633 | if (unlikely(!(map->m_flags & EROFS_MAP_MAPPED))) { | ||
634 | zero_user_segment(page, cur, end); | ||
635 | goto next_part; | ||
636 | } | ||
637 | |||
638 | /* let's derive page type */ | ||
639 | page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD : | ||
640 | (!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE : | ||
641 | (tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE : | ||
642 | Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED)); | ||
643 | |||
644 | if (cur) | ||
645 | tight &= (clt->mode >= COLLECT_PRIMARY_FOLLOWED); | ||
646 | |||
647 | retry: | ||
648 | err = z_erofs_attach_page(clt, page, page_type); | ||
649 | /* should allocate an additional staging page for pagevec */ | ||
650 | if (err == -EAGAIN) { | ||
651 | struct page *const newpage = | ||
652 | __stagingpage_alloc(pagepool, GFP_NOFS); | ||
653 | |||
654 | err = z_erofs_attach_page(clt, newpage, | ||
655 | Z_EROFS_PAGE_TYPE_EXCLUSIVE); | ||
656 | if (likely(!err)) | ||
657 | goto retry; | ||
658 | } | ||
659 | |||
660 | if (unlikely(err)) | ||
661 | goto err_out; | ||
662 | |||
663 | index = page->index - (map->m_la >> PAGE_SHIFT); | ||
664 | |||
665 | z_erofs_onlinepage_fixup(page, index, true); | ||
666 | |||
667 | /* bump up the number of spiltted parts of a page */ | ||
668 | ++spiltted; | ||
669 | /* also update nr_pages */ | ||
670 | clt->cl->nr_pages = max_t(pgoff_t, clt->cl->nr_pages, index + 1); | ||
671 | next_part: | ||
672 | /* can be used for verification */ | ||
673 | map->m_llen = offset + cur - map->m_la; | ||
674 | |||
675 | end = cur; | ||
676 | if (end > 0) | ||
677 | goto repeat; | ||
678 | |||
679 | out: | ||
680 | z_erofs_onlinepage_endio(page); | ||
681 | |||
682 | debugln("%s, finish page: %pK spiltted: %u map->m_llen %llu", | ||
683 | __func__, page, spiltted, map->m_llen); | ||
684 | return err; | ||
685 | |||
686 | /* if some error occurred while processing this page */ | ||
687 | err_out: | ||
688 | SetPageError(page); | ||
689 | goto out; | ||
690 | } | ||
691 | |||
692 | static void z_erofs_vle_unzip_kickoff(void *ptr, int bios) | ||
693 | { | ||
694 | tagptr1_t t = tagptr_init(tagptr1_t, ptr); | ||
695 | struct z_erofs_unzip_io *io = tagptr_unfold_ptr(t); | ||
696 | bool background = tagptr_unfold_tags(t); | ||
697 | |||
698 | if (!background) { | ||
699 | unsigned long flags; | ||
700 | |||
701 | spin_lock_irqsave(&io->u.wait.lock, flags); | ||
702 | if (!atomic_add_return(bios, &io->pending_bios)) | ||
703 | wake_up_locked(&io->u.wait); | ||
704 | spin_unlock_irqrestore(&io->u.wait.lock, flags); | ||
705 | return; | ||
706 | } | ||
707 | |||
708 | if (!atomic_add_return(bios, &io->pending_bios)) | ||
709 | queue_work(z_erofs_workqueue, &io->u.work); | ||
710 | } | ||
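The bi_private value decoded above is a tagged pointer built by jobqueueset_init() further down: the address of the unzip descriptor with the background flag folded into its low bit. A rough equivalent of that folding, assuming the descriptor is at least 2-byte aligned (the in-tree code uses the tagptr_{fold,unfold_*}() helpers from tagptr.h):

/* illustrative only; references struct z_erofs_unzip_io declared in zdata.h */
static inline void *fold_io_tag(struct z_erofs_unzip_io *io, bool background)
{
	return (void *)((unsigned long)io | background);
}

static inline struct z_erofs_unzip_io *unfold_io_ptr(void *bi_private)
{
	return (struct z_erofs_unzip_io *)((unsigned long)bi_private & ~1UL);
}

static inline bool unfold_io_tag(void *bi_private)
{
	return (unsigned long)bi_private & 1;
}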
711 | |||
712 | static inline void z_erofs_vle_read_endio(struct bio *bio) | ||
713 | { | ||
714 | struct erofs_sb_info *sbi = NULL; | ||
715 | blk_status_t err = bio->bi_status; | ||
716 | struct bio_vec *bvec; | ||
717 | struct bvec_iter_all iter_all; | ||
718 | |||
719 | bio_for_each_segment_all(bvec, bio, iter_all) { | ||
720 | struct page *page = bvec->bv_page; | ||
721 | bool cachemngd = false; | ||
722 | |||
723 | DBG_BUGON(PageUptodate(page)); | ||
724 | DBG_BUGON(!page->mapping); | ||
725 | |||
726 | if (unlikely(!sbi && !z_erofs_page_is_staging(page))) { | ||
727 | sbi = EROFS_SB(page->mapping->host->i_sb); | ||
728 | |||
729 | if (time_to_inject(sbi, FAULT_READ_IO)) { | ||
730 | erofs_show_injection_info(FAULT_READ_IO); | ||
731 | err = BLK_STS_IOERR; | ||
732 | } | ||
733 | } | ||
734 | |||
735 | /* sbi should have been fetched already if the page is managed */ | ||
736 | if (sbi) | ||
737 | cachemngd = erofs_page_is_managed(sbi, page); | ||
738 | |||
739 | if (unlikely(err)) | ||
740 | SetPageError(page); | ||
741 | else if (cachemngd) | ||
742 | SetPageUptodate(page); | ||
743 | |||
744 | if (cachemngd) | ||
745 | unlock_page(page); | ||
746 | } | ||
747 | |||
748 | z_erofs_vle_unzip_kickoff(bio->bi_private, -1); | ||
749 | bio_put(bio); | ||
750 | } | ||
751 | |||
752 | static int z_erofs_decompress_pcluster(struct super_block *sb, | ||
753 | struct z_erofs_pcluster *pcl, | ||
754 | struct list_head *pagepool) | ||
755 | { | ||
756 | struct erofs_sb_info *const sbi = EROFS_SB(sb); | ||
757 | const unsigned int clusterpages = BIT(pcl->clusterbits); | ||
758 | struct z_erofs_pagevec_ctor ctor; | ||
759 | unsigned int i, outputsize, llen, nr_pages; | ||
760 | struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES]; | ||
761 | struct page **pages, **compressed_pages, *page; | ||
762 | |||
763 | enum z_erofs_page_type page_type; | ||
764 | bool overlapped, partial; | ||
765 | struct z_erofs_collection *cl; | ||
766 | int err; | ||
767 | |||
768 | might_sleep(); | ||
769 | cl = z_erofs_primarycollection(pcl); | ||
770 | DBG_BUGON(!READ_ONCE(cl->nr_pages)); | ||
771 | |||
772 | mutex_lock(&cl->lock); | ||
773 | nr_pages = cl->nr_pages; | ||
774 | |||
775 | if (likely(nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES)) { | ||
776 | pages = pages_onstack; | ||
777 | } else if (nr_pages <= Z_EROFS_VMAP_GLOBAL_PAGES && | ||
778 | mutex_trylock(&z_pagemap_global_lock)) { | ||
779 | pages = z_pagemap_global; | ||
780 | } else { | ||
781 | gfp_t gfp_flags = GFP_KERNEL; | ||
782 | |||
783 | if (nr_pages > Z_EROFS_VMAP_GLOBAL_PAGES) | ||
784 | gfp_flags |= __GFP_NOFAIL; | ||
785 | |||
786 | pages = kvmalloc_array(nr_pages, sizeof(struct page *), | ||
787 | gfp_flags); | ||
788 | |||
789 | /* fallback to global pagemap for the lowmem scenario */ | ||
790 | if (unlikely(!pages)) { | ||
791 | mutex_lock(&z_pagemap_global_lock); | ||
792 | pages = z_pagemap_global; | ||
793 | } | ||
794 | } | ||
795 | |||
796 | for (i = 0; i < nr_pages; ++i) | ||
797 | pages[i] = NULL; | ||
798 | |||
799 | err = 0; | ||
800 | z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS, | ||
801 | cl->pagevec, 0); | ||
802 | |||
803 | for (i = 0; i < cl->vcnt; ++i) { | ||
804 | unsigned int pagenr; | ||
805 | |||
806 | page = z_erofs_pagevec_dequeue(&ctor, &page_type); | ||
807 | |||
808 | /* all pages in pagevec ought to be valid */ | ||
809 | DBG_BUGON(!page); | ||
810 | DBG_BUGON(!page->mapping); | ||
811 | |||
812 | if (z_erofs_put_stagingpage(pagepool, page)) | ||
813 | continue; | ||
814 | |||
815 | if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD) | ||
816 | pagenr = 0; | ||
817 | else | ||
818 | pagenr = z_erofs_onlinepage_index(page); | ||
819 | |||
820 | DBG_BUGON(pagenr >= nr_pages); | ||
821 | |||
822 | /* | ||
823 | * currently EROFS doesn't support multiref (deduplication), | ||
824 | * so error out if a multiref page is found. | ||
825 | */ | ||
826 | if (unlikely(pages[pagenr])) { | ||
827 | DBG_BUGON(1); | ||
828 | SetPageError(pages[pagenr]); | ||
829 | z_erofs_onlinepage_endio(pages[pagenr]); | ||
830 | err = -EFSCORRUPTED; | ||
831 | } | ||
832 | pages[pagenr] = page; | ||
833 | } | ||
834 | z_erofs_pagevec_ctor_exit(&ctor, true); | ||
835 | |||
836 | overlapped = false; | ||
837 | compressed_pages = pcl->compressed_pages; | ||
838 | |||
839 | for (i = 0; i < clusterpages; ++i) { | ||
840 | unsigned int pagenr; | ||
841 | |||
842 | page = compressed_pages[i]; | ||
843 | |||
844 | /* all compressed pages ought to be valid */ | ||
845 | DBG_BUGON(!page); | ||
846 | DBG_BUGON(!page->mapping); | ||
847 | |||
848 | if (!z_erofs_page_is_staging(page)) { | ||
849 | if (erofs_page_is_managed(sbi, page)) { | ||
850 | if (unlikely(!PageUptodate(page))) | ||
851 | err = -EIO; | ||
852 | continue; | ||
853 | } | ||
854 | |||
855 | /* | ||
856 | * only non-head pages can be selected | ||
857 | * for in-place decompression | ||
858 | */ | ||
859 | pagenr = z_erofs_onlinepage_index(page); | ||
860 | |||
861 | DBG_BUGON(pagenr >= nr_pages); | ||
862 | if (unlikely(pages[pagenr])) { | ||
863 | DBG_BUGON(1); | ||
864 | SetPageError(pages[pagenr]); | ||
865 | z_erofs_onlinepage_endio(pages[pagenr]); | ||
866 | err = -EFSCORRUPTED; | ||
867 | } | ||
868 | pages[pagenr] = page; | ||
869 | |||
870 | overlapped = true; | ||
871 | } | ||
872 | |||
873 | /* PG_error needs checking for in-place and staging pages */ | ||
874 | if (unlikely(PageError(page))) { | ||
875 | DBG_BUGON(PageUptodate(page)); | ||
876 | err = -EIO; | ||
877 | } | ||
878 | } | ||
879 | |||
880 | if (unlikely(err)) | ||
881 | goto out; | ||
882 | |||
883 | llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT; | ||
884 | if (nr_pages << PAGE_SHIFT >= cl->pageofs + llen) { | ||
885 | outputsize = llen; | ||
886 | partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH); | ||
887 | } else { | ||
888 | outputsize = (nr_pages << PAGE_SHIFT) - cl->pageofs; | ||
889 | partial = true; | ||
890 | } | ||
891 | |||
892 | err = z_erofs_decompress(&(struct z_erofs_decompress_req) { | ||
893 | .sb = sb, | ||
894 | .in = compressed_pages, | ||
895 | .out = pages, | ||
896 | .pageofs_out = cl->pageofs, | ||
897 | .inputsize = PAGE_SIZE, | ||
898 | .outputsize = outputsize, | ||
899 | .alg = pcl->algorithmformat, | ||
900 | .inplace_io = overlapped, | ||
901 | .partial_decoding = partial | ||
902 | }, pagepool); | ||
903 | |||
904 | out: | ||
905 | /* must handle all compressed pages before ending the file pages */ | ||
906 | for (i = 0; i < clusterpages; ++i) { | ||
907 | page = compressed_pages[i]; | ||
908 | |||
909 | if (erofs_page_is_managed(sbi, page)) | ||
910 | continue; | ||
911 | |||
912 | /* recycle all individual staging pages */ | ||
913 | (void)z_erofs_put_stagingpage(pagepool, page); | ||
914 | |||
915 | WRITE_ONCE(compressed_pages[i], NULL); | ||
916 | } | ||
917 | |||
918 | for (i = 0; i < nr_pages; ++i) { | ||
919 | page = pages[i]; | ||
920 | if (!page) | ||
921 | continue; | ||
922 | |||
923 | DBG_BUGON(!page->mapping); | ||
924 | |||
925 | /* recycle all individual staging pages */ | ||
926 | if (z_erofs_put_stagingpage(pagepool, page)) | ||
927 | continue; | ||
928 | |||
929 | if (unlikely(err < 0)) | ||
930 | SetPageError(page); | ||
931 | |||
932 | z_erofs_onlinepage_endio(page); | ||
933 | } | ||
934 | |||
935 | if (pages == z_pagemap_global) | ||
936 | mutex_unlock(&z_pagemap_global_lock); | ||
937 | else if (unlikely(pages != pages_onstack)) | ||
938 | kvfree(pages); | ||
939 | |||
940 | cl->nr_pages = 0; | ||
941 | cl->vcnt = 0; | ||
942 | |||
943 | /* all cl locks MUST be taken before the following line */ | ||
944 | WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL); | ||
945 | |||
946 | /* all cl locks SHOULD be released right now */ | ||
947 | mutex_unlock(&cl->lock); | ||
948 | |||
949 | z_erofs_collection_put(cl); | ||
950 | return err; | ||
951 | } | ||
952 | |||
953 | static void z_erofs_vle_unzip_all(struct super_block *sb, | ||
954 | struct z_erofs_unzip_io *io, | ||
955 | struct list_head *pagepool) | ||
956 | { | ||
957 | z_erofs_next_pcluster_t owned = io->head; | ||
958 | |||
959 | while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) { | ||
960 | struct z_erofs_pcluster *pcl; | ||
961 | |||
962 | /* impossible that 'owned' equals Z_EROFS_PCLUSTER_TAIL */ | ||
963 | DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL); | ||
964 | |||
965 | /* impossible that 'owned' equals NULL (Z_EROFS_PCLUSTER_NIL) */ | ||
966 | DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL); | ||
967 | |||
968 | pcl = container_of(owned, struct z_erofs_pcluster, next); | ||
969 | owned = READ_ONCE(pcl->next); | ||
970 | |||
971 | z_erofs_decompress_pcluster(sb, pcl, pagepool); | ||
972 | } | ||
973 | } | ||
974 | |||
975 | static void z_erofs_vle_unzip_wq(struct work_struct *work) | ||
976 | { | ||
977 | struct z_erofs_unzip_io_sb *iosb = | ||
978 | container_of(work, struct z_erofs_unzip_io_sb, io.u.work); | ||
979 | LIST_HEAD(pagepool); | ||
980 | |||
981 | DBG_BUGON(iosb->io.head == Z_EROFS_PCLUSTER_TAIL_CLOSED); | ||
982 | z_erofs_vle_unzip_all(iosb->sb, &iosb->io, &pagepool); | ||
983 | |||
984 | put_pages_list(&pagepool); | ||
985 | kvfree(iosb); | ||
986 | } | ||
987 | |||
988 | static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, | ||
989 | unsigned int nr, | ||
990 | struct list_head *pagepool, | ||
991 | struct address_space *mc, | ||
992 | gfp_t gfp) | ||
993 | { | ||
994 | /* determined at compile time to avoid too many #ifdefs */ | ||
995 | const bool nocache = __builtin_constant_p(mc) ? !mc : false; | ||
996 | const pgoff_t index = pcl->obj.index; | ||
997 | bool tocache = false; | ||
998 | |||
999 | struct address_space *mapping; | ||
1000 | struct page *oldpage, *page; | ||
1001 | |||
1002 | compressed_page_t t; | ||
1003 | int justfound; | ||
1004 | |||
1005 | repeat: | ||
1006 | page = READ_ONCE(pcl->compressed_pages[nr]); | ||
1007 | oldpage = page; | ||
1008 | |||
1009 | if (!page) | ||
1010 | goto out_allocpage; | ||
1011 | |||
1012 | /* | ||
1013 | * the cached page has not been allocated yet and | ||
1014 | * a placeholder is out there, prepare it now. | ||
1015 | */ | ||
1016 | if (!nocache && page == PAGE_UNALLOCATED) { | ||
1017 | tocache = true; | ||
1018 | goto out_allocpage; | ||
1019 | } | ||
1020 | |||
1021 | /* process the target tagged pointer */ | ||
1022 | t = tagptr_init(compressed_page_t, page); | ||
1023 | justfound = tagptr_unfold_tags(t); | ||
1024 | page = tagptr_unfold_ptr(t); | ||
1025 | |||
1026 | mapping = READ_ONCE(page->mapping); | ||
1027 | |||
1028 | /* | ||
1029 | * if managed cache is disabled, there is no way to | ||
1030 | * get such a cached-like page. | ||
1031 | */ | ||
1032 | if (nocache) { | ||
1033 | /* if managed cache is disabled, `justfound' can never be set */ | ||
1034 | DBG_BUGON(justfound); | ||
1035 | |||
1036 | /* and it should be locked, not uptodate, and not truncated */ | ||
1037 | DBG_BUGON(!PageLocked(page)); | ||
1038 | DBG_BUGON(PageUptodate(page)); | ||
1039 | DBG_BUGON(!mapping); | ||
1040 | goto out; | ||
1041 | } | ||
1042 | |||
1043 | /* | ||
1044 | * unmanaged (file) pages are all locked solidly, | ||
1045 | * therefore it is impossible for `mapping' to be NULL. | ||
1046 | */ | ||
1047 | if (mapping && mapping != mc) | ||
1048 | /* ought to be unmanaged pages */ | ||
1049 | goto out; | ||
1050 | |||
1051 | lock_page(page); | ||
1052 | |||
1053 | /* only true if page reclaim goes wrong, should never happen */ | ||
1054 | DBG_BUGON(justfound && PagePrivate(page)); | ||
1055 | |||
1056 | /* the page is still in the managed cache */ | ||
1057 | if (page->mapping == mc) { | ||
1058 | WRITE_ONCE(pcl->compressed_pages[nr], page); | ||
1059 | |||
1060 | ClearPageError(page); | ||
1061 | if (!PagePrivate(page)) { | ||
1062 | /* | ||
1063 | * under the current restriction, a page already | ||
1064 | * recorded in compressed_pages[] cannot be | ||
1065 | * !PagePrivate(page) unless it was just found. | ||
1066 | */ | ||
1067 | DBG_BUGON(!justfound); | ||
1068 | |||
1069 | justfound = 0; | ||
1070 | set_page_private(page, (unsigned long)pcl); | ||
1071 | SetPagePrivate(page); | ||
1072 | } | ||
1073 | |||
1074 | /* no need to submit io if it is already up-to-date */ | ||
1075 | if (PageUptodate(page)) { | ||
1076 | unlock_page(page); | ||
1077 | page = NULL; | ||
1078 | } | ||
1079 | goto out; | ||
1080 | } | ||
1081 | |||
1082 | /* | ||
1083 | * the managed page has been truncated, so it's unsafe to | ||
1084 | * reuse this one; let's allocate a new cache-managed page. | ||
1085 | */ | ||
1086 | DBG_BUGON(page->mapping); | ||
1087 | DBG_BUGON(!justfound); | ||
1088 | |||
1089 | tocache = true; | ||
1090 | unlock_page(page); | ||
1091 | put_page(page); | ||
1092 | out_allocpage: | ||
1093 | page = __stagingpage_alloc(pagepool, gfp); | ||
1094 | if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) { | ||
1095 | list_add(&page->lru, pagepool); | ||
1096 | cpu_relax(); | ||
1097 | goto repeat; | ||
1098 | } | ||
1099 | if (nocache || !tocache) | ||
1100 | goto out; | ||
1101 | if (add_to_page_cache_lru(page, mc, index + nr, gfp)) { | ||
1102 | page->mapping = Z_EROFS_MAPPING_STAGING; | ||
1103 | goto out; | ||
1104 | } | ||
1105 | |||
1106 | set_page_private(page, (unsigned long)pcl); | ||
1107 | SetPagePrivate(page); | ||
1108 | out: /* the only exit (for tracing and debugging) */ | ||
1109 | return page; | ||
1110 | } | ||
1111 | |||
1112 | static struct z_erofs_unzip_io *jobqueue_init(struct super_block *sb, | ||
1113 | struct z_erofs_unzip_io *io, | ||
1114 | bool foreground) | ||
1115 | { | ||
1116 | struct z_erofs_unzip_io_sb *iosb; | ||
1117 | |||
1118 | if (foreground) { | ||
1119 | /* waitqueue available for foreground io */ | ||
1120 | DBG_BUGON(!io); | ||
1121 | |||
1122 | init_waitqueue_head(&io->u.wait); | ||
1123 | atomic_set(&io->pending_bios, 0); | ||
1124 | goto out; | ||
1125 | } | ||
1126 | |||
1127 | iosb = kvzalloc(sizeof(*iosb), GFP_KERNEL | __GFP_NOFAIL); | ||
1128 | DBG_BUGON(!iosb); | ||
1129 | |||
1130 | /* initialize fields in the allocated descriptor */ | ||
1131 | io = &iosb->io; | ||
1132 | iosb->sb = sb; | ||
1133 | INIT_WORK(&io->u.work, z_erofs_vle_unzip_wq); | ||
1134 | out: | ||
1135 | io->head = Z_EROFS_PCLUSTER_TAIL_CLOSED; | ||
1136 | return io; | ||
1137 | } | ||
1138 | |||
1139 | /* define decompression jobqueue types */ | ||
1140 | enum { | ||
1141 | JQ_BYPASS, | ||
1142 | JQ_SUBMIT, | ||
1143 | NR_JOBQUEUES, | ||
1144 | }; | ||
1145 | |||
1146 | static void *jobqueueset_init(struct super_block *sb, | ||
1147 | z_erofs_next_pcluster_t qtail[], | ||
1148 | struct z_erofs_unzip_io *q[], | ||
1149 | struct z_erofs_unzip_io *fgq, | ||
1150 | bool forcefg) | ||
1151 | { | ||
1152 | /* | ||
1153 | * if managed cache is enabled, a bypass jobqueue is needed: | ||
1154 | * pclusters in this queue need no reads from the device at all. | ||
1155 | */ | ||
1156 | q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, true); | ||
1157 | qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; | ||
1158 | |||
1159 | q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, forcefg); | ||
1160 | qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head; | ||
1161 | |||
1162 | return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], !forcefg)); | ||
1163 | } | ||
1164 | |||
1165 | static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, | ||
1166 | z_erofs_next_pcluster_t qtail[], | ||
1167 | z_erofs_next_pcluster_t owned_head) | ||
1168 | { | ||
1169 | z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT]; | ||
1170 | z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS]; | ||
1171 | |||
1172 | DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); | ||
1173 | if (owned_head == Z_EROFS_PCLUSTER_TAIL) | ||
1174 | owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED; | ||
1175 | |||
1176 | WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED); | ||
1177 | |||
1178 | WRITE_ONCE(*submit_qtail, owned_head); | ||
1179 | WRITE_ONCE(*bypass_qtail, &pcl->next); | ||
1180 | |||
1181 | qtail[JQ_BYPASS] = &pcl->next; | ||
1182 | } | ||
1183 | |||
1184 | static bool postsubmit_is_all_bypassed(struct z_erofs_unzip_io *q[], | ||
1185 | unsigned int nr_bios, | ||
1186 | bool force_fg) | ||
1187 | { | ||
1188 | /* | ||
1189 | * although background decompression is preferred, nothing is pending | ||
1190 | * for submission; don't queue the work, just drop the jobqueue instead. | ||
1191 | */ | ||
1192 | if (force_fg || nr_bios) | ||
1193 | return false; | ||
1194 | |||
1195 | kvfree(container_of(q[JQ_SUBMIT], struct z_erofs_unzip_io_sb, io)); | ||
1196 | return true; | ||
1197 | } | ||
1198 | |||
1199 | static bool z_erofs_vle_submit_all(struct super_block *sb, | ||
1200 | z_erofs_next_pcluster_t owned_head, | ||
1201 | struct list_head *pagepool, | ||
1202 | struct z_erofs_unzip_io *fgq, | ||
1203 | bool force_fg) | ||
1204 | { | ||
1205 | struct erofs_sb_info *const sbi __maybe_unused = EROFS_SB(sb); | ||
1206 | z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; | ||
1207 | struct z_erofs_unzip_io *q[NR_JOBQUEUES]; | ||
1208 | struct bio *bio; | ||
1209 | void *bi_private; | ||
1210 | /* since bio will be NULL, no need to initialize last_index */ | ||
1211 | pgoff_t uninitialized_var(last_index); | ||
1212 | bool force_submit = false; | ||
1213 | unsigned int nr_bios; | ||
1214 | |||
1215 | if (unlikely(owned_head == Z_EROFS_PCLUSTER_TAIL)) | ||
1216 | return false; | ||
1217 | |||
1218 | force_submit = false; | ||
1219 | bio = NULL; | ||
1220 | nr_bios = 0; | ||
1221 | bi_private = jobqueueset_init(sb, qtail, q, fgq, force_fg); | ||
1222 | |||
1223 | /* by default, all need io submission */ | ||
1224 | q[JQ_SUBMIT]->head = owned_head; | ||
1225 | |||
1226 | do { | ||
1227 | struct z_erofs_pcluster *pcl; | ||
1228 | unsigned int clusterpages; | ||
1229 | pgoff_t first_index; | ||
1230 | struct page *page; | ||
1231 | unsigned int i = 0, bypass = 0; | ||
1232 | int err; | ||
1233 | |||
1234 | /* 'owned_head' can never equal either of the following */ | ||
1235 | DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); | ||
1236 | DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL); | ||
1237 | |||
1238 | pcl = container_of(owned_head, struct z_erofs_pcluster, next); | ||
1239 | |||
1240 | clusterpages = BIT(pcl->clusterbits); | ||
1241 | |||
1242 | /* close the main owned chain at first */ | ||
1243 | owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, | ||
1244 | Z_EROFS_PCLUSTER_TAIL_CLOSED); | ||
1245 | |||
1246 | first_index = pcl->obj.index; | ||
1247 | force_submit |= (first_index != last_index + 1); | ||
1248 | |||
1249 | repeat: | ||
1250 | page = pickup_page_for_submission(pcl, i, pagepool, | ||
1251 | MNGD_MAPPING(sbi), | ||
1252 | GFP_NOFS); | ||
1253 | if (!page) { | ||
1254 | force_submit = true; | ||
1255 | ++bypass; | ||
1256 | goto skippage; | ||
1257 | } | ||
1258 | |||
1259 | if (bio && force_submit) { | ||
1260 | submit_bio_retry: | ||
1261 | __submit_bio(bio, REQ_OP_READ, 0); | ||
1262 | bio = NULL; | ||
1263 | } | ||
1264 | |||
1265 | if (!bio) { | ||
1266 | bio = erofs_grab_bio(sb, first_index + i, | ||
1267 | BIO_MAX_PAGES, bi_private, | ||
1268 | z_erofs_vle_read_endio, true); | ||
1269 | ++nr_bios; | ||
1270 | } | ||
1271 | |||
1272 | err = bio_add_page(bio, page, PAGE_SIZE, 0); | ||
1273 | if (err < PAGE_SIZE) | ||
1274 | goto submit_bio_retry; | ||
1275 | |||
1276 | force_submit = false; | ||
1277 | last_index = first_index + i; | ||
1278 | skippage: | ||
1279 | if (++i < clusterpages) | ||
1280 | goto repeat; | ||
1281 | |||
1282 | if (bypass < clusterpages) | ||
1283 | qtail[JQ_SUBMIT] = &pcl->next; | ||
1284 | else | ||
1285 | move_to_bypass_jobqueue(pcl, qtail, owned_head); | ||
1286 | } while (owned_head != Z_EROFS_PCLUSTER_TAIL); | ||
1287 | |||
1288 | if (bio) | ||
1289 | __submit_bio(bio, REQ_OP_READ, 0); | ||
1290 | |||
1291 | if (postsubmit_is_all_bypassed(q, nr_bios, force_fg)) | ||
1292 | return true; | ||
1293 | |||
1294 | z_erofs_vle_unzip_kickoff(bi_private, nr_bios); | ||
1295 | return true; | ||
1296 | } | ||
1297 | |||
1298 | static void z_erofs_submit_and_unzip(struct super_block *sb, | ||
1299 | struct z_erofs_collector *clt, | ||
1300 | struct list_head *pagepool, | ||
1301 | bool force_fg) | ||
1302 | { | ||
1303 | struct z_erofs_unzip_io io[NR_JOBQUEUES]; | ||
1304 | |||
1305 | if (!z_erofs_vle_submit_all(sb, clt->owned_head, | ||
1306 | pagepool, io, force_fg)) | ||
1307 | return; | ||
1308 | |||
1309 | /* decompress no-I/O (bypassed) pclusters immediately */ | ||
1310 | z_erofs_vle_unzip_all(sb, &io[JQ_BYPASS], pagepool); | ||
1311 | |||
1312 | if (!force_fg) | ||
1313 | return; | ||
1314 | |||
1315 | /* wait until all bios are completed */ | ||
1316 | wait_event(io[JQ_SUBMIT].u.wait, | ||
1317 | !atomic_read(&io[JQ_SUBMIT].pending_bios)); | ||
1318 | |||
1319 | /* let's do synchronous decompression */ | ||
1320 | z_erofs_vle_unzip_all(sb, &io[JQ_SUBMIT], pagepool); | ||
1321 | } | ||
1322 | |||
1323 | static int z_erofs_vle_normalaccess_readpage(struct file *file, | ||
1324 | struct page *page) | ||
1325 | { | ||
1326 | struct inode *const inode = page->mapping->host; | ||
1327 | struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); | ||
1328 | int err; | ||
1329 | LIST_HEAD(pagepool); | ||
1330 | |||
1331 | trace_erofs_readpage(page, false); | ||
1332 | |||
1333 | f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; | ||
1334 | |||
1335 | err = z_erofs_do_read_page(&f, page, &pagepool); | ||
1336 | (void)z_erofs_collector_end(&f.clt); | ||
1337 | |||
1338 | /* if some compressed clusters are ready, submit them anyway */ | ||
1339 | z_erofs_submit_and_unzip(inode->i_sb, &f.clt, &pagepool, true); | ||
1340 | |||
1341 | if (err) | ||
1342 | errln("%s, failed to read, err [%d]", __func__, err); | ||
1343 | |||
1344 | if (f.map.mpage) | ||
1345 | put_page(f.map.mpage); | ||
1346 | |||
1347 | /* clean up the remaining free pages */ | ||
1348 | put_pages_list(&pagepool); | ||
1349 | return err; | ||
1350 | } | ||
1351 | |||
1352 | static bool should_decompress_synchronously(struct erofs_sb_info *sbi, | ||
1353 | unsigned int nr) | ||
1354 | { | ||
1355 | return nr <= sbi->max_sync_decompress_pages; | ||
1356 | } | ||
1357 | |||
1358 | static int z_erofs_vle_normalaccess_readpages(struct file *filp, | ||
1359 | struct address_space *mapping, | ||
1360 | struct list_head *pages, | ||
1361 | unsigned int nr_pages) | ||
1362 | { | ||
1363 | struct inode *const inode = mapping->host; | ||
1364 | struct erofs_sb_info *const sbi = EROFS_I_SB(inode); | ||
1365 | |||
1366 | bool sync = should_decompress_synchronously(sbi, nr_pages); | ||
1367 | struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); | ||
1368 | gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); | ||
1369 | struct page *head = NULL; | ||
1370 | LIST_HEAD(pagepool); | ||
1371 | |||
1372 | trace_erofs_readpages(mapping->host, lru_to_page(pages), | ||
1373 | nr_pages, false); | ||
1374 | |||
1375 | f.headoffset = (erofs_off_t)lru_to_page(pages)->index << PAGE_SHIFT; | ||
1376 | |||
1377 | for (; nr_pages; --nr_pages) { | ||
1378 | struct page *page = lru_to_page(pages); | ||
1379 | |||
1380 | prefetchw(&page->flags); | ||
1381 | list_del(&page->lru); | ||
1382 | |||
1383 | /* | ||
1384 | * A pure asynchronous readahead is indicated if | ||
1385 | * a PG_readahead marked page is hit first. | ||
1386 | * Let's also do asynchronous decompression for this case. | ||
1387 | */ | ||
1388 | sync &= !(PageReadahead(page) && !head); | ||
1389 | |||
1390 | if (add_to_page_cache_lru(page, mapping, page->index, gfp)) { | ||
1391 | list_add(&page->lru, &pagepool); | ||
1392 | continue; | ||
1393 | } | ||
1394 | |||
1395 | set_page_private(page, (unsigned long)head); | ||
1396 | head = page; | ||
1397 | } | ||
1398 | |||
1399 | while (head) { | ||
1400 | struct page *page = head; | ||
1401 | int err; | ||
1402 | |||
1403 | /* traversal in reverse order */ | ||
1404 | head = (void *)page_private(page); | ||
1405 | |||
1406 | err = z_erofs_do_read_page(&f, page, &pagepool); | ||
1407 | if (err) { | ||
1408 | struct erofs_vnode *vi = EROFS_V(inode); | ||
1409 | |||
1410 | errln("%s, readahead error at page %lu of nid %llu", | ||
1411 | __func__, page->index, vi->nid); | ||
1412 | } | ||
1413 | put_page(page); | ||
1414 | } | ||
1415 | |||
1416 | (void)z_erofs_collector_end(&f.clt); | ||
1417 | |||
1418 | z_erofs_submit_and_unzip(inode->i_sb, &f.clt, &pagepool, sync); | ||
1419 | |||
1420 | if (f.map.mpage) | ||
1421 | put_page(f.map.mpage); | ||
1422 | |||
1423 | /* clean up the remaining free pages */ | ||
1424 | put_pages_list(&pagepool); | ||
1425 | return 0; | ||
1426 | } | ||
1427 | |||
1428 | const struct address_space_operations z_erofs_vle_normalaccess_aops = { | ||
1429 | .readpage = z_erofs_vle_normalaccess_readpage, | ||
1430 | .readpages = z_erofs_vle_normalaccess_readpages, | ||
1431 | }; | ||
1432 | |||
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h new file mode 100644 index 000000000000..4fc547bc01f9 --- /dev/null +++ b/fs/erofs/zdata.h | |||
@@ -0,0 +1,193 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0-only */ | ||
2 | /* | ||
3 | * Copyright (C) 2018 HUAWEI, Inc. | ||
4 | * http://www.huawei.com/ | ||
5 | * Created by Gao Xiang <gaoxiang25@huawei.com> | ||
6 | */ | ||
7 | #ifndef __EROFS_FS_ZDATA_H | ||
8 | #define __EROFS_FS_ZDATA_H | ||
9 | |||
10 | #include "internal.h" | ||
11 | #include "zpvec.h" | ||
12 | |||
13 | #define Z_EROFS_NR_INLINE_PAGEVECS 3 | ||
14 | |||
15 | /* | ||
16 | * Structure fields follow one of the following exclusion rules. | ||
17 | * | ||
18 | * I: Modifiable by initialization/destruction paths and read-only | ||
19 | * for everyone else; | ||
20 | * | ||
21 | * L: Field should be protected by pageset lock; | ||
22 | * | ||
23 | * A: Field should be accessed / updated in atomic for parallelized code. | ||
24 | */ | ||
25 | struct z_erofs_collection { | ||
26 | struct mutex lock; | ||
27 | |||
28 | /* I: page offset of start position of decompression */ | ||
29 | unsigned short pageofs; | ||
30 | |||
31 | /* L: maximum relative page index in pagevec[] */ | ||
32 | unsigned short nr_pages; | ||
33 | |||
34 | /* L: total number of pages in pagevec[] */ | ||
35 | unsigned int vcnt; | ||
36 | |||
37 | union { | ||
38 | /* L: inline a certain number of pagevecs for bootstrap */ | ||
39 | erofs_vtptr_t pagevec[Z_EROFS_NR_INLINE_PAGEVECS]; | ||
40 | |||
41 | /* I: can be used to free the pcluster by RCU. */ | ||
42 | struct rcu_head rcu; | ||
43 | }; | ||
44 | }; | ||
45 | |||
46 | #define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001 | ||
47 | #define Z_EROFS_PCLUSTER_LENGTH_BIT 1 | ||
48 | |||
49 | /* | ||
50 | * let's leave a type here in case another tagged | ||
51 | * pointer is introduced later. | ||
52 | */ | ||
53 | typedef void *z_erofs_next_pcluster_t; | ||
54 | |||
55 | struct z_erofs_pcluster { | ||
56 | struct erofs_workgroup obj; | ||
57 | struct z_erofs_collection primary_collection; | ||
58 | |||
59 | /* A: point to next chained pcluster or TAILs */ | ||
60 | z_erofs_next_pcluster_t next; | ||
61 | |||
62 | /* A: compressed pages (including multi-usage pages) */ | ||
63 | struct page *compressed_pages[Z_EROFS_CLUSTER_MAX_PAGES]; | ||
64 | |||
65 | /* A: lower limit of decompressed length and if full length or not */ | ||
66 | unsigned int length; | ||
67 | |||
68 | /* I: compression algorithm format */ | ||
69 | unsigned char algorithmformat; | ||
70 | /* I: bit shift of physical cluster size */ | ||
71 | unsigned char clusterbits; | ||
72 | }; | ||
73 | |||
74 | #define z_erofs_primarycollection(pcluster) (&(pcluster)->primary_collection) | ||
75 | |||
76 | /* let's avoid the valid 32-bit kernel addresses */ | ||
77 | |||
78 | /* the chained workgroup hasn't submitted io (still open) */ | ||
79 | #define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE) | ||
80 | /* the chained workgroup has already submitted io */ | ||
81 | #define Z_EROFS_PCLUSTER_TAIL_CLOSED ((void *)0x5F0EDEAD) | ||
82 | |||
83 | #define Z_EROFS_PCLUSTER_NIL (NULL) | ||
84 | |||
85 | #define Z_EROFS_WORKGROUP_SIZE sizeof(struct z_erofs_pcluster) | ||
86 | |||
87 | struct z_erofs_unzip_io { | ||
88 | atomic_t pending_bios; | ||
89 | z_erofs_next_pcluster_t head; | ||
90 | |||
91 | union { | ||
92 | wait_queue_head_t wait; | ||
93 | struct work_struct work; | ||
94 | } u; | ||
95 | }; | ||
96 | |||
97 | struct z_erofs_unzip_io_sb { | ||
98 | struct z_erofs_unzip_io io; | ||
99 | struct super_block *sb; | ||
100 | }; | ||
101 | |||
102 | #define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping) | ||
103 | static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, | ||
104 | struct page *page) | ||
105 | { | ||
106 | return page->mapping == MNGD_MAPPING(sbi); | ||
107 | } | ||
108 | |||
109 | #define Z_EROFS_ONLINEPAGE_COUNT_BITS 2 | ||
110 | #define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1) | ||
111 | #define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS) | ||
112 | |||
113 | /* | ||
114 | * waiters (aka. ongoing_packs): number of completions needed to unlock the page | ||
115 | * sub-index: 0 for a partial page, >= 1 for a full page sub-index | ||
116 | */ | ||
117 | typedef atomic_t z_erofs_onlinepage_t; | ||
118 | |||
119 | /* type punning */ | ||
120 | union z_erofs_onlinepage_converter { | ||
121 | z_erofs_onlinepage_t *o; | ||
122 | unsigned long *v; | ||
123 | }; | ||
124 | |||
125 | static inline unsigned int z_erofs_onlinepage_index(struct page *page) | ||
126 | { | ||
127 | union z_erofs_onlinepage_converter u; | ||
128 | |||
129 | DBG_BUGON(!PagePrivate(page)); | ||
130 | u.v = &page_private(page); | ||
131 | |||
132 | return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; | ||
133 | } | ||
134 | |||
135 | static inline void z_erofs_onlinepage_init(struct page *page) | ||
136 | { | ||
137 | union { | ||
138 | z_erofs_onlinepage_t o; | ||
139 | unsigned long v; | ||
140 | /* keep from being unlocked in advance */ | ||
141 | } u = { .o = ATOMIC_INIT(1) }; | ||
142 | |||
143 | set_page_private(page, u.v); | ||
144 | smp_wmb(); | ||
145 | SetPagePrivate(page); | ||
146 | } | ||
147 | |||
148 | static inline void z_erofs_onlinepage_fixup(struct page *page, | ||
149 | uintptr_t index, bool down) | ||
150 | { | ||
151 | unsigned long *p, o, v, id; | ||
152 | repeat: | ||
153 | p = &page_private(page); | ||
154 | o = READ_ONCE(*p); | ||
155 | |||
156 | id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; | ||
157 | if (id) { | ||
158 | if (!index) | ||
159 | return; | ||
160 | |||
161 | DBG_BUGON(id != index); | ||
162 | } | ||
163 | |||
164 | v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) | | ||
165 | ((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down); | ||
166 | if (cmpxchg(p, o, v) != o) | ||
167 | goto repeat; | ||
168 | } | ||
169 | |||
170 | static inline void z_erofs_onlinepage_endio(struct page *page) | ||
171 | { | ||
172 | union z_erofs_onlinepage_converter u; | ||
173 | unsigned int v; | ||
174 | |||
175 | DBG_BUGON(!PagePrivate(page)); | ||
176 | u.v = &page_private(page); | ||
177 | |||
178 | v = atomic_dec_return(u.o); | ||
179 | if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) { | ||
180 | ClearPagePrivate(page); | ||
181 | if (!PageError(page)) | ||
182 | SetPageUptodate(page); | ||
183 | unlock_page(page); | ||
184 | } | ||
185 | debugln("%s, page %p value %x", __func__, page, atomic_read(u.o)); | ||
186 | } | ||
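A small standalone illustration of the packing used by the helpers above: the low Z_EROFS_ONLINEPAGE_COUNT_BITS bits hold the number of pending packs and the remaining bits hold the sub-index, so the page is unlocked once the count part drops to zero (the names below are local to the example):

#include <assert.h>

#define COUNT_BITS 2
#define COUNT_MASK ((1u << COUNT_BITS) - 1)

static unsigned int pack(unsigned int index, unsigned int count)
{
	return (index << COUNT_BITS) | (count & COUNT_MASK);
}

int main(void)
{
	unsigned int v = pack(0, 1);           /* as in z_erofs_onlinepage_init() */

	v = pack(5, (v & COUNT_MASK) + 1);     /* as in z_erofs_onlinepage_fixup(page, 5, true) */
	assert((v >> COUNT_BITS) == 5 && (v & COUNT_MASK) == 2);

	--v;                                   /* first z_erofs_onlinepage_endio() */
	--v;                                   /* second one: count hits 0, page would be unlocked */
	assert(!(v & COUNT_MASK));
	return 0;
}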
187 | |||
188 | #define Z_EROFS_VMAP_ONSTACK_PAGES \ | ||
189 | min_t(unsigned int, THREAD_SIZE / 8 / sizeof(struct page *), 96U) | ||
190 | #define Z_EROFS_VMAP_GLOBAL_PAGES 2048 | ||
191 | |||
192 | #endif | ||
193 | |||
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c new file mode 100644 index 000000000000..4dc9cec01297 --- /dev/null +++ b/fs/erofs/zmap.c | |||
@@ -0,0 +1,466 @@ | |||
1 | // SPDX-License-Identifier: GPL-2.0-only | ||
2 | /* | ||
3 | * Copyright (C) 2018-2019 HUAWEI, Inc. | ||
4 | * http://www.huawei.com/ | ||
5 | * Created by Gao Xiang <gaoxiang25@huawei.com> | ||
6 | */ | ||
7 | #include "internal.h" | ||
8 | #include <asm/unaligned.h> | ||
9 | #include <trace/events/erofs.h> | ||
10 | |||
11 | int z_erofs_fill_inode(struct inode *inode) | ||
12 | { | ||
13 | struct erofs_vnode *const vi = EROFS_V(inode); | ||
14 | |||
15 | if (vi->datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY) { | ||
16 | vi->z_advise = 0; | ||
17 | vi->z_algorithmtype[0] = 0; | ||
18 | vi->z_algorithmtype[1] = 0; | ||
19 | vi->z_logical_clusterbits = LOG_BLOCK_SIZE; | ||
20 | vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits; | ||
21 | vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits; | ||
22 | set_bit(EROFS_V_Z_INITED_BIT, &vi->flags); | ||
23 | } | ||
24 | |||
25 | inode->i_mapping->a_ops = &z_erofs_vle_normalaccess_aops; | ||
26 | return 0; | ||
27 | } | ||
28 | |||
29 | static int fill_inode_lazy(struct inode *inode) | ||
30 | { | ||
31 | struct erofs_vnode *const vi = EROFS_V(inode); | ||
32 | struct super_block *const sb = inode->i_sb; | ||
33 | int err; | ||
34 | erofs_off_t pos; | ||
35 | struct page *page; | ||
36 | void *kaddr; | ||
37 | struct z_erofs_map_header *h; | ||
38 | |||
39 | if (test_bit(EROFS_V_Z_INITED_BIT, &vi->flags)) | ||
40 | return 0; | ||
41 | |||
42 | if (wait_on_bit_lock(&vi->flags, EROFS_V_BL_Z_BIT, TASK_KILLABLE)) | ||
43 | return -ERESTARTSYS; | ||
44 | |||
45 | err = 0; | ||
46 | if (test_bit(EROFS_V_Z_INITED_BIT, &vi->flags)) | ||
47 | goto out_unlock; | ||
48 | |||
49 | DBG_BUGON(vi->datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY); | ||
50 | |||
51 | pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + | ||
52 | vi->xattr_isize, 8); | ||
53 | page = erofs_get_meta_page(sb, erofs_blknr(pos), false); | ||
54 | if (IS_ERR(page)) { | ||
55 | err = PTR_ERR(page); | ||
56 | goto out_unlock; | ||
57 | } | ||
58 | |||
59 | kaddr = kmap_atomic(page); | ||
60 | |||
61 | h = kaddr + erofs_blkoff(pos); | ||
62 | vi->z_advise = le16_to_cpu(h->h_advise); | ||
63 | vi->z_algorithmtype[0] = h->h_algorithmtype & 15; | ||
64 | vi->z_algorithmtype[1] = h->h_algorithmtype >> 4; | ||
65 | |||
66 | if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX) { | ||
67 | errln("unknown compression format %u for nid %llu, please upgrade kernel", | ||
68 | vi->z_algorithmtype[0], vi->nid); | ||
69 | err = -EOPNOTSUPP; | ||
70 | goto unmap_done; | ||
71 | } | ||
72 | |||
73 | vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7); | ||
74 | vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits + | ||
75 | ((h->h_clusterbits >> 3) & 3); | ||
76 | |||
77 | if (vi->z_physical_clusterbits[0] != LOG_BLOCK_SIZE) { | ||
78 | errln("unsupported physical clusterbits %u for nid %llu, please upgrade kernel", | ||
79 | vi->z_physical_clusterbits[0], vi->nid); | ||
80 | err = -EOPNOTSUPP; | ||
81 | goto unmap_done; | ||
82 | } | ||
83 | |||
84 | vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits + | ||
85 | ((h->h_clusterbits >> 5) & 7); | ||
86 | set_bit(EROFS_V_Z_INITED_BIT, &vi->flags); | ||
87 | unmap_done: | ||
88 | kunmap_atomic(kaddr); | ||
89 | unlock_page(page); | ||
90 | put_page(page); | ||
91 | out_unlock: | ||
92 | clear_and_wake_up_bit(EROFS_V_BL_Z_BIT, &vi->flags); | ||
93 | return err; | ||
94 | } | ||
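For reference, a standalone decoder mirroring how the packed h_clusterbits byte is parsed above; with 4KiB blocks (LOG_BLOCK_SIZE == 12) and h_clusterbits == 0, all three values decode to 12, which is the only physical cluster size this version accepts:

struct z_clusterbits { unsigned int lcluster, pcluster0, pcluster1; };

static struct z_clusterbits decode_h_clusterbits(unsigned char h_clusterbits,
						 unsigned int log_block_size)
{
	struct z_clusterbits z;

	z.lcluster  = log_block_size + (h_clusterbits & 7);        /* bits 0-2 */
	z.pcluster0 = z.lcluster + ((h_clusterbits >> 3) & 3);     /* bits 3-4 */
	z.pcluster1 = z.lcluster + ((h_clusterbits >> 5) & 7);     /* bits 5-7 */
	return z;
}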
95 | |||
96 | struct z_erofs_maprecorder { | ||
97 | struct inode *inode; | ||
98 | struct erofs_map_blocks *map; | ||
99 | void *kaddr; | ||
100 | |||
101 | unsigned long lcn; | ||
102 | /* compression extent information gathered */ | ||
103 | u8 type; | ||
104 | u16 clusterofs; | ||
105 | u16 delta[2]; | ||
106 | erofs_blk_t pblk; | ||
107 | }; | ||
108 | |||
109 | static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m, | ||
110 | erofs_blk_t eblk) | ||
111 | { | ||
112 | struct super_block *const sb = m->inode->i_sb; | ||
113 | struct erofs_map_blocks *const map = m->map; | ||
114 | struct page *mpage = map->mpage; | ||
115 | |||
116 | if (mpage) { | ||
117 | if (mpage->index == eblk) { | ||
118 | if (!m->kaddr) | ||
119 | m->kaddr = kmap_atomic(mpage); | ||
120 | return 0; | ||
121 | } | ||
122 | |||
123 | if (m->kaddr) { | ||
124 | kunmap_atomic(m->kaddr); | ||
125 | m->kaddr = NULL; | ||
126 | } | ||
127 | put_page(mpage); | ||
128 | } | ||
129 | |||
130 | mpage = erofs_get_meta_page(sb, eblk, false); | ||
131 | if (IS_ERR(mpage)) { | ||
132 | map->mpage = NULL; | ||
133 | return PTR_ERR(mpage); | ||
134 | } | ||
135 | m->kaddr = kmap_atomic(mpage); | ||
136 | unlock_page(mpage); | ||
137 | map->mpage = mpage; | ||
138 | return 0; | ||
139 | } | ||
140 | |||
141 | static int vle_legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, | ||
142 | unsigned long lcn) | ||
143 | { | ||
144 | struct inode *const inode = m->inode; | ||
145 | struct erofs_vnode *const vi = EROFS_V(inode); | ||
146 | const erofs_off_t ibase = iloc(EROFS_I_SB(inode), vi->nid); | ||
147 | const erofs_off_t pos = | ||
148 | Z_EROFS_VLE_LEGACY_INDEX_ALIGN(ibase + vi->inode_isize + | ||
149 | vi->xattr_isize) + | ||
150 | lcn * sizeof(struct z_erofs_vle_decompressed_index); | ||
151 | struct z_erofs_vle_decompressed_index *di; | ||
152 | unsigned int advise, type; | ||
153 | int err; | ||
154 | |||
155 | err = z_erofs_reload_indexes(m, erofs_blknr(pos)); | ||
156 | if (err) | ||
157 | return err; | ||
158 | |||
159 | m->lcn = lcn; | ||
160 | di = m->kaddr + erofs_blkoff(pos); | ||
161 | |||
162 | advise = le16_to_cpu(di->di_advise); | ||
163 | type = (advise >> Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT) & | ||
164 | ((1 << Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) - 1); | ||
165 | switch (type) { | ||
166 | case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: | ||
167 | m->clusterofs = 1 << vi->z_logical_clusterbits; | ||
168 | m->delta[0] = le16_to_cpu(di->di_u.delta[0]); | ||
169 | m->delta[1] = le16_to_cpu(di->di_u.delta[1]); | ||
170 | break; | ||
171 | case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: | ||
172 | case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: | ||
173 | m->clusterofs = le16_to_cpu(di->di_clusterofs); | ||
174 | m->pblk = le32_to_cpu(di->di_u.blkaddr); | ||
175 | break; | ||
176 | default: | ||
177 | DBG_BUGON(1); | ||
178 | return -EOPNOTSUPP; | ||
179 | } | ||
180 | m->type = type; | ||
181 | return 0; | ||
182 | } | ||
183 | |||
184 | static unsigned int decode_compactedbits(unsigned int lobits, | ||
185 | unsigned int lomask, | ||
186 | u8 *in, unsigned int pos, u8 *type) | ||
187 | { | ||
188 | const unsigned int v = get_unaligned_le32(in + pos / 8) >> (pos & 7); | ||
189 | const unsigned int lo = v & lomask; | ||
190 | |||
191 | *type = (v >> lobits) & 3; | ||
192 | return lo; | ||
193 | } | ||
194 | |||
195 | static int unpack_compacted_index(struct z_erofs_maprecorder *m, | ||
196 | unsigned int amortizedshift, | ||
197 | unsigned int eofs) | ||
198 | { | ||
199 | struct erofs_vnode *const vi = EROFS_V(m->inode); | ||
200 | const unsigned int lclusterbits = vi->z_logical_clusterbits; | ||
201 | const unsigned int lomask = (1 << lclusterbits) - 1; | ||
202 | unsigned int vcnt, base, lo, encodebits, nblk; | ||
203 | int i; | ||
204 | u8 *in, type; | ||
205 | |||
206 | if (1 << amortizedshift == 4) | ||
207 | vcnt = 2; | ||
208 | else if (1 << amortizedshift == 2 && lclusterbits == 12) | ||
209 | vcnt = 16; | ||
210 | else | ||
211 | return -EOPNOTSUPP; | ||
212 | |||
213 | encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; | ||
214 | base = round_down(eofs, vcnt << amortizedshift); | ||
215 | in = m->kaddr + base; | ||
216 | |||
217 | i = (eofs - base) >> amortizedshift; | ||
218 | |||
219 | lo = decode_compactedbits(lclusterbits, lomask, | ||
220 | in, encodebits * i, &type); | ||
221 | m->type = type; | ||
222 | if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { | ||
223 | m->clusterofs = 1 << lclusterbits; | ||
224 | if (i + 1 != vcnt) { | ||
225 | m->delta[0] = lo; | ||
226 | return 0; | ||
227 | } | ||
228 | /* | ||
229 | * the last lcluster in the pack is special: its lo | ||
230 | * saves delta[1] rather than delta[0]. Hence, derive | ||
231 | * delta[0] from the previous lcluster instead. | ||
232 | */ | ||
233 | lo = decode_compactedbits(lclusterbits, lomask, | ||
234 | in, encodebits * (i - 1), &type); | ||
235 | if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) | ||
236 | lo = 0; | ||
237 | m->delta[0] = lo + 1; | ||
238 | return 0; | ||
239 | } | ||
240 | m->clusterofs = lo; | ||
241 | m->delta[0] = 0; | ||
242 | /* figure out blkaddr (pblk) for HEAD lclusters */ | ||
243 | nblk = 1; | ||
244 | while (i > 0) { | ||
245 | --i; | ||
246 | lo = decode_compactedbits(lclusterbits, lomask, | ||
247 | in, encodebits * i, &type); | ||
248 | if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) | ||
249 | i -= lo; | ||
250 | |||
251 | if (i >= 0) | ||
252 | ++nblk; | ||
253 | } | ||
254 | in += (vcnt << amortizedshift) - sizeof(__le32); | ||
255 | m->pblk = le32_to_cpu(*(__le32 *)in) + nblk; | ||
256 | return 0; | ||
257 | } | ||
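A worked example of the encodebits arithmetic above for the two pack shapes this code accepts (lclusterbits == 12); each packed index carries a 2-bit type plus a lo field, and the pack's trailing __le32 stores the base block address:

/*
 * compacted_2b pack:  vcnt = 16, amortizedshift = 1
 *   encodebits = ((16 << 1) - sizeof(__le32)) * 8 / 16
 *              = (32 - 4) * 8 / 16 = 14 bits per lcluster
 *              = 2-bit type + 12-bit lo
 *
 * compacted_4b pack:  vcnt = 2, amortizedshift = 2
 *   encodebits = ((2 << 2) - sizeof(__le32)) * 8 / 2
 *              = (8 - 4) * 8 / 2 = 16 bits per lcluster (2 bits spare)
 */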
258 | |||
259 | static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, | ||
260 | unsigned long lcn) | ||
261 | { | ||
262 | struct inode *const inode = m->inode; | ||
263 | struct erofs_vnode *const vi = EROFS_V(inode); | ||
264 | const unsigned int lclusterbits = vi->z_logical_clusterbits; | ||
265 | const erofs_off_t ebase = ALIGN(iloc(EROFS_I_SB(inode), vi->nid) + | ||
266 | vi->inode_isize + vi->xattr_isize, 8) + | ||
267 | sizeof(struct z_erofs_map_header); | ||
268 | const unsigned int totalidx = DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ); | ||
269 | unsigned int compacted_4b_initial, compacted_2b; | ||
270 | unsigned int amortizedshift; | ||
271 | erofs_off_t pos; | ||
272 | int err; | ||
273 | |||
274 | if (lclusterbits != 12) | ||
275 | return -EOPNOTSUPP; | ||
276 | |||
277 | if (lcn >= totalidx) | ||
278 | return -EINVAL; | ||
279 | |||
280 | m->lcn = lcn; | ||
281 | /* initial compacted_4b indexes used to reach 32-byte (compacted_2b) alignment */ | ||
282 | compacted_4b_initial = (32 - ebase % 32) / 4; | ||
283 | if (compacted_4b_initial == 32 / 4) | ||
284 | compacted_4b_initial = 0; | ||
285 | |||
286 | if (vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) | ||
287 | compacted_2b = rounddown(totalidx - compacted_4b_initial, 16); | ||
288 | else | ||
289 | compacted_2b = 0; | ||
290 | |||
291 | pos = ebase; | ||
292 | if (lcn < compacted_4b_initial) { | ||
293 | amortizedshift = 2; | ||
294 | goto out; | ||
295 | } | ||
296 | pos += compacted_4b_initial * 4; | ||
297 | lcn -= compacted_4b_initial; | ||
298 | |||
299 | if (lcn < compacted_2b) { | ||
300 | amortizedshift = 1; | ||
301 | goto out; | ||
302 | } | ||
303 | pos += compacted_2b * 2; | ||
304 | lcn -= compacted_2b; | ||
305 | amortizedshift = 2; | ||
306 | out: | ||
307 | pos += lcn * (1 << amortizedshift); | ||
308 | err = z_erofs_reload_indexes(m, erofs_blknr(pos)); | ||
309 | if (err) | ||
310 | return err; | ||
311 | return unpack_compacted_index(m, amortizedshift, erofs_blkoff(pos)); | ||
312 | } | ||
313 | |||
314 | static int vle_load_cluster_from_disk(struct z_erofs_maprecorder *m, | ||
315 | unsigned int lcn) | ||
316 | { | ||
317 | const unsigned int datamode = EROFS_V(m->inode)->datamode; | ||
318 | |||
319 | if (datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY) | ||
320 | return vle_legacy_load_cluster_from_disk(m, lcn); | ||
321 | |||
322 | if (datamode == EROFS_INODE_FLAT_COMPRESSION) | ||
323 | return compacted_load_cluster_from_disk(m, lcn); | ||
324 | |||
325 | return -EINVAL; | ||
326 | } | ||
327 | |||
328 | static int vle_extent_lookback(struct z_erofs_maprecorder *m, | ||
329 | unsigned int lookback_distance) | ||
330 | { | ||
331 | struct erofs_vnode *const vi = EROFS_V(m->inode); | ||
332 | struct erofs_map_blocks *const map = m->map; | ||
333 | const unsigned int lclusterbits = vi->z_logical_clusterbits; | ||
334 | unsigned long lcn = m->lcn; | ||
335 | int err; | ||
336 | |||
337 | if (lcn < lookback_distance) { | ||
338 | errln("bogus lookback distance @ nid %llu", vi->nid); | ||
339 | DBG_BUGON(1); | ||
340 | return -EFSCORRUPTED; | ||
341 | } | ||
342 | |||
343 | /* load extent head logical cluster if needed */ | ||
344 | lcn -= lookback_distance; | ||
345 | err = vle_load_cluster_from_disk(m, lcn); | ||
346 | if (err) | ||
347 | return err; | ||
348 | |||
349 | switch (m->type) { | ||
350 | case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: | ||
351 | if (unlikely(!m->delta[0])) { | ||
352 | errln("invalid lookback distance 0 at nid %llu", | ||
353 | vi->nid); | ||
354 | DBG_BUGON(1); | ||
355 | return -EFSCORRUPTED; | ||
356 | } | ||
357 | return vle_extent_lookback(m, m->delta[0]); | ||
358 | case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: | ||
359 | map->m_flags &= ~EROFS_MAP_ZIPPED; | ||
360 | /* fallthrough */ | ||
361 | case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: | ||
362 | map->m_la = (lcn << lclusterbits) | m->clusterofs; | ||
363 | break; | ||
364 | default: | ||
365 | errln("unknown type %u at lcn %lu of nid %llu", | ||
366 | m->type, lcn, vi->nid); | ||
367 | DBG_BUGON(1); | ||
368 | return -EOPNOTSUPP; | ||
369 | } | ||
370 | return 0; | ||
371 | } | ||
372 | |||
373 | int z_erofs_map_blocks_iter(struct inode *inode, | ||
374 | struct erofs_map_blocks *map, | ||
375 | int flags) | ||
376 | { | ||
377 | struct erofs_vnode *const vi = EROFS_V(inode); | ||
378 | struct z_erofs_maprecorder m = { | ||
379 | .inode = inode, | ||
380 | .map = map, | ||
381 | }; | ||
382 | int err = 0; | ||
383 | unsigned int lclusterbits, endoff; | ||
384 | unsigned long long ofs, end; | ||
385 | |||
386 | trace_z_erofs_map_blocks_iter_enter(inode, map, flags); | ||
387 | |||
388 | /* when trying to read beyond EOF, leave it unmapped */ | ||
389 | if (unlikely(map->m_la >= inode->i_size)) { | ||
390 | map->m_llen = map->m_la + 1 - inode->i_size; | ||
391 | map->m_la = inode->i_size; | ||
392 | map->m_flags = 0; | ||
393 | goto out; | ||
394 | } | ||
395 | |||
396 | err = fill_inode_lazy(inode); | ||
397 | if (err) | ||
398 | goto out; | ||
399 | |||
400 | lclusterbits = vi->z_logical_clusterbits; | ||
401 | ofs = map->m_la; | ||
402 | m.lcn = ofs >> lclusterbits; | ||
403 | endoff = ofs & ((1 << lclusterbits) - 1); | ||
404 | |||
405 | err = vle_load_cluster_from_disk(&m, m.lcn); | ||
406 | if (err) | ||
407 | goto unmap_out; | ||
408 | |||
409 | map->m_flags = EROFS_MAP_ZIPPED; /* by default, compressed */ | ||
410 | end = (m.lcn + 1ULL) << lclusterbits; | ||
411 | |||
412 | switch (m.type) { | ||
413 | case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: | ||
414 | if (endoff >= m.clusterofs) | ||
415 | map->m_flags &= ~EROFS_MAP_ZIPPED; | ||
416 | /* fallthrough */ | ||
417 | case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: | ||
418 | if (endoff >= m.clusterofs) { | ||
419 | map->m_la = (m.lcn << lclusterbits) | m.clusterofs; | ||
420 | break; | ||
421 | } | ||
422 | /* m.lcn should be >= 1 if endoff < m.clusterofs */ | ||
423 | if (unlikely(!m.lcn)) { | ||
424 | errln("invalid logical cluster 0 at nid %llu", | ||
425 | vi->nid); | ||
426 | err = -EFSCORRUPTED; | ||
427 | goto unmap_out; | ||
428 | } | ||
429 | end = (m.lcn << lclusterbits) | m.clusterofs; | ||
430 | map->m_flags |= EROFS_MAP_FULL_MAPPED; | ||
431 | m.delta[0] = 1; | ||
432 | /* fallthrough */ | ||
433 | case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: | ||
434 | /* get the corresponding first chunk */ | ||
435 | err = vle_extent_lookback(&m, m.delta[0]); | ||
436 | if (unlikely(err)) | ||
437 | goto unmap_out; | ||
438 | break; | ||
439 | default: | ||
440 | errln("unknown type %u at offset %llu of nid %llu", | ||
441 | m.type, ofs, vi->nid); | ||
442 | err = -EOPNOTSUPP; | ||
443 | goto unmap_out; | ||
444 | } | ||
445 | |||
446 | map->m_llen = end - map->m_la; | ||
447 | map->m_plen = 1 << lclusterbits; | ||
448 | map->m_pa = blknr_to_addr(m.pblk); | ||
449 | map->m_flags |= EROFS_MAP_MAPPED; | ||
450 | |||
451 | unmap_out: | ||
452 | if (m.kaddr) | ||
453 | kunmap_atomic(m.kaddr); | ||
454 | |||
455 | out: | ||
456 | debugln("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o", | ||
457 | __func__, map->m_la, map->m_pa, | ||
458 | map->m_llen, map->m_plen, map->m_flags); | ||
459 | |||
460 | trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err); | ||
461 | |||
462 | /* aggressively BUG_ON iff CONFIG_EROFS_FS_DEBUG is on */ | ||
463 | DBG_BUGON(err < 0 && err != -ENOMEM); | ||
464 | return err; | ||
465 | } | ||
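A hedged caller sketch may help show what the returned mapping means; report_extent() below is hypothetical and not part of this patch. It only illustrates how a read path could interpret the m_la/m_llen (logical) and m_pa/m_plen (physical, possibly compressed) pair together with the EROFS_MAP_ZIPPED flag.

	/* Hypothetical caller sketch -- for illustration only. */
	static int report_extent(struct inode *inode, erofs_off_t pos)
	{
		struct erofs_map_blocks map = { .m_la = pos };
		int err = z_erofs_map_blocks_iter(inode, &map, 0);

		if (err)
			return err;
		if (!(map.m_flags & EROFS_MAP_MAPPED))
			return 0;	/* unmapped, e.g. beyond EOF */

		/*
		 * [map.m_pa, map.m_pa + map.m_plen) is the on-disk extent that
		 * decodes into the logical range [map.m_la, map.m_la + map.m_llen);
		 * EROFS_MAP_ZIPPED tells whether decompression is needed at all.
		 */
		return 0;
	}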
466 | |||
diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h new file mode 100644 index 000000000000..bd3cee16491c --- /dev/null +++ b/fs/erofs/zpvec.h | |||
@@ -0,0 +1,157 @@ | |||
1 | /* SPDX-License-Identifier: GPL-2.0-only */ | ||
2 | /* | ||
3 | * Copyright (C) 2018 HUAWEI, Inc. | ||
4 | * http://www.huawei.com/ | ||
5 | * Created by Gao Xiang <gaoxiang25@huawei.com> | ||
6 | */ | ||
7 | #ifndef __EROFS_FS_ZPVEC_H | ||
8 | #define __EROFS_FS_ZPVEC_H | ||
9 | |||
10 | #include "tagptr.h" | ||
11 | |||
12 | /* page type in pagevec for decompress subsystem */ | ||
13 | enum z_erofs_page_type { | ||
14 | /* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */ | ||
15 | Z_EROFS_PAGE_TYPE_EXCLUSIVE, | ||
16 | |||
17 | Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED, | ||
18 | |||
19 | Z_EROFS_VLE_PAGE_TYPE_HEAD, | ||
20 | Z_EROFS_VLE_PAGE_TYPE_MAX | ||
21 | }; | ||
22 | |||
23 | extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0") | ||
24 | __bad_page_type_exclusive(void); | ||
25 | |||
26 | /* pagevec tagged pointer */ | ||
27 | typedef tagptr2_t erofs_vtptr_t; | ||
28 | |||
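An erofs_vtptr_t entry packs a 2-bit page type into the low bits of a page pointer, which is possible because the pointers are at least 4-byte aligned and those bits are otherwise always zero. Below is a minimal user-space model of that fold/unfold trick; the demo_* names are illustrative stand-ins for the tagptr.h helpers, not their real API.

	/* Minimal model of a 2-bit tagged pointer; names are illustrative. */
	#include <assert.h>
	#include <stdint.h>

	typedef struct { uintptr_t v; } vtptr_demo_t;

	static vtptr_demo_t demo_fold(void *ptr, unsigned int tags)
	{
		assert(!((uintptr_t)ptr & 3));	/* need 4-byte alignment */
		return (vtptr_demo_t){ (uintptr_t)ptr | (tags & 3) };
	}

	static void *demo_unfold_ptr(vtptr_demo_t t)
	{
		return (void *)(t.v & ~(uintptr_t)3);
	}

	static unsigned int demo_unfold_tags(vtptr_demo_t t)
	{
		return (unsigned int)(t.v & 3);
	}

	int main(void)
	{
		int x;
		vtptr_demo_t t = demo_fold(&x, 2);

		assert(demo_unfold_ptr(t) == &x && demo_unfold_tags(t) == 2);
		return 0;
	}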
29 | /* pagevec collector */ | ||
30 | struct z_erofs_pagevec_ctor { | ||
31 | struct page *curr, *next; | ||
32 | erofs_vtptr_t *pages; | ||
33 | |||
34 | unsigned int nr, index; | ||
35 | }; | ||
36 | |||
37 | static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor, | ||
38 | bool atomic) | ||
39 | { | ||
40 | if (!ctor->curr) | ||
41 | return; | ||
42 | |||
43 | if (atomic) | ||
44 | kunmap_atomic(ctor->pages); | ||
45 | else | ||
46 | kunmap(ctor->curr); | ||
47 | } | ||
48 | |||
49 | static inline struct page * | ||
50 | z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor, | ||
51 | unsigned int nr) | ||
52 | { | ||
53 | unsigned int index; | ||
54 | |||
55 | /* keep away from occupied pages */ | ||
56 | if (ctor->next) | ||
57 | return ctor->next; | ||
58 | |||
59 | for (index = 0; index < nr; ++index) { | ||
60 | const erofs_vtptr_t t = ctor->pages[index]; | ||
61 | const unsigned int tags = tagptr_unfold_tags(t); | ||
62 | |||
63 | if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE) | ||
64 | return tagptr_unfold_ptr(t); | ||
65 | } | ||
66 | DBG_BUGON(nr >= ctor->nr); | ||
67 | return NULL; | ||
68 | } | ||
69 | |||
70 | static inline void | ||
71 | z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor, | ||
72 | bool atomic) | ||
73 | { | ||
74 | struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr); | ||
75 | |||
76 | z_erofs_pagevec_ctor_exit(ctor, atomic); | ||
77 | |||
78 | ctor->curr = next; | ||
79 | ctor->next = NULL; | ||
80 | ctor->pages = atomic ? | ||
81 | kmap_atomic(ctor->curr) : kmap(ctor->curr); | ||
82 | |||
83 | ctor->nr = PAGE_SIZE / sizeof(struct page *); | ||
84 | ctor->index = 0; | ||
85 | } | ||
86 | |||
87 | static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor, | ||
88 | unsigned int nr, | ||
89 | erofs_vtptr_t *pages, | ||
90 | unsigned int i) | ||
91 | { | ||
92 | ctor->nr = nr; | ||
93 | ctor->curr = ctor->next = NULL; | ||
94 | ctor->pages = pages; | ||
95 | |||
96 | if (i >= nr) { | ||
97 | i -= nr; | ||
98 | z_erofs_pagevec_ctor_pagedown(ctor, false); | ||
99 | while (i > ctor->nr) { | ||
100 | i -= ctor->nr; | ||
101 | z_erofs_pagevec_ctor_pagedown(ctor, false); | ||
102 | } | ||
103 | } | ||
104 | ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i); | ||
105 | ctor->index = i; | ||
106 | } | ||
107 | |||
108 | static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor, | ||
109 | struct page *page, | ||
110 | enum z_erofs_page_type type, | ||
111 | bool *occupied) | ||
112 | { | ||
113 | *occupied = false; | ||
114 | if (unlikely(!ctor->next && type)) | ||
115 | if (ctor->index + 1 == ctor->nr) | ||
116 | return false; | ||
117 | |||
118 | if (unlikely(ctor->index >= ctor->nr)) | ||
119 | z_erofs_pagevec_ctor_pagedown(ctor, false); | ||
120 | |||
121 | /* exclusive page type must be 0 */ | ||
122 | if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL) | ||
123 | __bad_page_type_exclusive(); | ||
124 | |||
125 | /* ctor->next is a page pointer or NULL, so it never equals 1 or 2 */ | ||
126 | if (type == (uintptr_t)ctor->next) { | ||
127 | ctor->next = page; | ||
128 | *occupied = true; | ||
129 | } | ||
130 | ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, page, type); | ||
131 | return true; | ||
132 | } | ||
133 | |||
134 | static inline struct page * | ||
135 | z_erofs_pagevec_dequeue(struct z_erofs_pagevec_ctor *ctor, | ||
136 | enum z_erofs_page_type *type) | ||
137 | { | ||
138 | erofs_vtptr_t t; | ||
139 | |||
140 | if (unlikely(ctor->index >= ctor->nr)) { | ||
141 | DBG_BUGON(!ctor->next); | ||
142 | z_erofs_pagevec_ctor_pagedown(ctor, true); | ||
143 | } | ||
144 | |||
145 | t = ctor->pages[ctor->index]; | ||
146 | |||
147 | *type = tagptr_unfold_tags(t); | ||
148 | |||
149 | /* ctor->next is a page pointer or NULL, so it never equals 1 or 2 */ | ||
150 | if (*type == (uintptr_t)ctor->next) | ||
151 | ctor->next = tagptr_unfold_ptr(t); | ||
152 | |||
153 | ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, NULL, 0); | ||
154 | return tagptr_unfold_ptr(t); | ||
155 | } | ||
156 | #endif | ||
157 | |||
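To see how the enqueue/dequeue pair is meant to fit together, here is a deliberately simplified user-space model that drops the page chaining and kmap handling and keeps only the tagged-pointer array semantics. Everything below (demo_* names, the fixed 8-slot array, the explicit read index) is illustrative and does not mirror the real collector's layout.

	/* Simplified model of pagevec enqueue/dequeue (no page chaining). */
	#include <assert.h>
	#include <stdint.h>

	enum demo_type { DEMO_EXCLUSIVE, DEMO_TAIL_SHARED, DEMO_HEAD };

	struct demo_pagevec {
		uintptr_t slots[8];
		unsigned int index;
	};

	static int demo_enqueue(struct demo_pagevec *pv, void *page,
				enum demo_type type)
	{
		if (pv->index >= 8)
			return 0;	/* full; the real code chains a new page */
		pv->slots[pv->index++] = (uintptr_t)page | type;
		return 1;
	}

	static void *demo_dequeue(struct demo_pagevec *pv, unsigned int at,
				  enum demo_type *type)
	{
		uintptr_t t = pv->slots[at];

		*type = (enum demo_type)(t & 3);
		return (void *)(t & ~(uintptr_t)3);
	}

	int main(void)
	{
		struct demo_pagevec pv = { .index = 0 };
		int a, b;
		enum demo_type type;

		assert(demo_enqueue(&pv, &a, DEMO_EXCLUSIVE));
		assert(demo_enqueue(&pv, &b, DEMO_HEAD));
		assert(demo_dequeue(&pv, 0, &type) == (void *)&a &&
		       type == DEMO_EXCLUSIVE);
		assert(demo_dequeue(&pv, 1, &type) == (void *)&b &&
		       type == DEMO_HEAD);
		return 0;
	}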