Diffstat (limited to 'fs/erofs')
-rw-r--r--  fs/erofs/Kconfig            98
-rw-r--r--  fs/erofs/Makefile           11
-rw-r--r--  fs/erofs/compress.h         60
-rw-r--r--  fs/erofs/data.c            423
-rw-r--r--  fs/erofs/decompressor.c    358
-rw-r--r--  fs/erofs/dir.c             139
-rw-r--r--  fs/erofs/erofs_fs.h        307
-rw-r--r--  fs/erofs/inode.c           332
-rw-r--r--  fs/erofs/internal.h        553
-rw-r--r--  fs/erofs/namei.c           251
-rw-r--r--  fs/erofs/super.c           669
-rw-r--r--  fs/erofs/tagptr.h          110
-rw-r--r--  fs/erofs/utils.c           333
-rw-r--r--  fs/erofs/xattr.c           703
-rw-r--r--  fs/erofs/xattr.h            92
-rw-r--r--  fs/erofs/zdata.c          1432
-rw-r--r--  fs/erofs/zdata.h           193
-rw-r--r--  fs/erofs/zmap.c            466
-rw-r--r--  fs/erofs/zpvec.h           157
19 files changed, 6687 insertions, 0 deletions
diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig
new file mode 100644
index 000000000000..16316d1adca3
--- /dev/null
+++ b/fs/erofs/Kconfig
@@ -0,0 +1,98 @@
1# SPDX-License-Identifier: GPL-2.0-only
2
3config EROFS_FS
4 tristate "EROFS filesystem support"
5 depends on BLOCK
6 help
7	  EROFS (Enhanced Read-Only File System) is a lightweight
8	  read-only file system with modern designs (e.g. page-sized
9	  blocks, inline xattrs/data, etc.) for scenarios which need
10	  high-performance read-only solutions, e.g. Android OS
11	  for mobile phones and LIVECDs.
12
13	  It also provides fixed-sized output compression support,
14	  which improves storage density and keeps relatively higher
15	  compression ratios, and is especially useful for achieving
16	  high performance on embedded devices with limited memory.
17
18 If unsure, say N.
19
20config EROFS_FS_DEBUG
21 bool "EROFS debugging feature"
22 depends on EROFS_FS
23 help
24 Print debugging messages and enable more BUG_ONs which check
25 filesystem consistency and find potential issues aggressively,
26	  which is useful for Android eng builds, for example.
27
28 For daily use, say N.
29
30config EROFS_FAULT_INJECTION
31 bool "EROFS fault injection facility"
32 depends on EROFS_FS
33 help
34 Test EROFS to inject faults such as ENOMEM, EIO, and so on.
35 If unsure, say N.
36
37config EROFS_FS_XATTR
38 bool "EROFS extended attributes"
39 depends on EROFS_FS
40 default y
41 help
42 Extended attributes are name:value pairs associated with inodes by
43 the kernel or by users (see the attr(5) manual page, or visit
44 <http://acl.bestbits.at/> for details).
45
46 If unsure, say N.
47
48config EROFS_FS_POSIX_ACL
49 bool "EROFS Access Control Lists"
50 depends on EROFS_FS_XATTR
51 select FS_POSIX_ACL
52 default y
53 help
54 Posix Access Control Lists (ACLs) support permissions for users and
55 groups beyond the owner/group/world scheme.
56
57 To learn more about Access Control Lists, visit the POSIX ACLs for
58 Linux website <http://acl.bestbits.at/>.
59
60 If you don't know what Access Control Lists are, say N.
61
62config EROFS_FS_SECURITY
63 bool "EROFS Security Labels"
64 depends on EROFS_FS_XATTR
65 default y
66 help
67 Security labels provide an access control facility to support Linux
68 Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO
69 Linux. This option enables an extended attribute handler for file
70	  security labels in the erofs filesystem, which requires enabling
71	  extended attribute support in advance.
72
73 If you are not using a security module, say N.
74
75config EROFS_FS_ZIP
76 bool "EROFS Data Compression Support"
77 depends on EROFS_FS
78 select LZ4_DECOMPRESS
79 default y
80 help
81 Enable fixed-sized output compression for EROFS.
82
83 If you don't want to enable compression feature, say N.
84
85config EROFS_FS_CLUSTER_PAGE_LIMIT
86 int "EROFS Cluster Pages Hard Limit"
87 depends on EROFS_FS_ZIP
88 range 1 256
89 default "1"
90 help
91	  Indicates the maximum # of pages of a compressed
92	  physical cluster.
93
94	  For example, if files in an image were compressed
95	  in 8k units, the hard limit should not be configured
96	  to less than 2. Otherwise, the image will refuse
97	  to mount on this kernel.
98
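A quick check of the arithmetic behind the EROFS_FS_CLUSTER_PAGE_LIMIT help text above: the limit is compared against the page count of a compressed physical cluster, which is simply the compression unit rounded up to pages. A minimal sketch, where check_pcluster_limit() is a hypothetical helper name and only DIV_ROUND_UP, PAGE_SIZE and the Kconfig symbol come from the kernel/patch:

#include <linux/kernel.h>	/* DIV_ROUND_UP */
#include <linux/mm.h>		/* PAGE_SIZE */

/* hypothetical helper: does a compressed physical cluster of
 * 'pclustersize' bytes fit within the configured hard limit?
 */
static bool check_pcluster_limit(unsigned int pclustersize)
{
	/* e.g. an 8k-unit image needs DIV_ROUND_UP(8192, 4096) = 2 pages,
	 * so CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT must be at least 2
	 */
	return DIV_ROUND_UP(pclustersize, PAGE_SIZE) <=
	       CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT;
}
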
diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile
new file mode 100644
index 000000000000..46f2aa4ba46c
--- /dev/null
+++ b/fs/erofs/Makefile
@@ -0,0 +1,11 @@
1# SPDX-License-Identifier: GPL-2.0-only
2
3EROFS_VERSION = "1.0"
4
5ccflags-y += -DEROFS_VERSION=\"$(EROFS_VERSION)\"
6
7obj-$(CONFIG_EROFS_FS) += erofs.o
8erofs-objs := super.o inode.o data.o namei.o dir.o utils.o
9erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o
10erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o
11
diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h
new file mode 100644
index 000000000000..07d279fd5d67
--- /dev/null
+++ b/fs/erofs/compress.h
@@ -0,0 +1,60 @@
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * Copyright (C) 2019 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#ifndef __EROFS_FS_COMPRESS_H
8#define __EROFS_FS_COMPRESS_H
9
10#include "internal.h"
11
12enum {
13 Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX,
14 Z_EROFS_COMPRESSION_RUNTIME_MAX
15};
16
17struct z_erofs_decompress_req {
18 struct super_block *sb;
19 struct page **in, **out;
20
21 unsigned short pageofs_out;
22 unsigned int inputsize, outputsize;
23
24 /* indicate the algorithm will be used for decompression */
25 unsigned int alg;
26 bool inplace_io, partial_decoding;
27};
28
29/*
30 * - 0x5A110C8D ('sallocated', Z_EROFS_MAPPING_STAGING) -
31 *   used to mark temporarily allocated pages, distinguishing them
32 *   from other file/cached pages and NULL-mapping pages.
33 */
34#define Z_EROFS_MAPPING_STAGING ((void *)0x5A110C8D)
35
36/* check if a page is marked as staging */
37static inline bool z_erofs_page_is_staging(struct page *page)
38{
39 return page->mapping == Z_EROFS_MAPPING_STAGING;
40}
41
42static inline bool z_erofs_put_stagingpage(struct list_head *pagepool,
43 struct page *page)
44{
45 if (!z_erofs_page_is_staging(page))
46 return false;
47
48 /* staging pages should not be used by others at the same time */
49 if (page_ref_count(page) > 1)
50 put_page(page);
51 else
52 list_add(&page->lru, pagepool);
53 return true;
54}
55
56int z_erofs_decompress(struct z_erofs_decompress_req *rq,
57 struct list_head *pagepool);
58
59#endif
60
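To show how the staging marker above is meant to be used, here is a minimal sketch of the allocation side, modelled on lz4_prepare_destpages() later in this patch; grab_staging_page() is a hypothetical name, while erofs_allocpage() is the page-pool helper this patch adds in utils.c:

#include "compress.h"

/* sketch: grab one bounce page from the pool and mark it as staging */
static struct page *grab_staging_page(struct list_head *pagepool)
{
	struct page *page = erofs_allocpage(pagepool, GFP_KERNEL, false);

	if (!page)
		return NULL;
	/* let z_erofs_page_is_staging() recognize it later */
	page->mapping = Z_EROFS_MAPPING_STAGING;
	return page;
}

After decompression, every output page can be handed to z_erofs_put_stagingpage(): a staging page is either released (if still referenced elsewhere) or recycled into the page pool, while a regular file/cached page makes the helper return false so the caller releases it through the normal path.
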
diff --git a/fs/erofs/data.c b/fs/erofs/data.c
new file mode 100644
index 000000000000..fda16ec8863e
--- /dev/null
+++ b/fs/erofs/data.c
@@ -0,0 +1,423 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2017-2018 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#include "internal.h"
8#include <linux/prefetch.h>
9
10#include <trace/events/erofs.h>
11
12static inline void read_endio(struct bio *bio)
13{
14 struct super_block *const sb = bio->bi_private;
15 struct bio_vec *bvec;
16 blk_status_t err = bio->bi_status;
17 struct bvec_iter_all iter_all;
18
19 if (time_to_inject(EROFS_SB(sb), FAULT_READ_IO)) {
20 erofs_show_injection_info(FAULT_READ_IO);
21 err = BLK_STS_IOERR;
22 }
23
24 bio_for_each_segment_all(bvec, bio, iter_all) {
25 struct page *page = bvec->bv_page;
26
27 /* page is already locked */
28 DBG_BUGON(PageUptodate(page));
29
30 if (unlikely(err))
31 SetPageError(page);
32 else
33 SetPageUptodate(page);
34
35 unlock_page(page);
36 /* page could be reclaimed now */
37 }
38 bio_put(bio);
39}
40
41/* prio -- true is used for dir */
42struct page *__erofs_get_meta_page(struct super_block *sb,
43 erofs_blk_t blkaddr, bool prio, bool nofail)
44{
45 struct inode *const bd_inode = sb->s_bdev->bd_inode;
46 struct address_space *const mapping = bd_inode->i_mapping;
47 /* prefer retrying in the allocator to blindly looping below */
48 const gfp_t gfp = mapping_gfp_constraint(mapping, ~__GFP_FS) |
49 (nofail ? __GFP_NOFAIL : 0);
50 unsigned int io_retries = nofail ? EROFS_IO_MAX_RETRIES_NOFAIL : 0;
51 struct page *page;
52 int err;
53
54repeat:
55 page = find_or_create_page(mapping, blkaddr, gfp);
56 if (unlikely(!page)) {
57 DBG_BUGON(nofail);
58 return ERR_PTR(-ENOMEM);
59 }
60 DBG_BUGON(!PageLocked(page));
61
62 if (!PageUptodate(page)) {
63 struct bio *bio;
64
65 bio = erofs_grab_bio(sb, blkaddr, 1, sb, read_endio, nofail);
66 if (IS_ERR(bio)) {
67 DBG_BUGON(nofail);
68 err = PTR_ERR(bio);
69 goto err_out;
70 }
71
72 err = bio_add_page(bio, page, PAGE_SIZE, 0);
73 if (unlikely(err != PAGE_SIZE)) {
74 err = -EFAULT;
75 goto err_out;
76 }
77
78 __submit_bio(bio, REQ_OP_READ,
79 REQ_META | (prio ? REQ_PRIO : 0));
80
81 lock_page(page);
82
83 /* this page has been truncated by others */
84 if (unlikely(page->mapping != mapping)) {
85unlock_repeat:
86 unlock_page(page);
87 put_page(page);
88 goto repeat;
89 }
90
91 /* more likely a read error */
92 if (unlikely(!PageUptodate(page))) {
93 if (io_retries) {
94 --io_retries;
95 goto unlock_repeat;
96 }
97 err = -EIO;
98 goto err_out;
99 }
100 }
101 return page;
102
103err_out:
104 unlock_page(page);
105 put_page(page);
106 return ERR_PTR(err);
107}
108
109static int erofs_map_blocks_flatmode(struct inode *inode,
110 struct erofs_map_blocks *map,
111 int flags)
112{
113 int err = 0;
114 erofs_blk_t nblocks, lastblk;
115 u64 offset = map->m_la;
116 struct erofs_vnode *vi = EROFS_V(inode);
117
118 trace_erofs_map_blocks_flatmode_enter(inode, map, flags);
119
120 nblocks = DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
121 lastblk = nblocks - is_inode_flat_inline(inode);
122
123 if (unlikely(offset >= inode->i_size)) {
124 /* leave out-of-bound access unmapped */
125 map->m_flags = 0;
126 map->m_plen = 0;
127 goto out;
128 }
129
130 /* there is no hole in flatmode */
131 map->m_flags = EROFS_MAP_MAPPED;
132
133 if (offset < blknr_to_addr(lastblk)) {
134 map->m_pa = blknr_to_addr(vi->raw_blkaddr) + map->m_la;
135 map->m_plen = blknr_to_addr(lastblk) - offset;
136 } else if (is_inode_flat_inline(inode)) {
137 /* 2 - inode inline B: inode, [xattrs], inline last blk... */
138 struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
139
140 map->m_pa = iloc(sbi, vi->nid) + vi->inode_isize +
141 vi->xattr_isize + erofs_blkoff(map->m_la);
142 map->m_plen = inode->i_size - offset;
143
144 /* inline data should be located in one meta block */
145 if (erofs_blkoff(map->m_pa) + map->m_plen > PAGE_SIZE) {
146 errln("inline data cross block boundary @ nid %llu",
147 vi->nid);
148 DBG_BUGON(1);
149 err = -EFSCORRUPTED;
150 goto err_out;
151 }
152
153 map->m_flags |= EROFS_MAP_META;
154 } else {
155 errln("internal error @ nid: %llu (size %llu), m_la 0x%llx",
156 vi->nid, inode->i_size, map->m_la);
157 DBG_BUGON(1);
158 err = -EIO;
159 goto err_out;
160 }
161
162out:
163 map->m_llen = map->m_plen;
164
165err_out:
166 trace_erofs_map_blocks_flatmode_exit(inode, map, flags, 0);
167 return err;
168}
169
170int erofs_map_blocks(struct inode *inode,
171 struct erofs_map_blocks *map, int flags)
172{
173 if (unlikely(is_inode_layout_compression(inode))) {
174 int err = z_erofs_map_blocks_iter(inode, map, flags);
175
176 if (map->mpage) {
177 put_page(map->mpage);
178 map->mpage = NULL;
179 }
180 return err;
181 }
182 return erofs_map_blocks_flatmode(inode, map, flags);
183}
184
185static inline struct bio *erofs_read_raw_page(struct bio *bio,
186 struct address_space *mapping,
187 struct page *page,
188 erofs_off_t *last_block,
189 unsigned int nblocks,
190 bool ra)
191{
192 struct inode *const inode = mapping->host;
193 struct super_block *const sb = inode->i_sb;
194 erofs_off_t current_block = (erofs_off_t)page->index;
195 int err;
196
197 DBG_BUGON(!nblocks);
198
199 if (PageUptodate(page)) {
200 err = 0;
201 goto has_updated;
202 }
203
204	/* note that for the readpage case, bio also equals NULL */
205 if (bio &&
206 /* not continuous */
207 *last_block + 1 != current_block) {
208submit_bio_retry:
209 __submit_bio(bio, REQ_OP_READ, 0);
210 bio = NULL;
211 }
212
213 if (!bio) {
214 struct erofs_map_blocks map = {
215 .m_la = blknr_to_addr(current_block),
216 };
217 erofs_blk_t blknr;
218 unsigned int blkoff;
219
220 err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
221 if (unlikely(err))
222 goto err_out;
223
224 /* zero out the holed page */
225 if (unlikely(!(map.m_flags & EROFS_MAP_MAPPED))) {
226 zero_user_segment(page, 0, PAGE_SIZE);
227 SetPageUptodate(page);
228
229 /* imply err = 0, see erofs_map_blocks */
230 goto has_updated;
231 }
232
233 /* for RAW access mode, m_plen must be equal to m_llen */
234 DBG_BUGON(map.m_plen != map.m_llen);
235
236 blknr = erofs_blknr(map.m_pa);
237 blkoff = erofs_blkoff(map.m_pa);
238
239 /* deal with inline page */
240 if (map.m_flags & EROFS_MAP_META) {
241 void *vsrc, *vto;
242 struct page *ipage;
243
244 DBG_BUGON(map.m_plen > PAGE_SIZE);
245
246 ipage = erofs_get_meta_page(inode->i_sb, blknr, 0);
247
248 if (IS_ERR(ipage)) {
249 err = PTR_ERR(ipage);
250 goto err_out;
251 }
252
253 vsrc = kmap_atomic(ipage);
254 vto = kmap_atomic(page);
255 memcpy(vto, vsrc + blkoff, map.m_plen);
256 memset(vto + map.m_plen, 0, PAGE_SIZE - map.m_plen);
257 kunmap_atomic(vto);
258 kunmap_atomic(vsrc);
259 flush_dcache_page(page);
260
261 SetPageUptodate(page);
262 /* TODO: could we unlock the page earlier? */
263 unlock_page(ipage);
264 put_page(ipage);
265
266 /* imply err = 0, see erofs_map_blocks */
267 goto has_updated;
268 }
269
270 /* pa must be block-aligned for raw reading */
271 DBG_BUGON(erofs_blkoff(map.m_pa));
272
273 /* max # of continuous pages */
274 if (nblocks > DIV_ROUND_UP(map.m_plen, PAGE_SIZE))
275 nblocks = DIV_ROUND_UP(map.m_plen, PAGE_SIZE);
276 if (nblocks > BIO_MAX_PAGES)
277 nblocks = BIO_MAX_PAGES;
278
279 bio = erofs_grab_bio(sb, blknr, nblocks, sb,
280 read_endio, false);
281 if (IS_ERR(bio)) {
282 err = PTR_ERR(bio);
283 bio = NULL;
284 goto err_out;
285 }
286 }
287
288 err = bio_add_page(bio, page, PAGE_SIZE, 0);
289 /* out of the extent or bio is full */
290 if (err < PAGE_SIZE)
291 goto submit_bio_retry;
292
293 *last_block = current_block;
294
295	/* submit the bio in advance in case it is followed by too many gaps */
296 if (bio->bi_iter.bi_size >= bio->bi_max_vecs * PAGE_SIZE) {
297 /* err should reassign to 0 after submitting */
298 err = 0;
299 goto submit_bio_out;
300 }
301
302 return bio;
303
304err_out:
305 /* for sync reading, set page error immediately */
306 if (!ra) {
307 SetPageError(page);
308 ClearPageUptodate(page);
309 }
310has_updated:
311 unlock_page(page);
312
313	/* if updated manually, continuous pages have a gap */
314 if (bio)
315submit_bio_out:
316 __submit_bio(bio, REQ_OP_READ, 0);
317
318 return unlikely(err) ? ERR_PTR(err) : NULL;
319}
320
321/*
322 * since we don't have write or truncate flows, no inode
323 * locking needs to be held at the moment.
324 */
325static int erofs_raw_access_readpage(struct file *file, struct page *page)
326{
327 erofs_off_t last_block;
328 struct bio *bio;
329
330 trace_erofs_readpage(page, true);
331
332 bio = erofs_read_raw_page(NULL, page->mapping,
333 page, &last_block, 1, false);
334
335 if (IS_ERR(bio))
336 return PTR_ERR(bio);
337
338 DBG_BUGON(bio); /* since we have only one bio -- must be NULL */
339 return 0;
340}
341
342static int erofs_raw_access_readpages(struct file *filp,
343 struct address_space *mapping,
344 struct list_head *pages,
345 unsigned int nr_pages)
346{
347 erofs_off_t last_block;
348 struct bio *bio = NULL;
349 gfp_t gfp = readahead_gfp_mask(mapping);
350 struct page *page = list_last_entry(pages, struct page, lru);
351
352 trace_erofs_readpages(mapping->host, page, nr_pages, true);
353
354 for (; nr_pages; --nr_pages) {
355 page = list_entry(pages->prev, struct page, lru);
356
357 prefetchw(&page->flags);
358 list_del(&page->lru);
359
360 if (!add_to_page_cache_lru(page, mapping, page->index, gfp)) {
361 bio = erofs_read_raw_page(bio, mapping, page,
362 &last_block, nr_pages, true);
363
364 /* all the page errors are ignored when readahead */
365 if (IS_ERR(bio)) {
366 pr_err("%s, readahead error at page %lu of nid %llu\n",
367 __func__, page->index,
368 EROFS_V(mapping->host)->nid);
369
370 bio = NULL;
371 }
372 }
373
374 /* pages could still be locked */
375 put_page(page);
376 }
377 DBG_BUGON(!list_empty(pages));
378
379 /* the rare case (end in gaps) */
380 if (unlikely(bio))
381 __submit_bio(bio, REQ_OP_READ, 0);
382 return 0;
383}
384
385static int erofs_get_block(struct inode *inode, sector_t iblock,
386 struct buffer_head *bh, int create)
387{
388 struct erofs_map_blocks map = {
389 .m_la = iblock << 9,
390 };
391 int err;
392
393 err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW);
394 if (err)
395 return err;
396
397 if (map.m_flags & EROFS_MAP_MAPPED)
398 bh->b_blocknr = erofs_blknr(map.m_pa);
399
400 return err;
401}
402
403static sector_t erofs_bmap(struct address_space *mapping, sector_t block)
404{
405 struct inode *inode = mapping->host;
406
407 if (is_inode_flat_inline(inode)) {
408 erofs_blk_t blks = i_size_read(inode) >> LOG_BLOCK_SIZE;
409
410 if (block >> LOG_SECTORS_PER_BLOCK >= blks)
411 return 0;
412 }
413
414 return generic_block_bmap(mapping, block, erofs_get_block);
415}
416
417/* for uncompressed (aligned) files and raw access for other files */
418const struct address_space_operations erofs_raw_access_aops = {
419 .readpage = erofs_raw_access_readpage,
420 .readpages = erofs_raw_access_readpages,
421 .bmap = erofs_bmap,
422};
423
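As a side note on the flat-inline branch of erofs_map_blocks_flatmode() above: the physical address of the inline tail data is a pure offset computation over the on-disk inode. A standalone sketch under the definitions from internal.h, where flatinline_pa() is a hypothetical name and not part of the patch:

#include "internal.h"

/* sketch: physical address of the inline (tail) data of a flat-inline inode */
static erofs_off_t flatinline_pa(struct erofs_sb_info *sbi,
				 struct erofs_vnode *vi, erofs_off_t la)
{
	/* the tail block sits right after the on-disk inode and its xattrs */
	return iloc(sbi, vi->nid) + vi->inode_isize + vi->xattr_isize +
	       erofs_blkoff(la);
}

This is also why the code above insists that erofs_blkoff(m_pa) + m_plen stays within one block: the inline tail must not cross a meta block boundary.
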
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
new file mode 100644
index 000000000000..5f4b7f302863
--- /dev/null
+++ b/fs/erofs/decompressor.c
@@ -0,0 +1,358 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2019 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#include "compress.h"
8#include <linux/module.h>
9#include <linux/lz4.h>
10
11#ifndef LZ4_DISTANCE_MAX /* history window size */
12#define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */
13#endif
14
15#define LZ4_MAX_DISTANCE_PAGES (DIV_ROUND_UP(LZ4_DISTANCE_MAX, PAGE_SIZE) + 1)
16#ifndef LZ4_DECOMPRESS_INPLACE_MARGIN
17#define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32)
18#endif
19
20struct z_erofs_decompressor {
21 /*
22	 * if destpages contain sparse holes, fill them with bounce pages.
23	 * it also checks whether destpages indicate continuous physical memory.
24 */
25 int (*prepare_destpages)(struct z_erofs_decompress_req *rq,
26 struct list_head *pagepool);
27 int (*decompress)(struct z_erofs_decompress_req *rq, u8 *out);
28 char *name;
29};
30
31static bool use_vmap;
32module_param(use_vmap, bool, 0444);
33MODULE_PARM_DESC(use_vmap, "Use vmap() instead of vm_map_ram() (default 0)");
34
35static int lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
36 struct list_head *pagepool)
37{
38 const unsigned int nr =
39 PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
40 struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL };
41 unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES,
42 BITS_PER_LONG)] = { 0 };
43 void *kaddr = NULL;
44 unsigned int i, j, top;
45
46 top = 0;
47 for (i = j = 0; i < nr; ++i, ++j) {
48 struct page *const page = rq->out[i];
49 struct page *victim;
50
51 if (j >= LZ4_MAX_DISTANCE_PAGES)
52 j = 0;
53
54 /* 'valid' bounced can only be tested after a complete round */
55 if (test_bit(j, bounced)) {
56 DBG_BUGON(i < LZ4_MAX_DISTANCE_PAGES);
57 DBG_BUGON(top >= LZ4_MAX_DISTANCE_PAGES);
58 availables[top++] = rq->out[i - LZ4_MAX_DISTANCE_PAGES];
59 }
60
61 if (page) {
62 __clear_bit(j, bounced);
63 if (kaddr) {
64 if (kaddr + PAGE_SIZE == page_address(page))
65 kaddr += PAGE_SIZE;
66 else
67 kaddr = NULL;
68 } else if (!i) {
69 kaddr = page_address(page);
70 }
71 continue;
72 }
73 kaddr = NULL;
74 __set_bit(j, bounced);
75
76 if (top) {
77 victim = availables[--top];
78 get_page(victim);
79 } else {
80 victim = erofs_allocpage(pagepool, GFP_KERNEL, false);
81 if (unlikely(!victim))
82 return -ENOMEM;
83 victim->mapping = Z_EROFS_MAPPING_STAGING;
84 }
85 rq->out[i] = victim;
86 }
87 return kaddr ? 1 : 0;
88}
89
90static void *generic_copy_inplace_data(struct z_erofs_decompress_req *rq,
91 u8 *src, unsigned int pageofs_in)
92{
93 /*
94	 * if in-place decompression is ongoing, the compressed data should be
95	 * copied out first so that it won't be overwritten by the output.
96 */
97 struct page **in = rq->in;
98 u8 *const tmp = erofs_get_pcpubuf(0);
99 u8 *tmpp = tmp;
100 unsigned int inlen = rq->inputsize - pageofs_in;
101 unsigned int count = min_t(uint, inlen, PAGE_SIZE - pageofs_in);
102
103 while (tmpp < tmp + inlen) {
104 if (!src)
105 src = kmap_atomic(*in);
106 memcpy(tmpp, src + pageofs_in, count);
107 kunmap_atomic(src);
108 src = NULL;
109 tmpp += count;
110 pageofs_in = 0;
111 count = PAGE_SIZE;
112 ++in;
113 }
114 return tmp;
115}
116
117static int lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
118{
119 unsigned int inputmargin, inlen;
120 u8 *src;
121 bool copied, support_0padding;
122 int ret;
123
124 if (rq->inputsize > PAGE_SIZE)
125 return -EOPNOTSUPP;
126
127 src = kmap_atomic(*rq->in);
128 inputmargin = 0;
129 support_0padding = false;
130
131 /* decompression inplace is only safe when 0padding is enabled */
132 if (EROFS_SB(rq->sb)->requirements & EROFS_REQUIREMENT_LZ4_0PADDING) {
133 support_0padding = true;
134
135 while (!src[inputmargin & ~PAGE_MASK])
136 if (!(++inputmargin & ~PAGE_MASK))
137 break;
138
139 if (inputmargin >= rq->inputsize) {
140 kunmap_atomic(src);
141 return -EIO;
142 }
143 }
144
145 copied = false;
146 inlen = rq->inputsize - inputmargin;
147 if (rq->inplace_io) {
148 const uint oend = (rq->pageofs_out +
149 rq->outputsize) & ~PAGE_MASK;
150 const uint nr = PAGE_ALIGN(rq->pageofs_out +
151 rq->outputsize) >> PAGE_SHIFT;
152
153 if (rq->partial_decoding || !support_0padding ||
154 rq->out[nr - 1] != rq->in[0] ||
155 rq->inputsize - oend <
156 LZ4_DECOMPRESS_INPLACE_MARGIN(inlen)) {
157 src = generic_copy_inplace_data(rq, src, inputmargin);
158 inputmargin = 0;
159 copied = true;
160 }
161 }
162
163 ret = LZ4_decompress_safe_partial(src + inputmargin, out,
164 inlen, rq->outputsize,
165 rq->outputsize);
166 if (ret < 0) {
167 errln("%s, failed to decompress, in[%p, %u, %u] out[%p, %u]",
168 __func__, src + inputmargin, inlen, inputmargin,
169 out, rq->outputsize);
170 WARN_ON(1);
171 print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET,
172 16, 1, src + inputmargin, inlen, true);
173 print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET,
174 16, 1, out, rq->outputsize, true);
175 ret = -EIO;
176 }
177
178 if (copied)
179 erofs_put_pcpubuf(src);
180 else
181 kunmap_atomic(src);
182 return ret;
183}
184
185static struct z_erofs_decompressor decompressors[] = {
186 [Z_EROFS_COMPRESSION_SHIFTED] = {
187 .name = "shifted"
188 },
189 [Z_EROFS_COMPRESSION_LZ4] = {
190 .prepare_destpages = lz4_prepare_destpages,
191 .decompress = lz4_decompress,
192 .name = "lz4"
193 },
194};
195
196static void copy_from_pcpubuf(struct page **out, const char *dst,
197 unsigned short pageofs_out,
198 unsigned int outputsize)
199{
200 const char *end = dst + outputsize;
201 const unsigned int righthalf = PAGE_SIZE - pageofs_out;
202 const char *cur = dst - pageofs_out;
203
204 while (cur < end) {
205 struct page *const page = *out++;
206
207 if (page) {
208 char *buf = kmap_atomic(page);
209
210 if (cur >= dst) {
211 memcpy(buf, cur, min_t(uint, PAGE_SIZE,
212 end - cur));
213 } else {
214 memcpy(buf + pageofs_out, cur + pageofs_out,
215 min_t(uint, righthalf, end - cur));
216 }
217 kunmap_atomic(buf);
218 }
219 cur += PAGE_SIZE;
220 }
221}
222
223static void *erofs_vmap(struct page **pages, unsigned int count)
224{
225 int i = 0;
226
227 if (use_vmap)
228 return vmap(pages, count, VM_MAP, PAGE_KERNEL);
229
230 while (1) {
231 void *addr = vm_map_ram(pages, count, -1, PAGE_KERNEL);
232
233		/* retry two more times (3 times in total) */
234 if (addr || ++i >= 3)
235 return addr;
236 vm_unmap_aliases();
237 }
238 return NULL;
239}
240
241static void erofs_vunmap(const void *mem, unsigned int count)
242{
243 if (!use_vmap)
244 vm_unmap_ram(mem, count);
245 else
246 vunmap(mem);
247}
248
249static int decompress_generic(struct z_erofs_decompress_req *rq,
250 struct list_head *pagepool)
251{
252 const unsigned int nrpages_out =
253 PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
254 const struct z_erofs_decompressor *alg = decompressors + rq->alg;
255 unsigned int dst_maptype;
256 void *dst;
257 int ret;
258
259 if (nrpages_out == 1 && !rq->inplace_io) {
260 DBG_BUGON(!*rq->out);
261 dst = kmap_atomic(*rq->out);
262 dst_maptype = 0;
263 goto dstmap_out;
264 }
265
266 /*
267 * For the case of small output size (especially much less
268	 * than PAGE_SIZE), it is preferable to memcpy the decompressed
269	 * data rather than the compressed data.
270 */
271 if (rq->outputsize <= PAGE_SIZE * 7 / 8) {
272 dst = erofs_get_pcpubuf(0);
273 if (IS_ERR(dst))
274 return PTR_ERR(dst);
275
276 rq->inplace_io = false;
277 ret = alg->decompress(rq, dst);
278 if (!ret)
279 copy_from_pcpubuf(rq->out, dst, rq->pageofs_out,
280 rq->outputsize);
281
282 erofs_put_pcpubuf(dst);
283 return ret;
284 }
285
286 ret = alg->prepare_destpages(rq, pagepool);
287 if (ret < 0) {
288 return ret;
289 } else if (ret) {
290 dst = page_address(*rq->out);
291 dst_maptype = 1;
292 goto dstmap_out;
293 }
294
295 dst = erofs_vmap(rq->out, nrpages_out);
296 if (!dst)
297 return -ENOMEM;
298 dst_maptype = 2;
299
300dstmap_out:
301 ret = alg->decompress(rq, dst + rq->pageofs_out);
302
303 if (!dst_maptype)
304 kunmap_atomic(dst);
305 else if (dst_maptype == 2)
306 erofs_vunmap(dst, nrpages_out);
307 return ret;
308}
309
310static int shifted_decompress(const struct z_erofs_decompress_req *rq,
311 struct list_head *pagepool)
312{
313 const unsigned int nrpages_out =
314 PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT;
315 const unsigned int righthalf = PAGE_SIZE - rq->pageofs_out;
316 unsigned char *src, *dst;
317
318 if (nrpages_out > 2) {
319 DBG_BUGON(1);
320 return -EIO;
321 }
322
323 if (rq->out[0] == *rq->in) {
324 DBG_BUGON(nrpages_out != 1);
325 return 0;
326 }
327
328 src = kmap_atomic(*rq->in);
329 if (!rq->out[0]) {
330 dst = NULL;
331 } else {
332 dst = kmap_atomic(rq->out[0]);
333 memcpy(dst + rq->pageofs_out, src, righthalf);
334 }
335
336 if (rq->out[1] == *rq->in) {
337 memmove(src, src + righthalf, rq->pageofs_out);
338 } else if (nrpages_out == 2) {
339 if (dst)
340 kunmap_atomic(dst);
341 DBG_BUGON(!rq->out[1]);
342 dst = kmap_atomic(rq->out[1]);
343 memcpy(dst, src + righthalf, rq->pageofs_out);
344 }
345 if (dst)
346 kunmap_atomic(dst);
347 kunmap_atomic(src);
348 return 0;
349}
350
351int z_erofs_decompress(struct z_erofs_decompress_req *rq,
352 struct list_head *pagepool)
353{
354 if (rq->alg == Z_EROFS_COMPRESSION_SHIFTED)
355 return shifted_decompress(rq, pagepool);
356 return decompress_generic(rq, pagepool);
357}
358
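The in-place decompression test in lz4_decompress() above is easier to follow when the safety condition is pulled out on its own. A hedged sketch, where inplace_is_safe() is a hypothetical helper and the real check additionally requires 0padding support, no partial decoding, and that the last output page is the input page:

/* sketch: is it safe to decompress in place, reusing the input page? */
static bool inplace_is_safe(unsigned int inputsize, unsigned int oend,
			    unsigned int inlen)
{
	/*
	 * mirrors the inequality used above: with LZ4's usual in-place
	 * margin, e.g. (4096 >> 8) + 32 = 48 bytes for a 4096-byte input,
	 * the compressed data may be reused as output only if enough
	 * slack is left past the decompressed end.
	 */
	return inputsize - oend >= LZ4_DECOMPRESS_INPLACE_MARGIN(inlen);
}
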
diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c
new file mode 100644
index 000000000000..1976e60e5174
--- /dev/null
+++ b/fs/erofs/dir.c
@@ -0,0 +1,139 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2017-2018 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#include "internal.h"
8
9static void debug_one_dentry(unsigned char d_type, const char *de_name,
10 unsigned int de_namelen)
11{
12#ifdef CONFIG_EROFS_FS_DEBUG
13	/* since the on-disk name doesn't carry a trailing '\0' */
14 unsigned char dbg_namebuf[EROFS_NAME_LEN + 1];
15
16 memcpy(dbg_namebuf, de_name, de_namelen);
17 dbg_namebuf[de_namelen] = '\0';
18
19 debugln("found dirent %s de_len %u d_type %d", dbg_namebuf,
20 de_namelen, d_type);
21#endif
22}
23
24static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx,
25 void *dentry_blk, unsigned int *ofs,
26 unsigned int nameoff, unsigned int maxsize)
27{
28 struct erofs_dirent *de = dentry_blk + *ofs;
29 const struct erofs_dirent *end = dentry_blk + nameoff;
30
31 while (de < end) {
32 const char *de_name;
33 unsigned int de_namelen;
34 unsigned char d_type;
35
36 d_type = fs_ftype_to_dtype(de->file_type);
37
38 nameoff = le16_to_cpu(de->nameoff);
39 de_name = (char *)dentry_blk + nameoff;
40
41 /* the last dirent in the block? */
42 if (de + 1 >= end)
43 de_namelen = strnlen(de_name, maxsize - nameoff);
44 else
45 de_namelen = le16_to_cpu(de[1].nameoff) - nameoff;
46
47 /* a corrupted entry is found */
48 if (unlikely(nameoff + de_namelen > maxsize ||
49 de_namelen > EROFS_NAME_LEN)) {
50 errln("bogus dirent @ nid %llu", EROFS_V(dir)->nid);
51 DBG_BUGON(1);
52 return -EFSCORRUPTED;
53 }
54
55 debug_one_dentry(d_type, de_name, de_namelen);
56 if (!dir_emit(ctx, de_name, de_namelen,
57 le64_to_cpu(de->nid), d_type))
58 /* stopped by some reason */
59 return 1;
60 ++de;
61 *ofs += sizeof(struct erofs_dirent);
62 }
63 *ofs = maxsize;
64 return 0;
65}
66
67static int erofs_readdir(struct file *f, struct dir_context *ctx)
68{
69 struct inode *dir = file_inode(f);
70 struct address_space *mapping = dir->i_mapping;
71 const size_t dirsize = i_size_read(dir);
72 unsigned int i = ctx->pos / EROFS_BLKSIZ;
73 unsigned int ofs = ctx->pos % EROFS_BLKSIZ;
74 int err = 0;
75 bool initial = true;
76
77 while (ctx->pos < dirsize) {
78 struct page *dentry_page;
79 struct erofs_dirent *de;
80 unsigned int nameoff, maxsize;
81
82 dentry_page = read_mapping_page(mapping, i, NULL);
83 if (dentry_page == ERR_PTR(-ENOMEM)) {
84 err = -ENOMEM;
85 break;
86 } else if (IS_ERR(dentry_page)) {
87 errln("fail to readdir of logical block %u of nid %llu",
88 i, EROFS_V(dir)->nid);
89 err = -EFSCORRUPTED;
90 break;
91 }
92
93 de = (struct erofs_dirent *)kmap(dentry_page);
94
95 nameoff = le16_to_cpu(de->nameoff);
96
97 if (unlikely(nameoff < sizeof(struct erofs_dirent) ||
98 nameoff >= PAGE_SIZE)) {
99 errln("%s, invalid de[0].nameoff %u @ nid %llu",
100 __func__, nameoff, EROFS_V(dir)->nid);
101 err = -EFSCORRUPTED;
102 goto skip_this;
103 }
104
105 maxsize = min_t(unsigned int,
106 dirsize - ctx->pos + ofs, PAGE_SIZE);
107
108		/* search dirents starting at an arbitrary position */
109 if (unlikely(initial)) {
110 initial = false;
111
112 ofs = roundup(ofs, sizeof(struct erofs_dirent));
113 if (unlikely(ofs >= nameoff))
114 goto skip_this;
115 }
116
117 err = erofs_fill_dentries(dir, ctx, de, &ofs,
118 nameoff, maxsize);
119skip_this:
120 kunmap(dentry_page);
121
122 put_page(dentry_page);
123
124 ctx->pos = blknr_to_addr(i) + ofs;
125
126 if (unlikely(err))
127 break;
128 ++i;
129 ofs = 0;
130 }
131 return err < 0 ? err : 0;
132}
133
134const struct file_operations erofs_dir_fops = {
135 .llseek = generic_file_llseek,
136 .read = generic_read_dir,
137 .iterate_shared = erofs_readdir,
138};
139
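The dirent layout that erofs_fill_dentries() above relies on (a dirent array at the start of the block, names packed back to back after it, so the first nameoff also marks the end of the array) can be summed up in a short sketch; dirent_name() is a hypothetical helper and skips the bounds checks the real code performs:

/* sketch: name and length of the k-th dirent inside one dentry block */
static const char *dirent_name(void *dentry_blk, unsigned int k,
			       unsigned int maxsize, unsigned int *namelen)
{
	struct erofs_dirent *de = dentry_blk;
	/* nameoff of dirent 0 marks the end of the dirent array */
	const unsigned int ndirents = le16_to_cpu(de->nameoff) /
				      sizeof(struct erofs_dirent);
	const unsigned int nameoff = le16_to_cpu(de[k].nameoff);

	if (k + 1 < ndirents)	/* names are packed back to back */
		*namelen = le16_to_cpu(de[k + 1].nameoff) - nameoff;
	else			/* the last name is bounded by the block end */
		*namelen = strnlen((char *)dentry_blk + nameoff,
				   maxsize - nameoff);
	return (char *)dentry_blk + nameoff;
}
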
diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
new file mode 100644
index 000000000000..afa7d45ca958
--- /dev/null
+++ b/fs/erofs/erofs_fs.h
@@ -0,0 +1,307 @@
1/* SPDX-License-Identifier: GPL-2.0-only OR Apache-2.0 */
2/*
3 * Copyright (C) 2017-2018 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#ifndef __EROFS_FS_H
8#define __EROFS_FS_H
9
10/* Enhanced(Extended) ROM File System */
11#define EROFS_SUPER_OFFSET 1024
12
13/*
14 * Any bits that aren't in EROFS_ALL_REQUIREMENTS should be
15 * incompatible with this kernel version.
16 */
17#define EROFS_REQUIREMENT_LZ4_0PADDING 0x00000001
18#define EROFS_ALL_REQUIREMENTS EROFS_REQUIREMENT_LZ4_0PADDING
19
20struct erofs_super_block {
21/* 0 */__le32 magic; /* in the little endian */
22/* 4 */__le32 checksum; /* crc32c(super_block) */
23/* 8 */__le32 features; /* (aka. feature_compat) */
24/* 12 */__u8 blkszbits; /* support block_size == PAGE_SIZE only */
25/* 13 */__u8 reserved;
26
27/* 14 */__le16 root_nid;
28/* 16 */__le64 inos; /* total valid ino # (== f_files - f_favail) */
29
30/* 24 */__le64 build_time; /* inode v1 time derivation */
31/* 32 */__le32 build_time_nsec;
32/* 36 */__le32 blocks; /* used for statfs */
33/* 40 */__le32 meta_blkaddr;
34/* 44 */__le32 xattr_blkaddr;
35/* 48 */__u8 uuid[16]; /* 128-bit uuid for volume */
36/* 64 */__u8 volume_name[16]; /* volume name */
37/* 80 */__le32 requirements; /* (aka. feature_incompat) */
38
39/* 84 */__u8 reserved2[44];
40} __packed; /* 128 bytes */
41
42/*
43 * erofs inode data mapping:
44 * 0 - inode plain without inline data A:
45 * inode, [xattrs], ... | ... | no-holed data
46 * 1 - inode VLE compression B (legacy):
47 * inode, [xattrs], extents ... | ...
48 * 2 - inode plain with inline data C:
49 * inode, [xattrs], last_inline_data, ... | ... | no-holed data
50 * 3 - inode compression D:
51 * inode, [xattrs], map_header, extents ... | ...
52 * 4~7 - reserved
53 */
54enum {
55 EROFS_INODE_FLAT_PLAIN,
56 EROFS_INODE_FLAT_COMPRESSION_LEGACY,
57 EROFS_INODE_FLAT_INLINE,
58 EROFS_INODE_FLAT_COMPRESSION,
59 EROFS_INODE_LAYOUT_MAX
60};
61
62static inline bool erofs_inode_is_data_compressed(unsigned int datamode)
63{
64 if (datamode == EROFS_INODE_FLAT_COMPRESSION)
65 return true;
66 return datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY;
67}
68
69/* bit definitions of inode i_advise */
70#define EROFS_I_VERSION_BITS 1
71#define EROFS_I_DATA_MAPPING_BITS 3
72
73#define EROFS_I_VERSION_BIT 0
74#define EROFS_I_DATA_MAPPING_BIT 1
75
76struct erofs_inode_v1 {
77/* 0 */__le16 i_advise;
78
79/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */
80/* 2 */__le16 i_xattr_icount;
81/* 4 */__le16 i_mode;
82/* 6 */__le16 i_nlink;
83/* 8 */__le32 i_size;
84/* 12 */__le32 i_reserved;
85/* 16 */union {
86 /* file total compressed blocks for data mapping 1 */
87 __le32 compressed_blocks;
88 __le32 raw_blkaddr;
89
90 /* for device files, used to indicate old/new device # */
91 __le32 rdev;
92 } i_u __packed;
93/* 20 */__le32 i_ino; /* only used for 32-bit stat compatibility */
94/* 24 */__le16 i_uid;
95/* 26 */__le16 i_gid;
96/* 28 */__le32 i_reserved2;
97} __packed;
98
99/* 32 bytes on-disk inode */
100#define EROFS_INODE_LAYOUT_V1 0
101/* 64 bytes on-disk inode */
102#define EROFS_INODE_LAYOUT_V2 1
103
104struct erofs_inode_v2 {
105/* 0 */__le16 i_advise;
106
107/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */
108/* 2 */__le16 i_xattr_icount;
109/* 4 */__le16 i_mode;
110/* 6 */__le16 i_reserved;
111/* 8 */__le64 i_size;
112/* 16 */union {
113 /* file total compressed blocks for data mapping 1 */
114 __le32 compressed_blocks;
115 __le32 raw_blkaddr;
116
117 /* for device files, used to indicate old/new device # */
118 __le32 rdev;
119 } i_u __packed;
120
121 /* only used for 32-bit stat compatibility */
122/* 20 */__le32 i_ino;
123
124/* 24 */__le32 i_uid;
125/* 28 */__le32 i_gid;
126/* 32 */__le64 i_ctime;
127/* 40 */__le32 i_ctime_nsec;
128/* 44 */__le32 i_nlink;
129/* 48 */__u8 i_reserved2[16];
130} __packed; /* 64 bytes */
131
132#define EROFS_MAX_SHARED_XATTRS (128)
133/* h_shared_count between 129 ... 255 are special # */
134#define EROFS_SHARED_XATTR_EXTENT (255)
135
136/*
137 * inline xattrs (n == i_xattr_icount):
138 * erofs_xattr_ibody_header(1) + (n - 1) * 4 bytes
139 * 12 bytes / \
140 * / \
141 * /-----------------------\
142 * | erofs_xattr_entries+ |
143 * +-----------------------+
144 * inline xattrs must start with erofs_xattr_ibody_header;
145 * for a read-only fs, there is no need to introduce h_refcount
146 */
147struct erofs_xattr_ibody_header {
148 __le32 h_reserved;
149 __u8 h_shared_count;
150 __u8 h_reserved2[7];
151 __le32 h_shared_xattrs[0]; /* shared xattr id array */
152} __packed;
153
154/* Name indexes */
155#define EROFS_XATTR_INDEX_USER 1
156#define EROFS_XATTR_INDEX_POSIX_ACL_ACCESS 2
157#define EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT 3
158#define EROFS_XATTR_INDEX_TRUSTED 4
159#define EROFS_XATTR_INDEX_LUSTRE 5
160#define EROFS_XATTR_INDEX_SECURITY 6
161
162/* xattr entry (for both inline & shared xattrs) */
163struct erofs_xattr_entry {
164 __u8 e_name_len; /* length of name */
165 __u8 e_name_index; /* attribute name index */
166 __le16 e_value_size; /* size of attribute value */
167 /* followed by e_name and e_value */
168 char e_name[0]; /* attribute name */
169} __packed;
170
171#define ondisk_xattr_ibody_size(count) ({\
172 u32 __count = le16_to_cpu(count); \
173 ((__count) == 0) ? 0 : \
174 sizeof(struct erofs_xattr_ibody_header) + \
175 sizeof(__u32) * ((__count) - 1); })
176
177#define EROFS_XATTR_ALIGN(size) round_up(size, sizeof(struct erofs_xattr_entry))
178#define EROFS_XATTR_ENTRY_SIZE(entry) EROFS_XATTR_ALIGN( \
179 sizeof(struct erofs_xattr_entry) + \
180 (entry)->e_name_len + le16_to_cpu((entry)->e_value_size))
181
182/* available compression algorithm types */
183enum {
184 Z_EROFS_COMPRESSION_LZ4,
185 Z_EROFS_COMPRESSION_MAX
186};
187
188/*
189 * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on)
190 * e.g. for 4k logical cluster size, 4B if compacted 2B is off;
191 * (4B) + 2B + (4B) if compacted 2B is on.
192 */
193#define Z_EROFS_ADVISE_COMPACTED_2B_BIT 0
194
195#define Z_EROFS_ADVISE_COMPACTED_2B (1 << Z_EROFS_ADVISE_COMPACTED_2B_BIT)
196
197struct z_erofs_map_header {
198 __le32 h_reserved1;
199 __le16 h_advise;
200 /*
201 * bit 0-3 : algorithm type of head 1 (logical cluster type 01);
202 * bit 4-7 : algorithm type of head 2 (logical cluster type 11).
203 */
204 __u8 h_algorithmtype;
205 /*
206 * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096;
207 * bit 3-4 : (physical - logical) cluster bits of head 1:
208 * For example, if logical clustersize = 4096, 1 for 8192.
209 * bit 5-7 : (physical - logical) cluster bits of head 2.
210 */
211 __u8 h_clusterbits;
212};
213
214#define Z_EROFS_VLE_LEGACY_HEADER_PADDING 8
215
216/*
217 * Z_EROFS Variable-sized Logical Extent cluster type:
218 * 0 - literal (uncompressed) cluster
219 * 1 - compressed cluster (for the head logical cluster)
220 * 2 - compressed cluster (for the other logical clusters)
221 *
222 * In detail,
223 * 0 - literal (uncompressed) cluster,
224 * di_advise = 0
225 * di_clusterofs = the literal data offset of the cluster
226 * di_blkaddr = the blkaddr of the literal cluster
227 *
228 * 1 - compressed cluster (for the head logical cluster)
229 * di_advise = 1
230 * di_clusterofs = the decompressed data offset of the cluster
231 * di_blkaddr = the blkaddr of the compressed cluster
232 *
233 * 2 - compressed cluster (for the other logical clusters)
234 * di_advise = 2
235 * di_clusterofs =
236 * the decompressed data offset in its own head cluster
237 * di_u.delta[0] = distance to its corresponding head cluster
238 * di_u.delta[1] = distance to its corresponding tail cluster
239 * (di_advise could be 0, 1 or 2)
240 */
241enum {
242 Z_EROFS_VLE_CLUSTER_TYPE_PLAIN,
243 Z_EROFS_VLE_CLUSTER_TYPE_HEAD,
244 Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD,
245 Z_EROFS_VLE_CLUSTER_TYPE_RESERVED,
246 Z_EROFS_VLE_CLUSTER_TYPE_MAX
247};
248
249#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS 2
250#define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT 0
251
252struct z_erofs_vle_decompressed_index {
253 __le16 di_advise;
254 /* where to decompress in the head cluster */
255 __le16 di_clusterofs;
256
257 union {
258 /* for the head cluster */
259 __le32 blkaddr;
260 /*
261		 * for the other clusters,
262		 * e.g. for a 4k page-sized cluster, maximum 4K*64k = 256M
263 * [0] - pointing to the head cluster
264 * [1] - pointing to the tail cluster
265 */
266 __le16 delta[2];
267 } di_u __packed; /* 8 bytes */
268} __packed;
269
270#define Z_EROFS_VLE_LEGACY_INDEX_ALIGN(size) \
271 (round_up(size, sizeof(struct z_erofs_vle_decompressed_index)) + \
272 sizeof(struct z_erofs_map_header) + Z_EROFS_VLE_LEGACY_HEADER_PADDING)
273
274/* dirents are sorted in alphabetical order, thus we can do binary search */
275struct erofs_dirent {
276 __le64 nid; /* 0, node number */
277 __le16 nameoff; /* 8, start offset of file name */
278 __u8 file_type; /* 10, file type */
279 __u8 reserved; /* 11, reserved */
280} __packed;
281
282/*
283 * EROFS file types should match generic FT_* types and
284 * there seems to be no need to add BUILD_BUG_ONs since a potential
285 * mismatch would break other fses as well...
286 */
287
288#define EROFS_NAME_LEN 255
289
290/* check the EROFS on-disk layout strictly at compile time */
291static inline void erofs_check_ondisk_layout_definitions(void)
292{
293 BUILD_BUG_ON(sizeof(struct erofs_super_block) != 128);
294 BUILD_BUG_ON(sizeof(struct erofs_inode_v1) != 32);
295 BUILD_BUG_ON(sizeof(struct erofs_inode_v2) != 64);
296 BUILD_BUG_ON(sizeof(struct erofs_xattr_ibody_header) != 12);
297 BUILD_BUG_ON(sizeof(struct erofs_xattr_entry) != 4);
298 BUILD_BUG_ON(sizeof(struct z_erofs_map_header) != 8);
299 BUILD_BUG_ON(sizeof(struct z_erofs_vle_decompressed_index) != 8);
300 BUILD_BUG_ON(sizeof(struct erofs_dirent) != 12);
301
302 BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) <
303 Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1);
304}
305
306#endif
307
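For reference, the Z_EROFS_VLE_DI_CLUSTER_TYPE_* bits declared above are intended to be decoded roughly as follows; vle_cluster_type() is a hypothetical name, and the actual consumers live in zmap.c/zdata.c, which belong to this commit but are not shown in this section:

/* sketch: extract the logical cluster type encoded in di_advise */
static unsigned int
vle_cluster_type(const struct z_erofs_vle_decompressed_index *di)
{
	return (le16_to_cpu(di->di_advise) >> Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT) &
	       ((1 << Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) - 1);
}

A result of Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD then tells the mapper to follow di_u.delta[0]/[1] back to the corresponding head and tail clusters, as described in the comment block above.
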
diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c
new file mode 100644
index 000000000000..80f4fe919ee7
--- /dev/null
+++ b/fs/erofs/inode.c
@@ -0,0 +1,332 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2017-2018 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#include "xattr.h"
8
9#include <trace/events/erofs.h>
10
11/* no locking */
12static int read_inode(struct inode *inode, void *data)
13{
14 struct erofs_vnode *vi = EROFS_V(inode);
15 struct erofs_inode_v1 *v1 = data;
16 const unsigned int advise = le16_to_cpu(v1->i_advise);
17 erofs_blk_t nblks = 0;
18
19 vi->datamode = __inode_data_mapping(advise);
20
21 if (unlikely(vi->datamode >= EROFS_INODE_LAYOUT_MAX)) {
22 errln("unsupported data mapping %u of nid %llu",
23 vi->datamode, vi->nid);
24 DBG_BUGON(1);
25 return -EOPNOTSUPP;
26 }
27
28 if (__inode_version(advise) == EROFS_INODE_LAYOUT_V2) {
29 struct erofs_inode_v2 *v2 = data;
30
31 vi->inode_isize = sizeof(struct erofs_inode_v2);
32 vi->xattr_isize = ondisk_xattr_ibody_size(v2->i_xattr_icount);
33
34 inode->i_mode = le16_to_cpu(v2->i_mode);
35 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
36 S_ISLNK(inode->i_mode))
37 vi->raw_blkaddr = le32_to_cpu(v2->i_u.raw_blkaddr);
38 else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
39 inode->i_rdev =
40 new_decode_dev(le32_to_cpu(v2->i_u.rdev));
41 else if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode))
42 inode->i_rdev = 0;
43 else
44 goto bogusimode;
45
46 i_uid_write(inode, le32_to_cpu(v2->i_uid));
47 i_gid_write(inode, le32_to_cpu(v2->i_gid));
48 set_nlink(inode, le32_to_cpu(v2->i_nlink));
49
50 /* ns timestamp */
51 inode->i_mtime.tv_sec = inode->i_ctime.tv_sec =
52 le64_to_cpu(v2->i_ctime);
53 inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec =
54 le32_to_cpu(v2->i_ctime_nsec);
55
56 inode->i_size = le64_to_cpu(v2->i_size);
57
58 /* total blocks for compressed files */
59 if (is_inode_layout_compression(inode))
60 nblks = le32_to_cpu(v2->i_u.compressed_blocks);
61 } else if (__inode_version(advise) == EROFS_INODE_LAYOUT_V1) {
62 struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
63
64 vi->inode_isize = sizeof(struct erofs_inode_v1);
65 vi->xattr_isize = ondisk_xattr_ibody_size(v1->i_xattr_icount);
66
67 inode->i_mode = le16_to_cpu(v1->i_mode);
68 if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
69 S_ISLNK(inode->i_mode))
70 vi->raw_blkaddr = le32_to_cpu(v1->i_u.raw_blkaddr);
71 else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
72 inode->i_rdev =
73 new_decode_dev(le32_to_cpu(v1->i_u.rdev));
74 else if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode))
75 inode->i_rdev = 0;
76 else
77 goto bogusimode;
78
79 i_uid_write(inode, le16_to_cpu(v1->i_uid));
80 i_gid_write(inode, le16_to_cpu(v1->i_gid));
81 set_nlink(inode, le16_to_cpu(v1->i_nlink));
82
83 /* use build time to derive all file time */
84 inode->i_mtime.tv_sec = inode->i_ctime.tv_sec =
85 sbi->build_time;
86 inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec =
87 sbi->build_time_nsec;
88
89 inode->i_size = le32_to_cpu(v1->i_size);
90 if (is_inode_layout_compression(inode))
91 nblks = le32_to_cpu(v1->i_u.compressed_blocks);
92 } else {
93 errln("unsupported on-disk inode version %u of nid %llu",
94 __inode_version(advise), vi->nid);
95 DBG_BUGON(1);
96 return -EOPNOTSUPP;
97 }
98
99 if (!nblks)
100		/* measure inode.i_blocks as generic filesystems do */
101 inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9;
102 else
103 inode->i_blocks = nblks << LOG_SECTORS_PER_BLOCK;
104 return 0;
105
106bogusimode:
107 errln("bogus i_mode (%o) @ nid %llu", inode->i_mode, vi->nid);
108 DBG_BUGON(1);
109 return -EFSCORRUPTED;
110}
111
112/*
113 * try_lock can be required since locking order is:
114 * file data(fs_inode)
115 * meta(bd_inode)
116 * but the majority of the callers are "iget";
117 * in that case we are pretty sure there is no deadlock since
118 * no data operations exist. However, I tend to
119 * use try_lock since it adds little overhead and
120 * will succeed immediately.
121 */
122static int fill_inline_data(struct inode *inode, void *data,
123 unsigned int m_pofs)
124{
125 struct erofs_vnode *vi = EROFS_V(inode);
126 struct erofs_sb_info *sbi = EROFS_I_SB(inode);
127
128 /* should be inode inline C */
129 if (!is_inode_flat_inline(inode))
130 return 0;
131
132 /* fast symlink (following ext4) */
133 if (S_ISLNK(inode->i_mode) && inode->i_size < PAGE_SIZE) {
134 char *lnk = erofs_kmalloc(sbi, inode->i_size + 1, GFP_KERNEL);
135
136 if (unlikely(!lnk))
137 return -ENOMEM;
138
139 m_pofs += vi->inode_isize + vi->xattr_isize;
140
141		/* inline symlink data shouldn't cross the page boundary either */
142 if (unlikely(m_pofs + inode->i_size > PAGE_SIZE)) {
143 kfree(lnk);
144 errln("inline data cross block boundary @ nid %llu",
145 vi->nid);
146 DBG_BUGON(1);
147 return -EFSCORRUPTED;
148 }
149
150 /* get in-page inline data */
151 memcpy(lnk, data + m_pofs, inode->i_size);
152 lnk[inode->i_size] = '\0';
153
154 inode->i_link = lnk;
155 set_inode_fast_symlink(inode);
156 }
157 return 0;
158}
159
160static int fill_inode(struct inode *inode, int isdir)
161{
162 struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
163 struct erofs_vnode *vi = EROFS_V(inode);
164 struct page *page;
165 void *data;
166 int err;
167 erofs_blk_t blkaddr;
168 unsigned int ofs;
169 erofs_off_t inode_loc;
170
171 trace_erofs_fill_inode(inode, isdir);
172 inode_loc = iloc(sbi, vi->nid);
173 blkaddr = erofs_blknr(inode_loc);
174 ofs = erofs_blkoff(inode_loc);
175
176 debugln("%s, reading inode nid %llu at %u of blkaddr %u",
177 __func__, vi->nid, ofs, blkaddr);
178
179 page = erofs_get_meta_page(inode->i_sb, blkaddr, isdir);
180
181 if (IS_ERR(page)) {
182 errln("failed to get inode (nid: %llu) page, err %ld",
183 vi->nid, PTR_ERR(page));
184 return PTR_ERR(page);
185 }
186
187 DBG_BUGON(!PageUptodate(page));
188 data = page_address(page);
189
190 err = read_inode(inode, data + ofs);
191 if (!err) {
192 /* setup the new inode */
193 if (S_ISREG(inode->i_mode)) {
194 inode->i_op = &erofs_generic_iops;
195 inode->i_fop = &generic_ro_fops;
196 } else if (S_ISDIR(inode->i_mode)) {
197 inode->i_op = &erofs_dir_iops;
198 inode->i_fop = &erofs_dir_fops;
199 } else if (S_ISLNK(inode->i_mode)) {
200 /* by default, page_get_link is used for symlink */
201 inode->i_op = &erofs_symlink_iops;
202 inode_nohighmem(inode);
203 } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
204 S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
205 inode->i_op = &erofs_generic_iops;
206 init_special_inode(inode, inode->i_mode, inode->i_rdev);
207 goto out_unlock;
208 } else {
209 err = -EFSCORRUPTED;
210 goto out_unlock;
211 }
212
213 if (is_inode_layout_compression(inode)) {
214 err = z_erofs_fill_inode(inode);
215 goto out_unlock;
216 }
217
218 inode->i_mapping->a_ops = &erofs_raw_access_aops;
219
220 /* fill last page if inline data is available */
221 err = fill_inline_data(inode, data, ofs);
222 }
223
224out_unlock:
225 unlock_page(page);
226 put_page(page);
227 return err;
228}
229
230/*
231 * erofs nid is 64 bits, but i_ino is 'unsigned long', therefore
232 * we should do more for 32-bit platforms to find the right inode.
233 */
234#if BITS_PER_LONG == 32
235static int erofs_ilookup_test_actor(struct inode *inode, void *opaque)
236{
237 const erofs_nid_t nid = *(erofs_nid_t *)opaque;
238
239 return EROFS_V(inode)->nid == nid;
240}
241
242static int erofs_iget_set_actor(struct inode *inode, void *opaque)
243{
244 const erofs_nid_t nid = *(erofs_nid_t *)opaque;
245
246 inode->i_ino = erofs_inode_hash(nid);
247 return 0;
248}
249#endif
250
251static inline struct inode *erofs_iget_locked(struct super_block *sb,
252 erofs_nid_t nid)
253{
254 const unsigned long hashval = erofs_inode_hash(nid);
255
256#if BITS_PER_LONG >= 64
257 /* it is safe to use iget_locked for >= 64-bit platform */
258 return iget_locked(sb, hashval);
259#else
260 return iget5_locked(sb, hashval, erofs_ilookup_test_actor,
261 erofs_iget_set_actor, &nid);
262#endif
263}
264
265struct inode *erofs_iget(struct super_block *sb,
266 erofs_nid_t nid,
267 bool isdir)
268{
269 struct inode *inode = erofs_iget_locked(sb, nid);
270
271 if (unlikely(!inode))
272 return ERR_PTR(-ENOMEM);
273
274 if (inode->i_state & I_NEW) {
275 int err;
276 struct erofs_vnode *vi = EROFS_V(inode);
277
278 vi->nid = nid;
279
280 err = fill_inode(inode, isdir);
281 if (likely(!err))
282 unlock_new_inode(inode);
283 else {
284 iget_failed(inode);
285 inode = ERR_PTR(err);
286 }
287 }
288 return inode;
289}
290
291int erofs_getattr(const struct path *path, struct kstat *stat,
292 u32 request_mask, unsigned int query_flags)
293{
294 struct inode *const inode = d_inode(path->dentry);
295
296 if (is_inode_layout_compression(inode))
297 stat->attributes |= STATX_ATTR_COMPRESSED;
298
299 stat->attributes |= STATX_ATTR_IMMUTABLE;
300 stat->attributes_mask |= (STATX_ATTR_COMPRESSED |
301 STATX_ATTR_IMMUTABLE);
302
303 generic_fillattr(inode, stat);
304 return 0;
305}
306
307const struct inode_operations erofs_generic_iops = {
308 .getattr = erofs_getattr,
309#ifdef CONFIG_EROFS_FS_XATTR
310 .listxattr = erofs_listxattr,
311#endif
312 .get_acl = erofs_get_acl,
313};
314
315const struct inode_operations erofs_symlink_iops = {
316 .get_link = page_get_link,
317 .getattr = erofs_getattr,
318#ifdef CONFIG_EROFS_FS_XATTR
319 .listxattr = erofs_listxattr,
320#endif
321 .get_acl = erofs_get_acl,
322};
323
324const struct inode_operations erofs_fast_symlink_iops = {
325 .get_link = simple_get_link,
326 .getattr = erofs_getattr,
327#ifdef CONFIG_EROFS_FS_XATTR
328 .listxattr = erofs_listxattr,
329#endif
330 .get_acl = erofs_get_acl,
331};
332
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
new file mode 100644
index 000000000000..620b73fcc416
--- /dev/null
+++ b/fs/erofs/internal.h
@@ -0,0 +1,553 @@
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * Copyright (C) 2017-2018 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#ifndef __EROFS_INTERNAL_H
8#define __EROFS_INTERNAL_H
9
10#include <linux/fs.h>
11#include <linux/dcache.h>
12#include <linux/mm.h>
13#include <linux/pagemap.h>
14#include <linux/bio.h>
15#include <linux/buffer_head.h>
16#include <linux/magic.h>
17#include <linux/slab.h>
18#include <linux/vmalloc.h>
19#include "erofs_fs.h"
20
21/* redefine pr_fmt "erofs: " */
22#undef pr_fmt
23#define pr_fmt(fmt) "erofs: " fmt
24
25#define errln(x, ...) pr_err(x "\n", ##__VA_ARGS__)
26#define infoln(x, ...) pr_info(x "\n", ##__VA_ARGS__)
27#ifdef CONFIG_EROFS_FS_DEBUG
28#define debugln(x, ...) pr_debug(x "\n", ##__VA_ARGS__)
29#define DBG_BUGON BUG_ON
30#else
31#define debugln(x, ...) ((void)0)
32#define DBG_BUGON(x) ((void)(x))
33#endif /* !CONFIG_EROFS_FS_DEBUG */
34
35enum {
36 FAULT_KMALLOC,
37 FAULT_READ_IO,
38 FAULT_MAX,
39};
40
41#ifdef CONFIG_EROFS_FAULT_INJECTION
42extern const char *erofs_fault_name[FAULT_MAX];
43#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type)))
44
45struct erofs_fault_info {
46 atomic_t inject_ops;
47 unsigned int inject_rate;
48 unsigned int inject_type;
49};
50#endif /* CONFIG_EROFS_FAULT_INJECTION */
51
52/* EROFS_SUPER_MAGIC_V1 to represent the whole file system */
53#define EROFS_SUPER_MAGIC EROFS_SUPER_MAGIC_V1
54
55typedef u64 erofs_nid_t;
56typedef u64 erofs_off_t;
57/* data type for filesystem-wide blocks number */
58typedef u32 erofs_blk_t;
59
60struct erofs_sb_info {
61#ifdef CONFIG_EROFS_FS_ZIP
62 /* list for all registered superblocks, mainly for shrinker */
63 struct list_head list;
64 struct mutex umount_mutex;
65
66 /* the dedicated workstation for compression */
67 struct radix_tree_root workstn_tree;
68
69	/* threshold for synchronous decompression */
70 unsigned int max_sync_decompress_pages;
71
72 unsigned int shrinker_run_no;
73
74 /* current strategy of how to use managed cache */
75 unsigned char cache_strategy;
76
77 /* pseudo inode to manage cached pages */
78 struct inode *managed_cache;
79#endif /* CONFIG_EROFS_FS_ZIP */
80 u32 blocks;
81 u32 meta_blkaddr;
82#ifdef CONFIG_EROFS_FS_XATTR
83 u32 xattr_blkaddr;
84#endif
85
86 /* inode slot unit size in bit shift */
87 unsigned char islotbits;
88
89 u32 build_time_nsec;
90 u64 build_time;
91
92	/* what we really care about is nid, rather than ino.. */
93 erofs_nid_t root_nid;
94 /* used for statfs, f_files - f_favail */
95 u64 inos;
96
97 u8 uuid[16]; /* 128-bit uuid for volume */
98 u8 volume_name[16]; /* volume name */
99 u32 requirements;
100
101 unsigned int mount_opt;
102
103#ifdef CONFIG_EROFS_FAULT_INJECTION
104 struct erofs_fault_info fault_info; /* For fault injection */
105#endif
106};
107
108#ifdef CONFIG_EROFS_FAULT_INJECTION
109#define erofs_show_injection_info(type) \
110 infoln("inject %s in %s of %pS", erofs_fault_name[type], \
111 __func__, __builtin_return_address(0))
112
113static inline bool time_to_inject(struct erofs_sb_info *sbi, int type)
114{
115 struct erofs_fault_info *ffi = &sbi->fault_info;
116
117 if (!ffi->inject_rate)
118 return false;
119
120 if (!IS_FAULT_SET(ffi, type))
121 return false;
122
123 atomic_inc(&ffi->inject_ops);
124 if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) {
125 atomic_set(&ffi->inject_ops, 0);
126 return true;
127 }
128 return false;
129}
130#else
131static inline bool time_to_inject(struct erofs_sb_info *sbi, int type)
132{
133 return false;
134}
135
136static inline void erofs_show_injection_info(int type)
137{
138}
139#endif /* !CONFIG_EROFS_FAULT_INJECTION */
140
141static inline void *erofs_kmalloc(struct erofs_sb_info *sbi,
142 size_t size, gfp_t flags)
143{
144 if (time_to_inject(sbi, FAULT_KMALLOC)) {
145 erofs_show_injection_info(FAULT_KMALLOC);
146 return NULL;
147 }
148 return kmalloc(size, flags);
149}
150
151#define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info)
152#define EROFS_I_SB(inode) ((struct erofs_sb_info *)(inode)->i_sb->s_fs_info)
153
154/* Mount flags set via mount options or defaults */
155#define EROFS_MOUNT_XATTR_USER 0x00000010
156#define EROFS_MOUNT_POSIX_ACL 0x00000020
157#define EROFS_MOUNT_FAULT_INJECTION 0x00000040
158
159#define clear_opt(sbi, option) ((sbi)->mount_opt &= ~EROFS_MOUNT_##option)
160#define set_opt(sbi, option) ((sbi)->mount_opt |= EROFS_MOUNT_##option)
161#define test_opt(sbi, option) ((sbi)->mount_opt & EROFS_MOUNT_##option)
162
163#ifdef CONFIG_EROFS_FS_ZIP
164enum {
165 EROFS_ZIP_CACHE_DISABLED,
166 EROFS_ZIP_CACHE_READAHEAD,
167 EROFS_ZIP_CACHE_READAROUND
168};
169
170#define EROFS_LOCKED_MAGIC (INT_MIN | 0xE0F510CCL)
171
172/* basic unit of the workstation of a super_block */
173struct erofs_workgroup {
174 /* the workgroup index in the workstation */
175 pgoff_t index;
176
177 /* overall workgroup reference count */
178 atomic_t refcount;
179};
180
181#if defined(CONFIG_SMP)
182static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp,
183 int val)
184{
185 preempt_disable();
186 if (val != atomic_cmpxchg(&grp->refcount, val, EROFS_LOCKED_MAGIC)) {
187 preempt_enable();
188 return false;
189 }
190 return true;
191}
192
193static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp,
194 int orig_val)
195{
196 /*
197 * other observers should notice all modifications
198 * in the freezing period.
199 */
200 smp_mb();
201 atomic_set(&grp->refcount, orig_val);
202 preempt_enable();
203}
204
205static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
206{
207 return atomic_cond_read_relaxed(&grp->refcount,
208 VAL != EROFS_LOCKED_MAGIC);
209}
210#else
211static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp,
212 int val)
213{
214 preempt_disable();
215 /* no need to spin on UP platforms, let's just disable preemption. */
216 if (val != atomic_read(&grp->refcount)) {
217 preempt_enable();
218 return false;
219 }
220 return true;
221}
222
223static inline void erofs_workgroup_unfreeze(struct erofs_workgroup *grp,
224 int orig_val)
225{
226 preempt_enable();
227}
228
229static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp)
230{
231 int v = atomic_read(&grp->refcount);
232
233	/* workgroup is never frozen on uniprocessor systems */
234 DBG_BUGON(v == EROFS_LOCKED_MAGIC);
235 return v;
236}
237#endif /* !CONFIG_SMP */
238
239/* hard limit of pages per compressed cluster */
240#define Z_EROFS_CLUSTER_MAX_PAGES (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
241#define EROFS_PCPUBUF_NR_PAGES Z_EROFS_CLUSTER_MAX_PAGES
242#else
243#define EROFS_PCPUBUF_NR_PAGES 0
244#endif /* !CONFIG_EROFS_FS_ZIP */
245
246/* we strictly follow PAGE_SIZE and no buffer head yet */
247#define LOG_BLOCK_SIZE PAGE_SHIFT
248
249#undef LOG_SECTORS_PER_BLOCK
250#define LOG_SECTORS_PER_BLOCK (PAGE_SHIFT - 9)
251
252#undef SECTORS_PER_BLOCK
253#define SECTORS_PER_BLOCK	(1 << LOG_SECTORS_PER_BLOCK)
254
255#define EROFS_BLKSIZ (1 << LOG_BLOCK_SIZE)
256
257#if (EROFS_BLKSIZ % 4096 || !EROFS_BLKSIZ)
258#error erofs cannot be used on this platform
259#endif
260
261#define EROFS_IO_MAX_RETRIES_NOFAIL 5
262
263#define ROOT_NID(sb) ((sb)->root_nid)
264
265#define erofs_blknr(addr) ((addr) / EROFS_BLKSIZ)
266#define erofs_blkoff(addr) ((addr) % EROFS_BLKSIZ)
267#define blknr_to_addr(nr) ((erofs_off_t)(nr) * EROFS_BLKSIZ)
268
269static inline erofs_off_t iloc(struct erofs_sb_info *sbi, erofs_nid_t nid)
270{
271 return blknr_to_addr(sbi->meta_blkaddr) + (nid << sbi->islotbits);
272}
273
274/* atomic flag definitions */
275#define EROFS_V_EA_INITED_BIT 0
276#define EROFS_V_Z_INITED_BIT 1
277
278/* bitlock definitions (arranged in reverse order) */
279#define EROFS_V_BL_XATTR_BIT (BITS_PER_LONG - 1)
280#define EROFS_V_BL_Z_BIT (BITS_PER_LONG - 2)
281
282struct erofs_vnode {
283 erofs_nid_t nid;
284
285 /* atomic flags (including bitlocks) */
286 unsigned long flags;
287
288 unsigned char datamode;
289 unsigned char inode_isize;
290 unsigned short xattr_isize;
291
292 unsigned int xattr_shared_count;
293 unsigned int *xattr_shared_xattrs;
294
295 union {
296 erofs_blk_t raw_blkaddr;
297#ifdef CONFIG_EROFS_FS_ZIP
298 struct {
299 unsigned short z_advise;
300 unsigned char z_algorithmtype[2];
301 unsigned char z_logical_clusterbits;
302 unsigned char z_physical_clusterbits[2];
303 };
304#endif /* CONFIG_EROFS_FS_ZIP */
305 };
306 /* the corresponding vfs inode */
307 struct inode vfs_inode;
308};
309
310#define EROFS_V(ptr) \
311 container_of(ptr, struct erofs_vnode, vfs_inode)
312
313#define __inode_advise(x, bit, bits) \
314 (((x) >> (bit)) & ((1 << (bits)) - 1))
315
316#define __inode_version(advise) \
317 __inode_advise(advise, EROFS_I_VERSION_BIT, \
318 EROFS_I_VERSION_BITS)
319
320#define __inode_data_mapping(advise) \
321 __inode_advise(advise, EROFS_I_DATA_MAPPING_BIT,\
322 EROFS_I_DATA_MAPPING_BITS)
323
324static inline unsigned long inode_datablocks(struct inode *inode)
325{
326 /* since i_size cannot be changed */
327 return DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ);
328}
329
330static inline bool is_inode_layout_compression(struct inode *inode)
331{
332 return erofs_inode_is_data_compressed(EROFS_V(inode)->datamode);
333}
334
335static inline bool is_inode_flat_inline(struct inode *inode)
336{
337 return EROFS_V(inode)->datamode == EROFS_INODE_FLAT_INLINE;
338}
339
340extern const struct super_operations erofs_sops;
341
342extern const struct address_space_operations erofs_raw_access_aops;
343#ifdef CONFIG_EROFS_FS_ZIP
344extern const struct address_space_operations z_erofs_vle_normalaccess_aops;
345#endif
346
347/*
348 * Logical to physical block mapping, used by erofs_map_blocks()
349 *
350 * Different from other file systems, it is used for two access modes:
351 *
352 * 1) RAW access mode:
353 *
354 * Users pass a valid (m_lblk, m_lofs -- usually 0) pair,
355 * and get the valid m_pblk, m_pofs and the longest m_len(in bytes).
356 *
357 * Note that m_lblk in the RAW access mode refers to the number of
358 * the compressed ondisk block rather than the uncompressed
359 * in-memory block for the compressed file.
360 *
361 * m_pofs equals m_lofs except for the inline data page.
362 *
363 * 2) Normal access mode:
364 *
365 * If the inode is not compressed, there is no difference from
366 * the RAW access mode. However, if the inode is compressed,
367 * users should pass a valid (m_lblk, m_lofs) pair, and get
368 * the needed m_pblk, m_pofs, m_len to get the compressed data
369 * and the updated m_lblk, m_lofs which indicate the start
370 * of the corresponding uncompressed data in the file.
371 */
372enum {
373 BH_Zipped = BH_PrivateStart,
374 BH_FullMapped,
375};
376
377/* Has a disk mapping */
378#define EROFS_MAP_MAPPED (1 << BH_Mapped)
379/* Located in metadata (could be copied from bd_inode) */
380#define EROFS_MAP_META (1 << BH_Meta)
381/* The extent has been compressed */
382#define EROFS_MAP_ZIPPED (1 << BH_Zipped)
383/* The length of extent is full */
384#define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped)
385
386struct erofs_map_blocks {
387 erofs_off_t m_pa, m_la;
388 u64 m_plen, m_llen;
389
390 unsigned int m_flags;
391
392 struct page *mpage;
393};
394
395/* Flags used by erofs_map_blocks() */
396#define EROFS_GET_BLOCKS_RAW 0x0001
397
398/* zmap.c */
399#ifdef CONFIG_EROFS_FS_ZIP
400int z_erofs_fill_inode(struct inode *inode);
401int z_erofs_map_blocks_iter(struct inode *inode,
402 struct erofs_map_blocks *map,
403 int flags);
404#else
405static inline int z_erofs_fill_inode(struct inode *inode) { return -EOPNOTSUPP; }
406static inline int z_erofs_map_blocks_iter(struct inode *inode,
407 struct erofs_map_blocks *map,
408 int flags)
409{
410 return -EOPNOTSUPP;
411}
412#endif /* !CONFIG_EROFS_FS_ZIP */
413
414/* data.c */
415static inline struct bio *erofs_grab_bio(struct super_block *sb,
416 erofs_blk_t blkaddr,
417 unsigned int nr_pages,
418 void *bi_private, bio_end_io_t endio,
419 bool nofail)
420{
421 const gfp_t gfp = GFP_NOIO;
422 struct bio *bio;
423
424 do {
425 if (nr_pages == 1) {
426 bio = bio_alloc(gfp | (nofail ? __GFP_NOFAIL : 0), 1);
427 if (unlikely(!bio)) {
428 DBG_BUGON(nofail);
429 return ERR_PTR(-ENOMEM);
430 }
431 break;
432 }
433 bio = bio_alloc(gfp, nr_pages);
434 nr_pages /= 2;
435 } while (unlikely(!bio));
436
437 bio->bi_end_io = endio;
438 bio_set_dev(bio, sb->s_bdev);
439 bio->bi_iter.bi_sector = (sector_t)blkaddr << LOG_SECTORS_PER_BLOCK;
440 bio->bi_private = bi_private;
441 return bio;
442}
443
444static inline void __submit_bio(struct bio *bio, unsigned int op,
445 unsigned int op_flags)
446{
447 bio_set_op_attrs(bio, op, op_flags);
448 submit_bio(bio);
449}
450
451struct page *__erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr,
452 bool prio, bool nofail);
453
454static inline struct page *erofs_get_meta_page(struct super_block *sb,
455 erofs_blk_t blkaddr, bool prio)
456{
457 return __erofs_get_meta_page(sb, blkaddr, prio, false);
458}
459
460int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int);
461
462static inline struct page *erofs_get_inline_page(struct inode *inode,
463 erofs_blk_t blkaddr)
464{
465 return erofs_get_meta_page(inode->i_sb, blkaddr,
466 S_ISDIR(inode->i_mode));
467}
468
469/* inode.c */
470static inline unsigned long erofs_inode_hash(erofs_nid_t nid)
471{
472#if BITS_PER_LONG == 32
473 return (nid >> 32) ^ (nid & 0xffffffff);
474#else
475 return nid;
476#endif
477}
478
479extern const struct inode_operations erofs_generic_iops;
480extern const struct inode_operations erofs_symlink_iops;
481extern const struct inode_operations erofs_fast_symlink_iops;
482
483static inline void set_inode_fast_symlink(struct inode *inode)
484{
485 inode->i_op = &erofs_fast_symlink_iops;
486}
487
488static inline bool is_inode_fast_symlink(struct inode *inode)
489{
490 return inode->i_op == &erofs_fast_symlink_iops;
491}
492
493struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid, bool dir);
494int erofs_getattr(const struct path *path, struct kstat *stat,
495 u32 request_mask, unsigned int query_flags);
496
497/* namei.c */
498extern const struct inode_operations erofs_dir_iops;
499
500int erofs_namei(struct inode *dir, struct qstr *name,
501 erofs_nid_t *nid, unsigned int *d_type);
502
503/* dir.c */
504extern const struct file_operations erofs_dir_fops;
505
506/* utils.c / zdata.c */
507struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp, bool nofail);
508
509#if (EROFS_PCPUBUF_NR_PAGES > 0)
510void *erofs_get_pcpubuf(unsigned int pagenr);
511#define erofs_put_pcpubuf(buf) do { \
512 (void)&(buf); \
513 preempt_enable(); \
514} while (0)
515#else
516static inline void *erofs_get_pcpubuf(unsigned int pagenr)
517{
518 return ERR_PTR(-EOPNOTSUPP);
519}
520
521#define erofs_put_pcpubuf(buf) do {} while (0)
522#endif
523
524#ifdef CONFIG_EROFS_FS_ZIP
525int erofs_workgroup_put(struct erofs_workgroup *grp);
526struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
527 pgoff_t index, bool *tag);
528int erofs_register_workgroup(struct super_block *sb,
529 struct erofs_workgroup *grp, bool tag);
530void erofs_workgroup_free_rcu(struct erofs_workgroup *grp);
531void erofs_shrinker_register(struct super_block *sb);
532void erofs_shrinker_unregister(struct super_block *sb);
533int __init erofs_init_shrinker(void);
534void erofs_exit_shrinker(void);
535int __init z_erofs_init_zip_subsystem(void);
536void z_erofs_exit_zip_subsystem(void);
537int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
538 struct erofs_workgroup *egrp);
539int erofs_try_to_free_cached_page(struct address_space *mapping,
540 struct page *page);
541#else
542static inline void erofs_shrinker_register(struct super_block *sb) {}
543static inline void erofs_shrinker_unregister(struct super_block *sb) {}
544static inline int erofs_init_shrinker(void) { return 0; }
545static inline void erofs_exit_shrinker(void) {}
546static inline int z_erofs_init_zip_subsystem(void) { return 0; }
547static inline void z_erofs_exit_zip_subsystem(void) {}
548#endif /* !CONFIG_EROFS_FS_ZIP */
549
550#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */
551
552#endif /* __EROFS_INTERNAL_H */
553
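
A note on the addressing helpers in this header: erofs_blknr()/erofs_blkoff() split a byte address into a block number and an in-block offset, and iloc() locates an on-disk inode by shifting its nid by islotbits into the metadata area. The stand-alone sketch below replays the same arithmetic in user space; the 4KiB block size and the 32-byte inode slot (islotbits == 5) are assumptions chosen for illustration, not values taken from this header.

/* illustrative user-space replay of the block/inode addressing math */
#include <stdint.h>
#include <stdio.h>

#define EROFS_BLKSIZ      4096u                 /* assumes PAGE_SIZE == 4096 */
#define erofs_blknr(a)    ((a) / EROFS_BLKSIZ)
#define erofs_blkoff(a)   ((a) % EROFS_BLKSIZ)
#define blknr_to_addr(nr) ((uint64_t)(nr) * EROFS_BLKSIZ)

int main(void)
{
	uint32_t meta_blkaddr = 2;    /* hypothetical superblock field */
	unsigned int islotbits = 5;   /* assumed 32-byte inode slots */
	uint64_t nid = 36;            /* hypothetical inode number */

	/* iloc(): byte address of inode 'nid' inside the metadata area */
	uint64_t addr = blknr_to_addr(meta_blkaddr) + (nid << islotbits);

	printf("nid %llu -> blk %llu, off %llu\n",
	       (unsigned long long)nid,
	       (unsigned long long)erofs_blknr(addr),
	       (unsigned long long)erofs_blkoff(addr));
	return 0;	/* prints: nid 36 -> blk 2, off 1152 */
}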
diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c
new file mode 100644
index 000000000000..8832b5d95d91
--- /dev/null
+++ b/fs/erofs/namei.c
@@ -0,0 +1,251 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2017-2018 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#include "xattr.h"
8
9#include <trace/events/erofs.h>
10
11struct erofs_qstr {
12 const unsigned char *name;
13 const unsigned char *end;
14};
15
16/* this relies on qn->end being accurate and qn having a trailing '\0' */
17static inline int dirnamecmp(const struct erofs_qstr *qn,
18 const struct erofs_qstr *qd,
19 unsigned int *matched)
20{
21 unsigned int i = *matched;
22
23 /*
24	 * on-disk error: only BUG_ON in debugging mode; otherwise,
25	 * return 1 to simply skip the invalid name and move on
26	 * (in consideration of lookup performance).
27 */
28 DBG_BUGON(qd->name > qd->end);
29
30	/* qd may not have a trailing '\0' */
31	/* however, accesses below qd->end are always safe */
32 while (qd->name + i < qd->end && qd->name[i] != '\0') {
33 if (qn->name[i] != qd->name[i]) {
34 *matched = i;
35 return qn->name[i] > qd->name[i] ? 1 : -1;
36 }
37 ++i;
38 }
39 *matched = i;
40 /* See comments in __d_alloc on the terminating NUL character */
41 return qn->name[i] == '\0' ? 0 : 1;
42}
43
44#define nameoff_from_disk(off, sz) (le16_to_cpu(off) & ((sz) - 1))
45
46static struct erofs_dirent *find_target_dirent(struct erofs_qstr *name,
47 u8 *data,
48 unsigned int dirblksize,
49 const int ndirents)
50{
51 int head, back;
52 unsigned int startprfx, endprfx;
53 struct erofs_dirent *const de = (struct erofs_dirent *)data;
54
55 /* since the 1st dirent has been evaluated previously */
56 head = 1;
57 back = ndirents - 1;
58 startprfx = endprfx = 0;
59
60 while (head <= back) {
61 const int mid = head + (back - head) / 2;
62 const int nameoff = nameoff_from_disk(de[mid].nameoff,
63 dirblksize);
64 unsigned int matched = min(startprfx, endprfx);
65 struct erofs_qstr dname = {
66 .name = data + nameoff,
67 .end = unlikely(mid >= ndirents - 1) ?
68 data + dirblksize :
69 data + nameoff_from_disk(de[mid + 1].nameoff,
70 dirblksize)
71 };
72
73 /* string comparison without already matched prefix */
74 int ret = dirnamecmp(name, &dname, &matched);
75
76 if (unlikely(!ret)) {
77 return de + mid;
78 } else if (ret > 0) {
79 head = mid + 1;
80 startprfx = matched;
81 } else {
82 back = mid - 1;
83 endprfx = matched;
84 }
85 }
86
87 return ERR_PTR(-ENOENT);
88}
89
90static struct page *find_target_block_classic(struct inode *dir,
91 struct erofs_qstr *name,
92 int *_ndirents)
93{
94 unsigned int startprfx, endprfx;
95 int head, back;
96 struct address_space *const mapping = dir->i_mapping;
97 struct page *candidate = ERR_PTR(-ENOENT);
98
99 startprfx = endprfx = 0;
100 head = 0;
101 back = inode_datablocks(dir) - 1;
102
103 while (head <= back) {
104 const int mid = head + (back - head) / 2;
105 struct page *page = read_mapping_page(mapping, mid, NULL);
106
107 if (!IS_ERR(page)) {
108 struct erofs_dirent *de = kmap_atomic(page);
109 const int nameoff = nameoff_from_disk(de->nameoff,
110 EROFS_BLKSIZ);
111 const int ndirents = nameoff / sizeof(*de);
112 int diff;
113 unsigned int matched;
114 struct erofs_qstr dname;
115
116 if (unlikely(!ndirents)) {
117 kunmap_atomic(de);
118 put_page(page);
119 errln("corrupted dir block %d @ nid %llu",
120 mid, EROFS_V(dir)->nid);
121 DBG_BUGON(1);
122 page = ERR_PTR(-EFSCORRUPTED);
123 goto out;
124 }
125
126 matched = min(startprfx, endprfx);
127
128 dname.name = (u8 *)de + nameoff;
129 if (ndirents == 1)
130 dname.end = (u8 *)de + EROFS_BLKSIZ;
131 else
132 dname.end = (u8 *)de +
133 nameoff_from_disk(de[1].nameoff,
134 EROFS_BLKSIZ);
135
136 /* string comparison without already matched prefix */
137 diff = dirnamecmp(name, &dname, &matched);
138 kunmap_atomic(de);
139
140 if (unlikely(!diff)) {
141 *_ndirents = 0;
142 goto out;
143 } else if (diff > 0) {
144 head = mid + 1;
145 startprfx = matched;
146
147 if (!IS_ERR(candidate))
148 put_page(candidate);
149 candidate = page;
150 *_ndirents = ndirents;
151 } else {
152 put_page(page);
153
154 back = mid - 1;
155 endprfx = matched;
156 }
157 continue;
158 }
159out: /* free if the candidate is valid */
160 if (!IS_ERR(candidate))
161 put_page(candidate);
162 return page;
163 }
164 return candidate;
165}
166
167int erofs_namei(struct inode *dir,
168 struct qstr *name,
169 erofs_nid_t *nid, unsigned int *d_type)
170{
171 int ndirents;
172 struct page *page;
173 void *data;
174 struct erofs_dirent *de;
175 struct erofs_qstr qn;
176
177 if (unlikely(!dir->i_size))
178 return -ENOENT;
179
180 qn.name = name->name;
181 qn.end = name->name + name->len;
182
183 ndirents = 0;
184 page = find_target_block_classic(dir, &qn, &ndirents);
185
186 if (IS_ERR(page))
187 return PTR_ERR(page);
188
189 data = kmap_atomic(page);
190 /* the target page has been mapped */
191 if (ndirents)
192 de = find_target_dirent(&qn, data, EROFS_BLKSIZ, ndirents);
193 else
194 de = (struct erofs_dirent *)data;
195
196 if (!IS_ERR(de)) {
197 *nid = le64_to_cpu(de->nid);
198 *d_type = de->file_type;
199 }
200
201 kunmap_atomic(data);
202 put_page(page);
203
204 return PTR_ERR_OR_ZERO(de);
205}
206
207/* NOTE: i_mutex is already held by vfs */
208static struct dentry *erofs_lookup(struct inode *dir,
209 struct dentry *dentry,
210 unsigned int flags)
211{
212 int err;
213 erofs_nid_t nid;
214 unsigned int d_type;
215 struct inode *inode;
216
217 DBG_BUGON(!d_really_is_negative(dentry));
218	/* dentry must be unhashed in lookup, no need to worry about it */
219 DBG_BUGON(!d_unhashed(dentry));
220
221 trace_erofs_lookup(dir, dentry, flags);
222
223 /* file name exceeds fs limit */
224 if (unlikely(dentry->d_name.len > EROFS_NAME_LEN))
225 return ERR_PTR(-ENAMETOOLONG);
226
227 /* false uninitialized warnings on gcc 4.8.x */
228 err = erofs_namei(dir, &dentry->d_name, &nid, &d_type);
229
230 if (err == -ENOENT) {
231 /* negative dentry */
232 inode = NULL;
233 } else if (unlikely(err)) {
234 inode = ERR_PTR(err);
235 } else {
236 debugln("%s, %s (nid %llu) found, d_type %u", __func__,
237 dentry->d_name.name, nid, d_type);
238 inode = erofs_iget(dir->i_sb, nid, d_type == FT_DIR);
239 }
240 return d_splice_alias(inode, dentry);
241}
242
243const struct inode_operations erofs_dir_iops = {
244 .lookup = erofs_lookup,
245 .getattr = erofs_getattr,
246#ifdef CONFIG_EROFS_FS_XATTR
247 .listxattr = erofs_listxattr,
248#endif
249 .get_acl = erofs_get_acl,
250};
251
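
The lookup above is a two-level binary search: find_target_block_classic() bisects the directory blocks and find_target_dirent() bisects the dirents inside the chosen block, and both avoid re-comparing the prefix already known to match on both bounds (matched = min(startprfx, endprfx)). The stand-alone sketch below demonstrates that prefix-tracking comparison over a plain sorted string array; it illustrates the idea only and is not the on-disk dirent walk.

/* prefix-accelerated binary search over sorted strings (illustrative only) */
#include <stdio.h>
#include <string.h>

static int prefixed_cmp(const char *key, const char *s, unsigned int *matched)
{
	unsigned int i = *matched;  /* both bounds already match this many chars */

	while (s[i] != '\0') {
		if (key[i] != s[i]) {
			*matched = i;
			return key[i] > s[i] ? 1 : -1;
		}
		++i;
	}
	*matched = i;
	return key[i] == '\0' ? 0 : 1;
}

static int lookup(const char *key, const char *const names[], int n)
{
	int head = 0, back = n - 1;
	unsigned int startprfx = 0, endprfx = 0;

	while (head <= back) {
		int mid = head + (back - head) / 2;
		unsigned int matched = startprfx < endprfx ? startprfx : endprfx;
		int ret = prefixed_cmp(key, names[mid], &matched);

		if (!ret)
			return mid;
		if (ret > 0) {
			head = mid + 1;
			startprfx = matched;
		} else {
			back = mid - 1;
			endprfx = matched;
		}
	}
	return -1;	/* roughly -ENOENT in the kernel code */
}

int main(void)
{
	const char *const names[] = { "bar", "baz", "foo", "foobar", "qux" };

	printf("%d\n", lookup("foobar", names, 5));	/* prints 3 */
	return 0;
}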
diff --git a/fs/erofs/super.c b/fs/erofs/super.c
new file mode 100644
index 000000000000..6d3a9bcb8daa
--- /dev/null
+++ b/fs/erofs/super.c
@@ -0,0 +1,669 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2017-2018 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#include <linux/module.h>
8#include <linux/buffer_head.h>
9#include <linux/statfs.h>
10#include <linux/parser.h>
11#include <linux/seq_file.h>
12#include "xattr.h"
13
14#define CREATE_TRACE_POINTS
15#include <trace/events/erofs.h>
16
17static struct kmem_cache *erofs_inode_cachep __read_mostly;
18
19static void init_once(void *ptr)
20{
21 struct erofs_vnode *vi = ptr;
22
23 inode_init_once(&vi->vfs_inode);
24}
25
26static int __init erofs_init_inode_cache(void)
27{
28 erofs_inode_cachep = kmem_cache_create("erofs_inode",
29 sizeof(struct erofs_vnode), 0,
30 SLAB_RECLAIM_ACCOUNT,
31 init_once);
32
33 return erofs_inode_cachep ? 0 : -ENOMEM;
34}
35
36static void erofs_exit_inode_cache(void)
37{
38 kmem_cache_destroy(erofs_inode_cachep);
39}
40
41static struct inode *alloc_inode(struct super_block *sb)
42{
43 struct erofs_vnode *vi =
44 kmem_cache_alloc(erofs_inode_cachep, GFP_KERNEL);
45
46 if (!vi)
47 return NULL;
48
49 /* zero out everything except vfs_inode */
50 memset(vi, 0, offsetof(struct erofs_vnode, vfs_inode));
51 return &vi->vfs_inode;
52}
53
54static void free_inode(struct inode *inode)
55{
56 struct erofs_vnode *vi = EROFS_V(inode);
57
58	/* be careful with the RCU symlink path (see ext4_inode_info->i_data)! */
59 if (is_inode_fast_symlink(inode))
60 kfree(inode->i_link);
61
62 kfree(vi->xattr_shared_xattrs);
63
64 kmem_cache_free(erofs_inode_cachep, vi);
65}
66
67static bool check_layout_compatibility(struct super_block *sb,
68 struct erofs_super_block *layout)
69{
70 const unsigned int requirements = le32_to_cpu(layout->requirements);
71
72 EROFS_SB(sb)->requirements = requirements;
73
74 /* check if current kernel meets all mandatory requirements */
75 if (requirements & (~EROFS_ALL_REQUIREMENTS)) {
76 errln("unidentified requirements %x, please upgrade kernel version",
77 requirements & ~EROFS_ALL_REQUIREMENTS);
78 return false;
79 }
80 return true;
81}
82
83static int superblock_read(struct super_block *sb)
84{
85 struct erofs_sb_info *sbi;
86 struct buffer_head *bh;
87 struct erofs_super_block *layout;
88 unsigned int blkszbits;
89 int ret;
90
91 bh = sb_bread(sb, 0);
92
93 if (!bh) {
94 errln("cannot read erofs superblock");
95 return -EIO;
96 }
97
98 sbi = EROFS_SB(sb);
99 layout = (struct erofs_super_block *)((u8 *)bh->b_data
100 + EROFS_SUPER_OFFSET);
101
102 ret = -EINVAL;
103 if (le32_to_cpu(layout->magic) != EROFS_SUPER_MAGIC_V1) {
104 errln("cannot find valid erofs superblock");
105 goto out;
106 }
107
108 blkszbits = layout->blkszbits;
109 /* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */
110 if (unlikely(blkszbits != LOG_BLOCK_SIZE)) {
111 errln("blksize %u isn't supported on this platform",
112 1 << blkszbits);
113 goto out;
114 }
115
116 if (!check_layout_compatibility(sb, layout))
117 goto out;
118
119 sbi->blocks = le32_to_cpu(layout->blocks);
120 sbi->meta_blkaddr = le32_to_cpu(layout->meta_blkaddr);
121#ifdef CONFIG_EROFS_FS_XATTR
122 sbi->xattr_blkaddr = le32_to_cpu(layout->xattr_blkaddr);
123#endif
124 sbi->islotbits = ffs(sizeof(struct erofs_inode_v1)) - 1;
125 sbi->root_nid = le16_to_cpu(layout->root_nid);
126 sbi->inos = le64_to_cpu(layout->inos);
127
128 sbi->build_time = le64_to_cpu(layout->build_time);
129 sbi->build_time_nsec = le32_to_cpu(layout->build_time_nsec);
130
131 memcpy(&sb->s_uuid, layout->uuid, sizeof(layout->uuid));
132
133 ret = strscpy(sbi->volume_name, layout->volume_name,
134 sizeof(layout->volume_name));
135 if (ret < 0) { /* -E2BIG */
136		errln("bad volume name without NUL terminator");
137 ret = -EFSCORRUPTED;
138 goto out;
139 }
140 ret = 0;
141out:
142 brelse(bh);
143 return ret;
144}
145
146#ifdef CONFIG_EROFS_FAULT_INJECTION
147const char *erofs_fault_name[FAULT_MAX] = {
148 [FAULT_KMALLOC] = "kmalloc",
149 [FAULT_READ_IO] = "read IO error",
150};
151
152static void __erofs_build_fault_attr(struct erofs_sb_info *sbi,
153 unsigned int rate)
154{
155 struct erofs_fault_info *ffi = &sbi->fault_info;
156
157 if (rate) {
158 atomic_set(&ffi->inject_ops, 0);
159 ffi->inject_rate = rate;
160 ffi->inject_type = (1 << FAULT_MAX) - 1;
161 } else {
162 memset(ffi, 0, sizeof(struct erofs_fault_info));
163 }
164
165 set_opt(sbi, FAULT_INJECTION);
166}
167
168static int erofs_build_fault_attr(struct erofs_sb_info *sbi,
169 substring_t *args)
170{
171 int rate = 0;
172
173 if (args->from && match_int(args, &rate))
174 return -EINVAL;
175
176 __erofs_build_fault_attr(sbi, rate);
177 return 0;
178}
179
180static unsigned int erofs_get_fault_rate(struct erofs_sb_info *sbi)
181{
182 return sbi->fault_info.inject_rate;
183}
184#else
185static void __erofs_build_fault_attr(struct erofs_sb_info *sbi,
186 unsigned int rate)
187{
188}
189
190static int erofs_build_fault_attr(struct erofs_sb_info *sbi,
191 substring_t *args)
192{
193 infoln("fault_injection options not supported");
194 return 0;
195}
196
197static unsigned int erofs_get_fault_rate(struct erofs_sb_info *sbi)
198{
199 return 0;
200}
201#endif
202
203#ifdef CONFIG_EROFS_FS_ZIP
204static int erofs_build_cache_strategy(struct erofs_sb_info *sbi,
205 substring_t *args)
206{
207 const char *cs = match_strdup(args);
208 int err = 0;
209
210 if (!cs) {
211 errln("Not enough memory to store cache strategy");
212 return -ENOMEM;
213 }
214
215 if (!strcmp(cs, "disabled")) {
216 sbi->cache_strategy = EROFS_ZIP_CACHE_DISABLED;
217 } else if (!strcmp(cs, "readahead")) {
218 sbi->cache_strategy = EROFS_ZIP_CACHE_READAHEAD;
219 } else if (!strcmp(cs, "readaround")) {
220 sbi->cache_strategy = EROFS_ZIP_CACHE_READAROUND;
221 } else {
222 errln("Unrecognized cache strategy \"%s\"", cs);
223 err = -EINVAL;
224 }
225 kfree(cs);
226 return err;
227}
228#else
229static int erofs_build_cache_strategy(struct erofs_sb_info *sbi,
230 substring_t *args)
231{
232 infoln("EROFS compression is disabled, so cache strategy is ignored");
233 return 0;
234}
235#endif
236
237/* set up default EROFS parameters */
238static void default_options(struct erofs_sb_info *sbi)
239{
240#ifdef CONFIG_EROFS_FS_ZIP
241 sbi->cache_strategy = EROFS_ZIP_CACHE_READAROUND;
242 sbi->max_sync_decompress_pages = 3;
243#endif
244#ifdef CONFIG_EROFS_FS_XATTR
245 set_opt(sbi, XATTR_USER);
246#endif
247#ifdef CONFIG_EROFS_FS_POSIX_ACL
248 set_opt(sbi, POSIX_ACL);
249#endif
250}
251
252enum {
253 Opt_user_xattr,
254 Opt_nouser_xattr,
255 Opt_acl,
256 Opt_noacl,
257 Opt_fault_injection,
258 Opt_cache_strategy,
259 Opt_err
260};
261
262static match_table_t erofs_tokens = {
263 {Opt_user_xattr, "user_xattr"},
264 {Opt_nouser_xattr, "nouser_xattr"},
265 {Opt_acl, "acl"},
266 {Opt_noacl, "noacl"},
267 {Opt_fault_injection, "fault_injection=%u"},
268 {Opt_cache_strategy, "cache_strategy=%s"},
269 {Opt_err, NULL}
270};
271
272static int parse_options(struct super_block *sb, char *options)
273{
274 substring_t args[MAX_OPT_ARGS];
275 char *p;
276 int err;
277
278 if (!options)
279 return 0;
280
281 while ((p = strsep(&options, ","))) {
282 int token;
283
284 if (!*p)
285 continue;
286
287 args[0].to = args[0].from = NULL;
288 token = match_token(p, erofs_tokens, args);
289
290 switch (token) {
291#ifdef CONFIG_EROFS_FS_XATTR
292 case Opt_user_xattr:
293 set_opt(EROFS_SB(sb), XATTR_USER);
294 break;
295 case Opt_nouser_xattr:
296 clear_opt(EROFS_SB(sb), XATTR_USER);
297 break;
298#else
299 case Opt_user_xattr:
300 infoln("user_xattr options not supported");
301 break;
302 case Opt_nouser_xattr:
303 infoln("nouser_xattr options not supported");
304 break;
305#endif
306#ifdef CONFIG_EROFS_FS_POSIX_ACL
307 case Opt_acl:
308 set_opt(EROFS_SB(sb), POSIX_ACL);
309 break;
310 case Opt_noacl:
311 clear_opt(EROFS_SB(sb), POSIX_ACL);
312 break;
313#else
314 case Opt_acl:
315 infoln("acl options not supported");
316 break;
317 case Opt_noacl:
318 infoln("noacl options not supported");
319 break;
320#endif
321 case Opt_fault_injection:
322 err = erofs_build_fault_attr(EROFS_SB(sb), args);
323 if (err)
324 return err;
325 break;
326 case Opt_cache_strategy:
327 err = erofs_build_cache_strategy(EROFS_SB(sb), args);
328 if (err)
329 return err;
330 break;
331 default:
332 errln("Unrecognized mount option \"%s\" or missing value", p);
333 return -EINVAL;
334 }
335 }
336 return 0;
337}
338
339#ifdef CONFIG_EROFS_FS_ZIP
340static const struct address_space_operations managed_cache_aops;
341
342static int managed_cache_releasepage(struct page *page, gfp_t gfp_mask)
343{
344 int ret = 1; /* 0 - busy */
345 struct address_space *const mapping = page->mapping;
346
347 DBG_BUGON(!PageLocked(page));
348 DBG_BUGON(mapping->a_ops != &managed_cache_aops);
349
350 if (PagePrivate(page))
351 ret = erofs_try_to_free_cached_page(mapping, page);
352
353 return ret;
354}
355
356static void managed_cache_invalidatepage(struct page *page,
357 unsigned int offset,
358 unsigned int length)
359{
360 const unsigned int stop = length + offset;
361
362 DBG_BUGON(!PageLocked(page));
363
364 /* Check for potential overflow in debug mode */
365 DBG_BUGON(stop > PAGE_SIZE || stop < length);
366
367 if (offset == 0 && stop == PAGE_SIZE)
368 while (!managed_cache_releasepage(page, GFP_NOFS))
369 cond_resched();
370}
371
372static const struct address_space_operations managed_cache_aops = {
373 .releasepage = managed_cache_releasepage,
374 .invalidatepage = managed_cache_invalidatepage,
375};
376
377static int erofs_init_managed_cache(struct super_block *sb)
378{
379 struct erofs_sb_info *const sbi = EROFS_SB(sb);
380 struct inode *const inode = new_inode(sb);
381
382 if (unlikely(!inode))
383 return -ENOMEM;
384
385 set_nlink(inode, 1);
386 inode->i_size = OFFSET_MAX;
387
388 inode->i_mapping->a_ops = &managed_cache_aops;
389 mapping_set_gfp_mask(inode->i_mapping,
390 GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE);
391 sbi->managed_cache = inode;
392 return 0;
393}
394#else
395static int erofs_init_managed_cache(struct super_block *sb) { return 0; }
396#endif
397
398static int erofs_fill_super(struct super_block *sb, void *data, int silent)
399{
400 struct inode *inode;
401 struct erofs_sb_info *sbi;
402 int err;
403
404 infoln("fill_super, device -> %s", sb->s_id);
405 infoln("options -> %s", (char *)data);
406
407 sb->s_magic = EROFS_SUPER_MAGIC;
408
409 if (unlikely(!sb_set_blocksize(sb, EROFS_BLKSIZ))) {
410 errln("failed to set erofs blksize");
411 return -EINVAL;
412 }
413
414 sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
415 if (unlikely(!sbi))
416 return -ENOMEM;
417
418 sb->s_fs_info = sbi;
419 err = superblock_read(sb);
420 if (err)
421 return err;
422
423 sb->s_flags |= SB_RDONLY | SB_NOATIME;
424 sb->s_maxbytes = MAX_LFS_FILESIZE;
425 sb->s_time_gran = 1;
426
427 sb->s_op = &erofs_sops;
428
429#ifdef CONFIG_EROFS_FS_XATTR
430 sb->s_xattr = erofs_xattr_handlers;
431#endif
432 /* set erofs default mount options */
433 default_options(sbi);
434
435 err = parse_options(sb, data);
436 if (unlikely(err))
437 return err;
438
439 if (!silent)
440 infoln("root inode @ nid %llu", ROOT_NID(sbi));
441
442 if (test_opt(sbi, POSIX_ACL))
443 sb->s_flags |= SB_POSIXACL;
444 else
445 sb->s_flags &= ~SB_POSIXACL;
446
447#ifdef CONFIG_EROFS_FS_ZIP
448 INIT_RADIX_TREE(&sbi->workstn_tree, GFP_ATOMIC);
449#endif
450
451 /* get the root inode */
452 inode = erofs_iget(sb, ROOT_NID(sbi), true);
453 if (IS_ERR(inode))
454 return PTR_ERR(inode);
455
456 if (unlikely(!S_ISDIR(inode->i_mode))) {
457 errln("rootino(nid %llu) is not a directory(i_mode %o)",
458 ROOT_NID(sbi), inode->i_mode);
459 iput(inode);
460 return -EINVAL;
461 }
462
463 sb->s_root = d_make_root(inode);
464 if (unlikely(!sb->s_root))
465 return -ENOMEM;
466
467 erofs_shrinker_register(sb);
468 /* sb->s_umount is already locked, SB_ACTIVE and SB_BORN are not set */
469 err = erofs_init_managed_cache(sb);
470 if (unlikely(err))
471 return err;
472
473 if (!silent)
474 infoln("mounted on %s with opts: %s.", sb->s_id, (char *)data);
475 return 0;
476}
477
478static struct dentry *erofs_mount(struct file_system_type *fs_type, int flags,
479 const char *dev_name, void *data)
480{
481 return mount_bdev(fs_type, flags, dev_name, data, erofs_fill_super);
482}
483
484/*
485 * could be triggered after deactivate_locked_super()
486 * is called, thus covering both umount and failed initialization.
487 */
488static void erofs_kill_sb(struct super_block *sb)
489{
490 struct erofs_sb_info *sbi;
491
492 WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC);
493 infoln("unmounting for %s", sb->s_id);
494
495 kill_block_super(sb);
496
497 sbi = EROFS_SB(sb);
498 if (!sbi)
499 return;
500 kfree(sbi);
501 sb->s_fs_info = NULL;
502}
503
504/* called when ->s_root is non-NULL */
505static void erofs_put_super(struct super_block *sb)
506{
507 struct erofs_sb_info *const sbi = EROFS_SB(sb);
508
509 DBG_BUGON(!sbi);
510
511 erofs_shrinker_unregister(sb);
512#ifdef CONFIG_EROFS_FS_ZIP
513 iput(sbi->managed_cache);
514 sbi->managed_cache = NULL;
515#endif
516}
517
518static struct file_system_type erofs_fs_type = {
519 .owner = THIS_MODULE,
520 .name = "erofs",
521 .mount = erofs_mount,
522 .kill_sb = erofs_kill_sb,
523 .fs_flags = FS_REQUIRES_DEV,
524};
525MODULE_ALIAS_FS("erofs");
526
527static int __init erofs_module_init(void)
528{
529 int err;
530
531 erofs_check_ondisk_layout_definitions();
532 infoln("initializing erofs " EROFS_VERSION);
533
534 err = erofs_init_inode_cache();
535 if (err)
536 goto icache_err;
537
538 err = erofs_init_shrinker();
539 if (err)
540 goto shrinker_err;
541
542 err = z_erofs_init_zip_subsystem();
543 if (err)
544 goto zip_err;
545
546 err = register_filesystem(&erofs_fs_type);
547 if (err)
548 goto fs_err;
549
550	infoln("successfully initialized erofs");
551 return 0;
552
553fs_err:
554 z_erofs_exit_zip_subsystem();
555zip_err:
556 erofs_exit_shrinker();
557shrinker_err:
558 erofs_exit_inode_cache();
559icache_err:
560 return err;
561}
562
563static void __exit erofs_module_exit(void)
564{
565 unregister_filesystem(&erofs_fs_type);
566 z_erofs_exit_zip_subsystem();
567 erofs_exit_shrinker();
568 erofs_exit_inode_cache();
569	infoln("successfully finalized erofs");
570}
571
572/* get filesystem statistics */
573static int erofs_statfs(struct dentry *dentry, struct kstatfs *buf)
574{
575 struct super_block *sb = dentry->d_sb;
576 struct erofs_sb_info *sbi = EROFS_SB(sb);
577 u64 id = huge_encode_dev(sb->s_bdev->bd_dev);
578
579 buf->f_type = sb->s_magic;
580 buf->f_bsize = EROFS_BLKSIZ;
581 buf->f_blocks = sbi->blocks;
582 buf->f_bfree = buf->f_bavail = 0;
583
584 buf->f_files = ULLONG_MAX;
585 buf->f_ffree = ULLONG_MAX - sbi->inos;
586
587 buf->f_namelen = EROFS_NAME_LEN;
588
589 buf->f_fsid.val[0] = (u32)id;
590 buf->f_fsid.val[1] = (u32)(id >> 32);
591 return 0;
592}
593
594static int erofs_show_options(struct seq_file *seq, struct dentry *root)
595{
596 struct erofs_sb_info *sbi __maybe_unused = EROFS_SB(root->d_sb);
597
598#ifdef CONFIG_EROFS_FS_XATTR
599 if (test_opt(sbi, XATTR_USER))
600 seq_puts(seq, ",user_xattr");
601 else
602 seq_puts(seq, ",nouser_xattr");
603#endif
604#ifdef CONFIG_EROFS_FS_POSIX_ACL
605 if (test_opt(sbi, POSIX_ACL))
606 seq_puts(seq, ",acl");
607 else
608 seq_puts(seq, ",noacl");
609#endif
610 if (test_opt(sbi, FAULT_INJECTION))
611 seq_printf(seq, ",fault_injection=%u",
612 erofs_get_fault_rate(sbi));
613#ifdef CONFIG_EROFS_FS_ZIP
614 if (sbi->cache_strategy == EROFS_ZIP_CACHE_DISABLED) {
615 seq_puts(seq, ",cache_strategy=disabled");
616 } else if (sbi->cache_strategy == EROFS_ZIP_CACHE_READAHEAD) {
617 seq_puts(seq, ",cache_strategy=readahead");
618 } else if (sbi->cache_strategy == EROFS_ZIP_CACHE_READAROUND) {
619 seq_puts(seq, ",cache_strategy=readaround");
620 } else {
621 seq_puts(seq, ",cache_strategy=(unknown)");
622 DBG_BUGON(1);
623 }
624#endif
625 return 0;
626}
627
628static int erofs_remount(struct super_block *sb, int *flags, char *data)
629{
630 struct erofs_sb_info *sbi = EROFS_SB(sb);
631 unsigned int org_mnt_opt = sbi->mount_opt;
632 unsigned int org_inject_rate = erofs_get_fault_rate(sbi);
633 int err;
634
635 DBG_BUGON(!sb_rdonly(sb));
636 err = parse_options(sb, data);
637 if (err)
638 goto out;
639
640 if (test_opt(sbi, POSIX_ACL))
641 sb->s_flags |= SB_POSIXACL;
642 else
643 sb->s_flags &= ~SB_POSIXACL;
644
645 *flags |= SB_RDONLY;
646 return 0;
647out:
648 __erofs_build_fault_attr(sbi, org_inject_rate);
649 sbi->mount_opt = org_mnt_opt;
650
651 return err;
652}
653
654const struct super_operations erofs_sops = {
655 .put_super = erofs_put_super,
656 .alloc_inode = alloc_inode,
657 .free_inode = free_inode,
658 .statfs = erofs_statfs,
659 .show_options = erofs_show_options,
660 .remount_fs = erofs_remount,
661};
662
663module_init(erofs_module_init);
664module_exit(erofs_module_exit);
665
666MODULE_DESCRIPTION("Enhanced ROM File System");
667MODULE_AUTHOR("Gao Xiang, Chao Yu, Miao Xie, CONSUMER BG, HUAWEI Inc.");
668MODULE_LICENSE("GPL");
669
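
parse_options() above splits the mount data on commas and classifies each token with match_token() against erofs_tokens, so a mount invocation such as mount -t erofs -o noacl,cache_strategy=readahead /dev/sdX /mnt ends up in the Opt_noacl and Opt_cache_strategy branches. The user-space sketch below mimics that splitting with strsep(); it is a rough stand-in for the kernel parser, not the actual implementation.

/* rough user-space model of comma-separated "opt[=value]" parsing */
#define _DEFAULT_SOURCE		/* for strsep() on glibc */
#include <stdio.h>
#include <string.h>

static void parse_options(char *options)
{
	char *p;

	while ((p = strsep(&options, ",")) != NULL) {
		char *val;

		if (!*p)
			continue;	/* skip empty tokens, as the kernel does */

		val = strchr(p, '=');
		if (val)
			*val++ = '\0';

		if (!strcmp(p, "acl") || !strcmp(p, "noacl") ||
		    !strcmp(p, "user_xattr") || !strcmp(p, "nouser_xattr"))
			printf("flag option: %s\n", p);
		else if (!strcmp(p, "cache_strategy") && val)
			printf("cache_strategy = %s\n", val);
		else if (!strcmp(p, "fault_injection") && val)
			printf("fault_injection rate = %s\n", val);
		else
			printf("unrecognized option: %s\n", p);
	}
}

int main(void)
{
	char opts[] = "noacl,cache_strategy=readahead,fault_injection=1000";

	parse_options(opts);
	return 0;
}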
diff --git a/fs/erofs/tagptr.h b/fs/erofs/tagptr.h
new file mode 100644
index 000000000000..a72897c86744
--- /dev/null
+++ b/fs/erofs/tagptr.h
@@ -0,0 +1,110 @@
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * A tagged pointer implementation
4 *
5 * Copyright (C) 2018 Gao Xiang <gaoxiang25@huawei.com>
6 */
7#ifndef __EROFS_FS_TAGPTR_H
8#define __EROFS_FS_TAGPTR_H
9
10#include <linux/types.h>
11#include <linux/build_bug.h>
12
13/*
14 * the names of the tagged pointer types are tagptr{1, 2, 3...}_t;
15 * avoid using the internal structs __tagptr{1, 2, 3...} directly
16 */
17#define __MAKE_TAGPTR(n) \
18typedef struct __tagptr##n { \
19 uintptr_t v; \
20} tagptr##n##_t;
21
22__MAKE_TAGPTR(1)
23__MAKE_TAGPTR(2)
24__MAKE_TAGPTR(3)
25__MAKE_TAGPTR(4)
26
27#undef __MAKE_TAGPTR
28
29extern void __compiletime_error("bad tagptr tags")
30 __bad_tagptr_tags(void);
31
32extern void __compiletime_error("bad tagptr type")
33 __bad_tagptr_type(void);
34
35/* fix the broken usage of "#define tagptr2_t tagptr3_t" by users */
36#define __tagptr_mask_1(ptr, n) \
37 __builtin_types_compatible_p(typeof(ptr), struct __tagptr##n) ? \
38 (1UL << (n)) - 1 :
39
40#define __tagptr_mask(ptr) (\
41 __tagptr_mask_1(ptr, 1) ( \
42 __tagptr_mask_1(ptr, 2) ( \
43 __tagptr_mask_1(ptr, 3) ( \
44 __tagptr_mask_1(ptr, 4) ( \
45 __bad_tagptr_type(), 0)))))
46
47/* generate a tagged pointer from a raw value */
48#define tagptr_init(type, val) \
49 ((typeof(type)){ .v = (uintptr_t)(val) })
50
51/*
52 * directly cast a tagged pointer to the native pointer type, which
53 * could be used for backward compatibility of existing code.
54 */
55#define tagptr_cast_ptr(tptr) ((void *)(tptr).v)
56
57/* encode tagged pointers */
58#define tagptr_fold(type, ptr, _tags) ({ \
59 const typeof(_tags) tags = (_tags); \
60 if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(type))) \
61 __bad_tagptr_tags(); \
62tagptr_init(type, (uintptr_t)(ptr) | tags); })
63
64/* decode tagged pointers */
65#define tagptr_unfold_ptr(tptr) \
66 ((void *)((tptr).v & ~__tagptr_mask(tptr)))
67
68#define tagptr_unfold_tags(tptr) \
69 ((tptr).v & __tagptr_mask(tptr))
70
71/* operations for tagged pointers */
72#define tagptr_eq(_tptr1, _tptr2) ({ \
73 typeof(_tptr1) tptr1 = (_tptr1); \
74 typeof(_tptr2) tptr2 = (_tptr2); \
75 (void)(&tptr1 == &tptr2); \
76(tptr1).v == (tptr2).v; })
77
78/* lock-free CAS operation */
79#define tagptr_cmpxchg(_ptptr, _o, _n) ({ \
80 typeof(_ptptr) ptptr = (_ptptr); \
81 typeof(_o) o = (_o); \
82 typeof(_n) n = (_n); \
83 (void)(&o == &n); \
84 (void)(&o == ptptr); \
85tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); })
86
87/* wrap WRITE_ONCE if atomic update is needed */
88#define tagptr_replace_tags(_ptptr, tags) ({ \
89 typeof(_ptptr) ptptr = (_ptptr); \
90 *ptptr = tagptr_fold(*ptptr, tagptr_unfold_ptr(*ptptr), tags); \
91*ptptr; })
92
93#define tagptr_set_tags(_ptptr, _tags) ({ \
94 typeof(_ptptr) ptptr = (_ptptr); \
95 const typeof(_tags) tags = (_tags); \
96 if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
97 __bad_tagptr_tags(); \
98 ptptr->v |= tags; \
99*ptptr; })
100
101#define tagptr_clear_tags(_ptptr, _tags) ({ \
102 typeof(_ptptr) ptptr = (_ptptr); \
103 const typeof(_tags) tags = (_tags); \
104 if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \
105 __bad_tagptr_tags(); \
106 ptptr->v &= ~tags; \
107*ptptr; })
108
109#endif /* __EROFS_FS_TAGPTR_H */
110
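
tagptr.h stores a small tag in the low, alignment-guaranteed-zero bits of a pointer, with __tagptr_mask() limiting tagptrN_t to N tag bits at compile time. The minimal sketch below shows the same fold/unfold idea with a fixed two-bit mask and without the compile-time type checking; it assumes at least 4-byte pointer alignment and is illustrative only.

/* minimal tagged-pointer fold/unfold, assuming >= 4-byte pointer alignment */
#include <stdint.h>
#include <stdio.h>

#define TAG_MASK	0x3UL		/* two tag bits, like tagptr2_t */

typedef struct { uintptr_t v; } tagptr2;

static inline tagptr2 tagptr_fold(void *ptr, unsigned long tags)
{
	/* caller must guarantee tags fit in TAG_MASK and ptr is aligned */
	return (tagptr2){ .v = (uintptr_t)ptr | (tags & TAG_MASK) };
}

static inline void *tagptr_unfold_ptr(tagptr2 t)
{
	return (void *)(t.v & ~TAG_MASK);
}

static inline unsigned long tagptr_unfold_tags(tagptr2 t)
{
	return t.v & TAG_MASK;
}

int main(void)
{
	static int obj;			/* at least 4-byte aligned in practice */
	tagptr2 t = tagptr_fold(&obj, 2);

	printf("ptr ok: %d, tags: %lu\n",
	       tagptr_unfold_ptr(t) == (void *)&obj, tagptr_unfold_tags(t));
	return 0;
}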
diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c
new file mode 100644
index 000000000000..1dd041aa0f5a
--- /dev/null
+++ b/fs/erofs/utils.c
@@ -0,0 +1,333 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2018 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#include "internal.h"
8#include <linux/pagevec.h>
9
10struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp, bool nofail)
11{
12 struct page *page;
13
14 if (!list_empty(pool)) {
15 page = lru_to_page(pool);
16 DBG_BUGON(page_ref_count(page) != 1);
17 list_del(&page->lru);
18 } else {
19 page = alloc_pages(gfp | (nofail ? __GFP_NOFAIL : 0), 0);
20 }
21 return page;
22}
23
24#if (EROFS_PCPUBUF_NR_PAGES > 0)
25static struct {
26 u8 data[PAGE_SIZE * EROFS_PCPUBUF_NR_PAGES];
27} ____cacheline_aligned_in_smp erofs_pcpubuf[NR_CPUS];
28
29void *erofs_get_pcpubuf(unsigned int pagenr)
30{
31 preempt_disable();
32 return &erofs_pcpubuf[smp_processor_id()].data[pagenr * PAGE_SIZE];
33}
34#endif
35
36#ifdef CONFIG_EROFS_FS_ZIP
37/* global shrink count (for all mounted EROFS instances) */
38static atomic_long_t erofs_global_shrink_cnt;
39
40#define __erofs_workgroup_get(grp) atomic_inc(&(grp)->refcount)
41#define __erofs_workgroup_put(grp) atomic_dec(&(grp)->refcount)
42
43static int erofs_workgroup_get(struct erofs_workgroup *grp)
44{
45 int o;
46
47repeat:
48 o = erofs_wait_on_workgroup_freezed(grp);
49 if (unlikely(o <= 0))
50 return -1;
51
52 if (unlikely(atomic_cmpxchg(&grp->refcount, o, o + 1) != o))
53 goto repeat;
54
55 /* decrease refcount paired by erofs_workgroup_put */
56 if (unlikely(o == 1))
57 atomic_long_dec(&erofs_global_shrink_cnt);
58 return 0;
59}
60
61struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb,
62 pgoff_t index, bool *tag)
63{
64 struct erofs_sb_info *sbi = EROFS_SB(sb);
65 struct erofs_workgroup *grp;
66
67repeat:
68 rcu_read_lock();
69 grp = radix_tree_lookup(&sbi->workstn_tree, index);
70 if (grp) {
71 *tag = xa_pointer_tag(grp);
72 grp = xa_untag_pointer(grp);
73
74 if (erofs_workgroup_get(grp)) {
75 /* prefer to relax rcu read side */
76 rcu_read_unlock();
77 goto repeat;
78 }
79
80 DBG_BUGON(index != grp->index);
81 }
82 rcu_read_unlock();
83 return grp;
84}
85
86int erofs_register_workgroup(struct super_block *sb,
87 struct erofs_workgroup *grp,
88 bool tag)
89{
90 struct erofs_sb_info *sbi;
91 int err;
92
93 /* grp shouldn't be broken or used before */
94 if (unlikely(atomic_read(&grp->refcount) != 1)) {
95 DBG_BUGON(1);
96 return -EINVAL;
97 }
98
99 err = radix_tree_preload(GFP_NOFS);
100 if (err)
101 return err;
102
103 sbi = EROFS_SB(sb);
104 xa_lock(&sbi->workstn_tree);
105
106 grp = xa_tag_pointer(grp, tag);
107
108 /*
109 * Bump up reference count before making this workgroup
110 * visible to other users in order to avoid potential UAF
111	 * without being serialized by workstn_lock.
112 */
113 __erofs_workgroup_get(grp);
114
115 err = radix_tree_insert(&sbi->workstn_tree, grp->index, grp);
116 if (unlikely(err))
117 /*
118 * it's safe to decrease since the workgroup isn't visible
119		 * and refcount >= 2 (cannot be frozen).
120 */
121 __erofs_workgroup_put(grp);
122
123 xa_unlock(&sbi->workstn_tree);
124 radix_tree_preload_end();
125 return err;
126}
127
128static void __erofs_workgroup_free(struct erofs_workgroup *grp)
129{
130 atomic_long_dec(&erofs_global_shrink_cnt);
131 erofs_workgroup_free_rcu(grp);
132}
133
134int erofs_workgroup_put(struct erofs_workgroup *grp)
135{
136 int count = atomic_dec_return(&grp->refcount);
137
138 if (count == 1)
139 atomic_long_inc(&erofs_global_shrink_cnt);
140 else if (!count)
141 __erofs_workgroup_free(grp);
142 return count;
143}
144
145static void erofs_workgroup_unfreeze_final(struct erofs_workgroup *grp)
146{
147 erofs_workgroup_unfreeze(grp, 0);
148 __erofs_workgroup_free(grp);
149}
150
151static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi,
152 struct erofs_workgroup *grp,
153 bool cleanup)
154{
155 /*
156 * If managed cache is on, refcount of workgroups
157	 * themselves could be < 0 (frozen). In other words,
158	 * there is no guarantee that all refcounts are > 0.
159 */
160 if (!erofs_workgroup_try_to_freeze(grp, 1))
161 return false;
162
163 /*
164	 * Note that all cached pages should be detached
165	 * before being deleted from the radix tree. Otherwise, some
166	 * cached pages could still be attached to the orphaned
167	 * old workgroup while the new one is available in the tree.
168 */
169 if (erofs_try_to_free_all_cached_pages(sbi, grp)) {
170 erofs_workgroup_unfreeze(grp, 1);
171 return false;
172 }
173
174 /*
175	 * It's impossible to fail after the workgroup is frozen,
176	 * however, in order to catch potential race conditions, add a
177 * DBG_BUGON to observe this in advance.
178 */
179 DBG_BUGON(xa_untag_pointer(radix_tree_delete(&sbi->workstn_tree,
180 grp->index)) != grp);
181
182 /*
183 * If managed cache is on, last refcount should indicate
184 * the related workstation.
185 */
186 erofs_workgroup_unfreeze_final(grp);
187 return true;
188}
189
190static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
191 unsigned long nr_shrink,
192 bool cleanup)
193{
194 pgoff_t first_index = 0;
195 void *batch[PAGEVEC_SIZE];
196 unsigned int freed = 0;
197
198 int i, found;
199repeat:
200 xa_lock(&sbi->workstn_tree);
201
202 found = radix_tree_gang_lookup(&sbi->workstn_tree,
203 batch, first_index, PAGEVEC_SIZE);
204
205 for (i = 0; i < found; ++i) {
206 struct erofs_workgroup *grp = xa_untag_pointer(batch[i]);
207
208 first_index = grp->index + 1;
209
210 /* try to shrink each valid workgroup */
211 if (!erofs_try_to_release_workgroup(sbi, grp, cleanup))
212 continue;
213
214 ++freed;
215 if (unlikely(!--nr_shrink))
216 break;
217 }
218 xa_unlock(&sbi->workstn_tree);
219
220 if (i && nr_shrink)
221 goto repeat;
222 return freed;
223}
224
225/* protected by 'erofs_sb_list_lock' */
226static unsigned int shrinker_run_no;
227
228/* protects the mounted 'erofs_sb_list' */
229static DEFINE_SPINLOCK(erofs_sb_list_lock);
230static LIST_HEAD(erofs_sb_list);
231
232void erofs_shrinker_register(struct super_block *sb)
233{
234 struct erofs_sb_info *sbi = EROFS_SB(sb);
235
236 mutex_init(&sbi->umount_mutex);
237
238 spin_lock(&erofs_sb_list_lock);
239 list_add(&sbi->list, &erofs_sb_list);
240 spin_unlock(&erofs_sb_list_lock);
241}
242
243void erofs_shrinker_unregister(struct super_block *sb)
244{
245 struct erofs_sb_info *const sbi = EROFS_SB(sb);
246
247 mutex_lock(&sbi->umount_mutex);
248 erofs_shrink_workstation(sbi, ~0UL, true);
249
250 spin_lock(&erofs_sb_list_lock);
251 list_del(&sbi->list);
252 spin_unlock(&erofs_sb_list_lock);
253 mutex_unlock(&sbi->umount_mutex);
254}
255
256static unsigned long erofs_shrink_count(struct shrinker *shrink,
257 struct shrink_control *sc)
258{
259 return atomic_long_read(&erofs_global_shrink_cnt);
260}
261
262static unsigned long erofs_shrink_scan(struct shrinker *shrink,
263 struct shrink_control *sc)
264{
265 struct erofs_sb_info *sbi;
266 struct list_head *p;
267
268 unsigned long nr = sc->nr_to_scan;
269 unsigned int run_no;
270 unsigned long freed = 0;
271
272 spin_lock(&erofs_sb_list_lock);
273 do {
274 run_no = ++shrinker_run_no;
275 } while (run_no == 0);
276
277 /* Iterate over all mounted superblocks and try to shrink them */
278 p = erofs_sb_list.next;
279 while (p != &erofs_sb_list) {
280 sbi = list_entry(p, struct erofs_sb_info, list);
281
282 /*
283 * We move the ones we do to the end of the list, so we stop
284 * when we see one we have already done.
285 */
286 if (sbi->shrinker_run_no == run_no)
287 break;
288
289 if (!mutex_trylock(&sbi->umount_mutex)) {
290 p = p->next;
291 continue;
292 }
293
294 spin_unlock(&erofs_sb_list_lock);
295 sbi->shrinker_run_no = run_no;
296
297 freed += erofs_shrink_workstation(sbi, nr, false);
298
299 spin_lock(&erofs_sb_list_lock);
300 /* Get the next list element before we move this one */
301 p = p->next;
302
303 /*
304 * Move this one to the end of the list to provide some
305 * fairness.
306 */
307 list_move_tail(&sbi->list, &erofs_sb_list);
308 mutex_unlock(&sbi->umount_mutex);
309
310 if (freed >= nr)
311 break;
312 }
313 spin_unlock(&erofs_sb_list_lock);
314 return freed;
315}
316
317static struct shrinker erofs_shrinker_info = {
318 .scan_objects = erofs_shrink_scan,
319 .count_objects = erofs_shrink_count,
320 .seeks = DEFAULT_SEEKS,
321};
322
323int __init erofs_init_shrinker(void)
324{
325 return register_shrinker(&erofs_shrinker_info);
326}
327
328void erofs_exit_shrinker(void)
329{
330 unregister_shrinker(&erofs_shrinker_info);
331}
332#endif /* !CONFIG_EROFS_FS_ZIP */
333
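
Both erofs_workgroup_try_to_freeze() in internal.h and erofs_workgroup_get() above rely on the refcount doubling as a lock: the owner compare-and-swaps it to the sentinel EROFS_LOCKED_MAGIC, and readers wait until it holds a plain count again before bumping it. The sketch below models that pattern with C11 atomics in a single thread (no preemption control, arbitrary sentinel); it is a simplified illustration, not the kernel code.

/* freeze-by-sentinel refcount, modelled with C11 atomics (illustrative) */
#include <limits.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define LOCKED_MAGIC	INT_MIN		/* sentinel: "refcount is frozen" */

struct workgroup {
	atomic_int refcount;
};

static bool try_to_freeze(struct workgroup *grp, int expected)
{
	int old = expected;

	/* succeeds only if nobody changed the count meanwhile */
	return atomic_compare_exchange_strong(&grp->refcount, &old, LOCKED_MAGIC);
}

static void unfreeze(struct workgroup *grp, int orig_val)
{
	/* release ordering publishes all modifications made while frozen */
	atomic_store_explicit(&grp->refcount, orig_val, memory_order_release);
}

static bool try_get(struct workgroup *grp)
{
	int o = atomic_load(&grp->refcount);

	/* a frozen or dying workgroup cannot be grabbed */
	if (o == LOCKED_MAGIC || o <= 0)
		return false;
	return atomic_compare_exchange_strong(&grp->refcount, &o, o + 1);
}

int main(void)
{
	struct workgroup grp = { .refcount = 1 };

	if (try_to_freeze(&grp, 1)) {
		printf("frozen; concurrent get fails: %d\n", !try_get(&grp));
		unfreeze(&grp, 1);
	}
	printf("after unfreeze, get succeeds: %d\n", try_get(&grp));
	return 0;
}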
diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c
new file mode 100644
index 000000000000..a8286998a079
--- /dev/null
+++ b/fs/erofs/xattr.c
@@ -0,0 +1,703 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2017-2018 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#include <linux/security.h>
8#include "xattr.h"
9
10struct xattr_iter {
11 struct super_block *sb;
12 struct page *page;
13 void *kaddr;
14
15 erofs_blk_t blkaddr;
16 unsigned int ofs;
17};
18
19static inline void xattr_iter_end(struct xattr_iter *it, bool atomic)
20{
21 /* the only user of kunmap() is 'init_inode_xattrs' */
22 if (unlikely(!atomic))
23 kunmap(it->page);
24 else
25 kunmap_atomic(it->kaddr);
26
27 unlock_page(it->page);
28 put_page(it->page);
29}
30
31static inline void xattr_iter_end_final(struct xattr_iter *it)
32{
33 if (!it->page)
34 return;
35
36 xattr_iter_end(it, true);
37}
38
39static int init_inode_xattrs(struct inode *inode)
40{
41 struct erofs_vnode *const vi = EROFS_V(inode);
42 struct xattr_iter it;
43 unsigned int i;
44 struct erofs_xattr_ibody_header *ih;
45 struct super_block *sb;
46 struct erofs_sb_info *sbi;
47 bool atomic_map;
48 int ret = 0;
49
50	/* in most cases, the xattrs of this inode have already been initialized */
51 if (test_bit(EROFS_V_EA_INITED_BIT, &vi->flags))
52 return 0;
53
54 if (wait_on_bit_lock(&vi->flags, EROFS_V_BL_XATTR_BIT, TASK_KILLABLE))
55 return -ERESTARTSYS;
56
57 /* someone has initialized xattrs for us? */
58 if (test_bit(EROFS_V_EA_INITED_BIT, &vi->flags))
59 goto out_unlock;
60
61 /*
62 * bypass all xattr operations if ->xattr_isize is not greater than
63 * sizeof(struct erofs_xattr_ibody_header), in detail:
64	 * 1) if it is not large enough to contain erofs_xattr_ibody_header,
65	 *    ->xattr_isize should be 0 (which means no xattrs);
66	 * 2) if it just contains erofs_xattr_ibody_header, the on-disk layout
67	 *    is currently undefined (it may be used later with a new sb feature).
68 */
69 if (vi->xattr_isize == sizeof(struct erofs_xattr_ibody_header)) {
70 errln("xattr_isize %d of nid %llu is not supported yet",
71 vi->xattr_isize, vi->nid);
72 ret = -EOPNOTSUPP;
73 goto out_unlock;
74 } else if (vi->xattr_isize < sizeof(struct erofs_xattr_ibody_header)) {
75 if (unlikely(vi->xattr_isize)) {
76 errln("bogus xattr ibody @ nid %llu", vi->nid);
77 DBG_BUGON(1);
78 ret = -EFSCORRUPTED;
79 goto out_unlock; /* xattr ondisk layout error */
80 }
81 ret = -ENOATTR;
82 goto out_unlock;
83 }
84
85 sb = inode->i_sb;
86 sbi = EROFS_SB(sb);
87 it.blkaddr = erofs_blknr(iloc(sbi, vi->nid) + vi->inode_isize);
88 it.ofs = erofs_blkoff(iloc(sbi, vi->nid) + vi->inode_isize);
89
90 it.page = erofs_get_inline_page(inode, it.blkaddr);
91 if (IS_ERR(it.page)) {
92 ret = PTR_ERR(it.page);
93 goto out_unlock;
94 }
95
96 /* read in shared xattr array (non-atomic, see kmalloc below) */
97 it.kaddr = kmap(it.page);
98 atomic_map = false;
99
100 ih = (struct erofs_xattr_ibody_header *)(it.kaddr + it.ofs);
101
102 vi->xattr_shared_count = ih->h_shared_count;
103 vi->xattr_shared_xattrs = kmalloc_array(vi->xattr_shared_count,
104 sizeof(uint), GFP_KERNEL);
105 if (!vi->xattr_shared_xattrs) {
106 xattr_iter_end(&it, atomic_map);
107 ret = -ENOMEM;
108 goto out_unlock;
109 }
110
111 /* let's skip ibody header */
112 it.ofs += sizeof(struct erofs_xattr_ibody_header);
113
114 for (i = 0; i < vi->xattr_shared_count; ++i) {
115 if (unlikely(it.ofs >= EROFS_BLKSIZ)) {
116 /* cannot be unaligned */
117 DBG_BUGON(it.ofs != EROFS_BLKSIZ);
118 xattr_iter_end(&it, atomic_map);
119
120 it.page = erofs_get_meta_page(sb, ++it.blkaddr,
121 S_ISDIR(inode->i_mode));
122 if (IS_ERR(it.page)) {
123 kfree(vi->xattr_shared_xattrs);
124 vi->xattr_shared_xattrs = NULL;
125 ret = PTR_ERR(it.page);
126 goto out_unlock;
127 }
128
129 it.kaddr = kmap_atomic(it.page);
130 atomic_map = true;
131 it.ofs = 0;
132 }
133 vi->xattr_shared_xattrs[i] =
134 le32_to_cpu(*(__le32 *)(it.kaddr + it.ofs));
135 it.ofs += sizeof(__le32);
136 }
137 xattr_iter_end(&it, atomic_map);
138
139 set_bit(EROFS_V_EA_INITED_BIT, &vi->flags);
140
141out_unlock:
142 clear_and_wake_up_bit(EROFS_V_BL_XATTR_BIT, &vi->flags);
143 return ret;
144}
145
146/*
147 * the general idea for these return values is
148 * if 0 is returned, go on processing the current xattr;
149 * if 1 (> 0) is returned, skip this round and process the next xattr;
150 * if -err (< 0) is returned, an error (maybe ENOATTR) occurred
151 * and needs to be handled
152 */
153struct xattr_iter_handlers {
154 int (*entry)(struct xattr_iter *_it, struct erofs_xattr_entry *entry);
155 int (*name)(struct xattr_iter *_it, unsigned int processed, char *buf,
156 unsigned int len);
157 int (*alloc_buffer)(struct xattr_iter *_it, unsigned int value_sz);
158 void (*value)(struct xattr_iter *_it, unsigned int processed, char *buf,
159 unsigned int len);
160};
161
162static inline int xattr_iter_fixup(struct xattr_iter *it)
163{
164 if (it->ofs < EROFS_BLKSIZ)
165 return 0;
166
167 xattr_iter_end(it, true);
168
169 it->blkaddr += erofs_blknr(it->ofs);
170
171 it->page = erofs_get_meta_page(it->sb, it->blkaddr, false);
172 if (IS_ERR(it->page)) {
173 int err = PTR_ERR(it->page);
174
175 it->page = NULL;
176 return err;
177 }
178
179 it->kaddr = kmap_atomic(it->page);
180 it->ofs = erofs_blkoff(it->ofs);
181 return 0;
182}
183
184static int inline_xattr_iter_begin(struct xattr_iter *it,
185 struct inode *inode)
186{
187 struct erofs_vnode *const vi = EROFS_V(inode);
188 struct erofs_sb_info *const sbi = EROFS_SB(inode->i_sb);
189 unsigned int xattr_header_sz, inline_xattr_ofs;
190
191 xattr_header_sz = inlinexattr_header_size(inode);
192 if (unlikely(xattr_header_sz >= vi->xattr_isize)) {
193 DBG_BUGON(xattr_header_sz > vi->xattr_isize);
194 return -ENOATTR;
195 }
196
197 inline_xattr_ofs = vi->inode_isize + xattr_header_sz;
198
199 it->blkaddr = erofs_blknr(iloc(sbi, vi->nid) + inline_xattr_ofs);
200 it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs);
201
202 it->page = erofs_get_inline_page(inode, it->blkaddr);
203 if (IS_ERR(it->page))
204 return PTR_ERR(it->page);
205
206 it->kaddr = kmap_atomic(it->page);
207 return vi->xattr_isize - xattr_header_sz;
208}
209
210/*
211 * Regardless of success or failure, `xattr_foreach' will end up with
212 * `ofs' pointing to the next xattr item rather than an arbitrary position.
213 */
214static int xattr_foreach(struct xattr_iter *it,
215 const struct xattr_iter_handlers *op,
216 unsigned int *tlimit)
217{
218 struct erofs_xattr_entry entry;
219 unsigned int value_sz, processed, slice;
220 int err;
221
222 /* 0. fixup blkaddr, ofs, ipage */
223 err = xattr_iter_fixup(it);
224 if (err)
225 return err;
226
227 /*
228	 * 1. read the xattr entry header into memory; since we do
229	 *    EROFS_XATTR_ALIGN, the fixed-size entry header
230	 *    cannot cross a page boundary
231 */
232 entry = *(struct erofs_xattr_entry *)(it->kaddr + it->ofs);
233 if (tlimit) {
234 unsigned int entry_sz = EROFS_XATTR_ENTRY_SIZE(&entry);
235
236 /* xattr on-disk corruption: xattr entry beyond xattr_isize */
237 if (unlikely(*tlimit < entry_sz)) {
238 DBG_BUGON(1);
239 return -EFSCORRUPTED;
240 }
241 *tlimit -= entry_sz;
242 }
243
244 it->ofs += sizeof(struct erofs_xattr_entry);
245 value_sz = le16_to_cpu(entry.e_value_size);
246
247 /* handle entry */
248 err = op->entry(it, &entry);
249 if (err) {
250 it->ofs += entry.e_name_len + value_sz;
251 goto out;
252 }
253
254 /* 2. handle xattr name (ofs will finally be at the end of name) */
255 processed = 0;
256
257 while (processed < entry.e_name_len) {
258 if (it->ofs >= EROFS_BLKSIZ) {
259 DBG_BUGON(it->ofs > EROFS_BLKSIZ);
260
261 err = xattr_iter_fixup(it);
262 if (err)
263 goto out;
264 it->ofs = 0;
265 }
266
267 slice = min_t(unsigned int, PAGE_SIZE - it->ofs,
268 entry.e_name_len - processed);
269
270 /* handle name */
271 err = op->name(it, processed, it->kaddr + it->ofs, slice);
272 if (err) {
273 it->ofs += entry.e_name_len - processed + value_sz;
274 goto out;
275 }
276
277 it->ofs += slice;
278 processed += slice;
279 }
280
281 /* 3. handle xattr value */
282 processed = 0;
283
284 if (op->alloc_buffer) {
285 err = op->alloc_buffer(it, value_sz);
286 if (err) {
287 it->ofs += value_sz;
288 goto out;
289 }
290 }
291
292 while (processed < value_sz) {
293 if (it->ofs >= EROFS_BLKSIZ) {
294 DBG_BUGON(it->ofs > EROFS_BLKSIZ);
295
296 err = xattr_iter_fixup(it);
297 if (err)
298 goto out;
299 it->ofs = 0;
300 }
301
302 slice = min_t(unsigned int, PAGE_SIZE - it->ofs,
303 value_sz - processed);
304 op->value(it, processed, it->kaddr + it->ofs, slice);
305 it->ofs += slice;
306 processed += slice;
307 }
308
309out:
310 /* xattrs should be 4-byte aligned (on-disk constraint) */
311 it->ofs = EROFS_XATTR_ALIGN(it->ofs);
312 return err < 0 ? err : 0;
313}
314
315struct getxattr_iter {
316 struct xattr_iter it;
317
318 char *buffer;
319 int buffer_size, index;
320 struct qstr name;
321};
322
323static int xattr_entrymatch(struct xattr_iter *_it,
324 struct erofs_xattr_entry *entry)
325{
326 struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it);
327
328 return (it->index != entry->e_name_index ||
329 it->name.len != entry->e_name_len) ? -ENOATTR : 0;
330}
331
332static int xattr_namematch(struct xattr_iter *_it,
333 unsigned int processed, char *buf, unsigned int len)
334{
335 struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it);
336
337 return memcmp(buf, it->name.name + processed, len) ? -ENOATTR : 0;
338}
339
340static int xattr_checkbuffer(struct xattr_iter *_it,
341 unsigned int value_sz)
342{
343 struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it);
344 int err = it->buffer_size < value_sz ? -ERANGE : 0;
345
346 it->buffer_size = value_sz;
347 return !it->buffer ? 1 : err;
348}
349
350static void xattr_copyvalue(struct xattr_iter *_it,
351 unsigned int processed,
352 char *buf, unsigned int len)
353{
354 struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it);
355
356 memcpy(it->buffer + processed, buf, len);
357}
358
359static const struct xattr_iter_handlers find_xattr_handlers = {
360 .entry = xattr_entrymatch,
361 .name = xattr_namematch,
362 .alloc_buffer = xattr_checkbuffer,
363 .value = xattr_copyvalue
364};
365
366static int inline_getxattr(struct inode *inode, struct getxattr_iter *it)
367{
368 int ret;
369 unsigned int remaining;
370
371 ret = inline_xattr_iter_begin(&it->it, inode);
372 if (ret < 0)
373 return ret;
374
375 remaining = ret;
376 while (remaining) {
377 ret = xattr_foreach(&it->it, &find_xattr_handlers, &remaining);
378 if (ret != -ENOATTR)
379 break;
380 }
381 xattr_iter_end_final(&it->it);
382
383 return ret ? ret : it->buffer_size;
384}
385
386static int shared_getxattr(struct inode *inode, struct getxattr_iter *it)
387{
388 struct erofs_vnode *const vi = EROFS_V(inode);
389 struct super_block *const sb = inode->i_sb;
390 struct erofs_sb_info *const sbi = EROFS_SB(sb);
391 unsigned int i;
392 int ret = -ENOATTR;
393
394 for (i = 0; i < vi->xattr_shared_count; ++i) {
395 erofs_blk_t blkaddr =
396 xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]);
397
398 it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]);
399
400 if (!i || blkaddr != it->it.blkaddr) {
401 if (i)
402 xattr_iter_end(&it->it, true);
403
404 it->it.page = erofs_get_meta_page(sb, blkaddr, false);
405 if (IS_ERR(it->it.page))
406 return PTR_ERR(it->it.page);
407
408 it->it.kaddr = kmap_atomic(it->it.page);
409 it->it.blkaddr = blkaddr;
410 }
411
412 ret = xattr_foreach(&it->it, &find_xattr_handlers, NULL);
413 if (ret != -ENOATTR)
414 break;
415 }
416 if (vi->xattr_shared_count)
417 xattr_iter_end_final(&it->it);
418
419 return ret ? ret : it->buffer_size;
420}
421
422static bool erofs_xattr_user_list(struct dentry *dentry)
423{
424 return test_opt(EROFS_SB(dentry->d_sb), XATTR_USER);
425}
426
427static bool erofs_xattr_trusted_list(struct dentry *dentry)
428{
429 return capable(CAP_SYS_ADMIN);
430}
431
432int erofs_getxattr(struct inode *inode, int index,
433 const char *name,
434 void *buffer, size_t buffer_size)
435{
436 int ret;
437 struct getxattr_iter it;
438
439 if (unlikely(!name))
440 return -EINVAL;
441
442 ret = init_inode_xattrs(inode);
443 if (ret)
444 return ret;
445
446 it.index = index;
447
448 it.name.len = strlen(name);
449 if (it.name.len > EROFS_NAME_LEN)
450 return -ERANGE;
451 it.name.name = name;
452
453 it.buffer = buffer;
454 it.buffer_size = buffer_size;
455
456 it.it.sb = inode->i_sb;
457 ret = inline_getxattr(inode, &it);
458 if (ret == -ENOATTR)
459 ret = shared_getxattr(inode, &it);
460 return ret;
461}
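/*
 * Editor's note: a hedged userspace sketch (not part of this file) of the
 * two-pass pattern erofs_getxattr() supports -- passing a NULL buffer makes
 * xattr_checkbuffer() report the value size, which mirrors getxattr(2)
 * called with size 0.  The demo_* names below are illustrative only.
 */
#include <stdlib.h>
#include <sys/xattr.h>

static void *demo_read_xattr(const char *path, const char *name,
			     ssize_t *out_len)
{
	ssize_t len = getxattr(path, name, NULL, 0);	/* probe the size */
	void *val;

	if (len < 0)
		return NULL;
	val = malloc(len ? len : 1);
	if (val && getxattr(path, name, val, len) != len) {
		free(val);
		val = NULL;
	}
	*out_len = len;
	return val;
}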
462
463static int erofs_xattr_generic_get(const struct xattr_handler *handler,
464 struct dentry *unused, struct inode *inode,
465 const char *name, void *buffer, size_t size)
466{
467 struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
468
469 switch (handler->flags) {
470 case EROFS_XATTR_INDEX_USER:
471 if (!test_opt(sbi, XATTR_USER))
472 return -EOPNOTSUPP;
473 break;
474 case EROFS_XATTR_INDEX_TRUSTED:
475 if (!capable(CAP_SYS_ADMIN))
476 return -EPERM;
477 break;
478 case EROFS_XATTR_INDEX_SECURITY:
479 break;
480 default:
481 return -EINVAL;
482 }
483
484 return erofs_getxattr(inode, handler->flags, name, buffer, size);
485}
486
487const struct xattr_handler erofs_xattr_user_handler = {
488 .prefix = XATTR_USER_PREFIX,
489 .flags = EROFS_XATTR_INDEX_USER,
490 .list = erofs_xattr_user_list,
491 .get = erofs_xattr_generic_get,
492};
493
494const struct xattr_handler erofs_xattr_trusted_handler = {
495 .prefix = XATTR_TRUSTED_PREFIX,
496 .flags = EROFS_XATTR_INDEX_TRUSTED,
497 .list = erofs_xattr_trusted_list,
498 .get = erofs_xattr_generic_get,
499};
500
501#ifdef CONFIG_EROFS_FS_SECURITY
502const struct xattr_handler __maybe_unused erofs_xattr_security_handler = {
503 .prefix = XATTR_SECURITY_PREFIX,
504 .flags = EROFS_XATTR_INDEX_SECURITY,
505 .get = erofs_xattr_generic_get,
506};
507#endif
508
509const struct xattr_handler *erofs_xattr_handlers[] = {
510 &erofs_xattr_user_handler,
511#ifdef CONFIG_EROFS_FS_POSIX_ACL
512 &posix_acl_access_xattr_handler,
513 &posix_acl_default_xattr_handler,
514#endif
515 &erofs_xattr_trusted_handler,
516#ifdef CONFIG_EROFS_FS_SECURITY
517 &erofs_xattr_security_handler,
518#endif
519 NULL,
520};
521
522struct listxattr_iter {
523 struct xattr_iter it;
524
525 struct dentry *dentry;
526 char *buffer;
527 int buffer_size, buffer_ofs;
528};
529
530static int xattr_entrylist(struct xattr_iter *_it,
531 struct erofs_xattr_entry *entry)
532{
533 struct listxattr_iter *it =
534 container_of(_it, struct listxattr_iter, it);
535 unsigned int prefix_len;
536 const char *prefix;
537
538 const struct xattr_handler *h =
539 erofs_xattr_handler(entry->e_name_index);
540
541 if (!h || (h->list && !h->list(it->dentry)))
542 return 1;
543
544 prefix = xattr_prefix(h);
545 prefix_len = strlen(prefix);
546
547 if (!it->buffer) {
548 it->buffer_ofs += prefix_len + entry->e_name_len + 1;
549 return 1;
550 }
551
552 if (it->buffer_ofs + prefix_len
553 + entry->e_name_len + 1 > it->buffer_size)
554 return -ERANGE;
555
556 memcpy(it->buffer + it->buffer_ofs, prefix, prefix_len);
557 it->buffer_ofs += prefix_len;
558 return 0;
559}
560
561static int xattr_namelist(struct xattr_iter *_it,
562 unsigned int processed, char *buf, unsigned int len)
563{
564 struct listxattr_iter *it =
565 container_of(_it, struct listxattr_iter, it);
566
567 memcpy(it->buffer + it->buffer_ofs, buf, len);
568 it->buffer_ofs += len;
569 return 0;
570}
571
572static int xattr_skipvalue(struct xattr_iter *_it,
573 unsigned int value_sz)
574{
575 struct listxattr_iter *it =
576 container_of(_it, struct listxattr_iter, it);
577
578 it->buffer[it->buffer_ofs++] = '\0';
579 return 1;
580}
581
582static const struct xattr_iter_handlers list_xattr_handlers = {
583 .entry = xattr_entrylist,
584 .name = xattr_namelist,
585 .alloc_buffer = xattr_skipvalue,
586 .value = NULL
587};
588
589static int inline_listxattr(struct listxattr_iter *it)
590{
591 int ret;
592 unsigned int remaining;
593
594 ret = inline_xattr_iter_begin(&it->it, d_inode(it->dentry));
595 if (ret < 0)
596 return ret;
597
598 remaining = ret;
599 while (remaining) {
600 ret = xattr_foreach(&it->it, &list_xattr_handlers, &remaining);
601 if (ret)
602 break;
603 }
604 xattr_iter_end_final(&it->it);
605 return ret ? ret : it->buffer_ofs;
606}
607
608static int shared_listxattr(struct listxattr_iter *it)
609{
610 struct inode *const inode = d_inode(it->dentry);
611 struct erofs_vnode *const vi = EROFS_V(inode);
612 struct super_block *const sb = inode->i_sb;
613 struct erofs_sb_info *const sbi = EROFS_SB(sb);
614 unsigned int i;
615 int ret = 0;
616
617 for (i = 0; i < vi->xattr_shared_count; ++i) {
618 erofs_blk_t blkaddr =
619 xattrblock_addr(sbi, vi->xattr_shared_xattrs[i]);
620
621 it->it.ofs = xattrblock_offset(sbi, vi->xattr_shared_xattrs[i]);
622 if (!i || blkaddr != it->it.blkaddr) {
623 if (i)
624 xattr_iter_end(&it->it, true);
625
626 it->it.page = erofs_get_meta_page(sb, blkaddr, false);
627 if (IS_ERR(it->it.page))
628 return PTR_ERR(it->it.page);
629
630 it->it.kaddr = kmap_atomic(it->it.page);
631 it->it.blkaddr = blkaddr;
632 }
633
634 ret = xattr_foreach(&it->it, &list_xattr_handlers, NULL);
635 if (ret)
636 break;
637 }
638 if (vi->xattr_shared_count)
639 xattr_iter_end_final(&it->it);
640
641 return ret ? ret : it->buffer_ofs;
642}
643
644ssize_t erofs_listxattr(struct dentry *dentry,
645 char *buffer, size_t buffer_size)
646{
647 int ret;
648 struct listxattr_iter it;
649
650 ret = init_inode_xattrs(d_inode(dentry));
651 if (ret)
652 return ret;
653
654 it.dentry = dentry;
655 it.buffer = buffer;
656 it.buffer_size = buffer_size;
657 it.buffer_ofs = 0;
658
659 it.it.sb = dentry->d_sb;
660
661 ret = inline_listxattr(&it);
662 if (ret < 0 && ret != -ENOATTR)
663 return ret;
664 return shared_listxattr(&it);
665}
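/*
 * Editor's note: a hedged userspace sketch (not part of this file) showing
 * how the NUL-separated "prefix+name" list built by xattr_entrylist()/
 * xattr_namelist()/xattr_skipvalue() is typically consumed via listxattr(2);
 * the initial NULL-buffer probe mirrors the !it->buffer path above.
 */
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/xattr.h>

static void demo_dump_xattr_names(const char *path)
{
	ssize_t len = listxattr(path, NULL, 0);	/* probe the list size */
	char *buf, *p;

	if (len <= 0)
		return;
	buf = malloc(len);
	if (!buf)
		return;
	len = listxattr(path, buf, len);
	for (p = buf; len > 0 && p < buf + len; p += strlen(p) + 1)
		printf("%s\n", p);	/* e.g. "user.foo", "security.selinux" */
	free(buf);
}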
666
667#ifdef CONFIG_EROFS_FS_POSIX_ACL
668struct posix_acl *erofs_get_acl(struct inode *inode, int type)
669{
670 struct posix_acl *acl;
671 int prefix, rc;
672 char *value = NULL;
673
674 switch (type) {
675 case ACL_TYPE_ACCESS:
676 prefix = EROFS_XATTR_INDEX_POSIX_ACL_ACCESS;
677 break;
678 case ACL_TYPE_DEFAULT:
679 prefix = EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT;
680 break;
681 default:
682 return ERR_PTR(-EINVAL);
683 }
684
685 rc = erofs_getxattr(inode, prefix, "", NULL, 0);
686 if (rc > 0) {
687 value = kmalloc(rc, GFP_KERNEL);
688 if (!value)
689 return ERR_PTR(-ENOMEM);
690 rc = erofs_getxattr(inode, prefix, "", value, rc);
691 }
692
693 if (rc == -ENOATTR)
694 acl = NULL;
695 else if (rc < 0)
696 acl = ERR_PTR(rc);
697 else
698 acl = posix_acl_from_xattr(&init_user_ns, value, rc);
699 kfree(value);
700 return acl;
701}
702#endif
703
diff --git a/fs/erofs/xattr.h b/fs/erofs/xattr.h
new file mode 100644
index 000000000000..c5ca47d814dd
--- /dev/null
+++ b/fs/erofs/xattr.h
@@ -0,0 +1,92 @@
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * Copyright (C) 2017-2018 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#ifndef __EROFS_XATTR_H
8#define __EROFS_XATTR_H
9
10#include "internal.h"
11#include <linux/posix_acl_xattr.h>
12#include <linux/xattr.h>
13
14/* Attribute not found */
15#define ENOATTR ENODATA
16
17static inline unsigned int inlinexattr_header_size(struct inode *inode)
18{
19 return sizeof(struct erofs_xattr_ibody_header)
20 + sizeof(u32) * EROFS_V(inode)->xattr_shared_count;
21}
22
23static inline erofs_blk_t xattrblock_addr(struct erofs_sb_info *sbi,
24 unsigned int xattr_id)
25{
26#ifdef CONFIG_EROFS_FS_XATTR
27 return sbi->xattr_blkaddr +
28 xattr_id * sizeof(__u32) / EROFS_BLKSIZ;
29#else
30 return 0;
31#endif
32}
33
34static inline unsigned int xattrblock_offset(struct erofs_sb_info *sbi,
35 unsigned int xattr_id)
36{
37 return (xattr_id * sizeof(__u32)) % EROFS_BLKSIZ;
38}
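/*
 * Editor's note: a shared xattr id is an offset in 4-byte units from
 * xattr_blkaddr.  As a worked example, assuming 4KiB blocks
 * (EROFS_BLKSIZ == 4096), xattr_id 1030 maps to block
 * xattr_blkaddr + (1030 * 4) / 4096 == xattr_blkaddr + 1,
 * at byte offset (1030 * 4) % 4096 == 24.
 */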
39
40#ifdef CONFIG_EROFS_FS_XATTR
41extern const struct xattr_handler erofs_xattr_user_handler;
42extern const struct xattr_handler erofs_xattr_trusted_handler;
43#ifdef CONFIG_EROFS_FS_SECURITY
44extern const struct xattr_handler erofs_xattr_security_handler;
45#endif
46
47static inline const struct xattr_handler *erofs_xattr_handler(unsigned int idx)
48{
49static const struct xattr_handler *xattr_handler_map[] = {
50 [EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler,
51#ifdef CONFIG_EROFS_FS_POSIX_ACL
52 [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler,
53 [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] =
54 &posix_acl_default_xattr_handler,
55#endif
56 [EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler,
57#ifdef CONFIG_EROFS_FS_SECURITY
58 [EROFS_XATTR_INDEX_SECURITY] = &erofs_xattr_security_handler,
59#endif
60};
61
62 return idx && idx < ARRAY_SIZE(xattr_handler_map) ?
63 xattr_handler_map[idx] : NULL;
64}
65
66extern const struct xattr_handler *erofs_xattr_handlers[];
67
68int erofs_getxattr(struct inode *, int, const char *, void *, size_t);
69ssize_t erofs_listxattr(struct dentry *, char *, size_t);
70#else
71static inline int erofs_getxattr(struct inode *inode, int index,
72 const char *name, void *buffer,
73 size_t buffer_size)
74{
75 return -EOPNOTSUPP;
76}
77
78static inline ssize_t erofs_listxattr(struct dentry *dentry,
79 char *buffer, size_t buffer_size)
80{
81 return -EOPNOTSUPP;
82}
83#endif /* !CONFIG_EROFS_FS_XATTR */
84
85#ifdef CONFIG_EROFS_FS_POSIX_ACL
86struct posix_acl *erofs_get_acl(struct inode *inode, int type);
87#else
88#define erofs_get_acl (NULL)
89#endif
90
91#endif
92
diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c
new file mode 100644
index 000000000000..b32ad585237c
--- /dev/null
+++ b/fs/erofs/zdata.c
@@ -0,0 +1,1432 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2018 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#include "zdata.h"
8#include "compress.h"
9#include <linux/prefetch.h>
10
11#include <trace/events/erofs.h>
12
13/*
14 * a compressed_pages[] placeholder in order to avoid
15 * being filled with file pages for in-place decompression.
16 */
17#define PAGE_UNALLOCATED ((void *)0x5F0E4B1D)
18
19/* how to allocate cached pages for a pcluster */
20enum z_erofs_cache_alloctype {
21 DONTALLOC, /* don't allocate any cached pages */
22 DELAYEDALLOC, /* delayed allocation (at the time of submitting io) */
23};
24
25/*
26 * tagged pointer with 1-bit tag for all compressed pages
27 * tag 1 - the page is just found with an extra page reference
28 */
29typedef tagptr1_t compressed_page_t;
30
31#define tag_compressed_page_justfound(page) \
32 tagptr_fold(compressed_page_t, page, 1)
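/*
 * Editor's note: an illustrative, self-contained sketch of the 1-bit
 * tagged-pointer idea used here (not the actual tagptr.h helpers).  It
 * assumes pointers are at least 2-byte aligned so the lowest bit is free;
 * the demo_* names are made up for illustration.
 */
#include <stdint.h>

static inline uintptr_t demo_fold(void *ptr, unsigned int tag)
{
	return (uintptr_t)ptr | (tag & 1);	/* stash the tag in the low bit */
}

static inline void *demo_unfold_ptr(uintptr_t t)
{
	return (void *)(t & ~(uintptr_t)1);	/* mask the tag off */
}

static inline unsigned int demo_unfold_tag(uintptr_t t)
{
	return t & 1;
}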
33
34static struct workqueue_struct *z_erofs_workqueue __read_mostly;
35static struct kmem_cache *pcluster_cachep __read_mostly;
36
37void z_erofs_exit_zip_subsystem(void)
38{
39 destroy_workqueue(z_erofs_workqueue);
40 kmem_cache_destroy(pcluster_cachep);
41}
42
43static inline int init_unzip_workqueue(void)
44{
45 const unsigned int onlinecpus = num_possible_cpus();
46 const unsigned int flags = WQ_UNBOUND | WQ_HIGHPRI | WQ_CPU_INTENSIVE;
47
48 /*
49	 * no need to spawn too many threads; limiting threads can minimize
50	 * scheduling overhead, perhaps per-CPU threads would be better?
51 */
52 z_erofs_workqueue = alloc_workqueue("erofs_unzipd", flags,
53 onlinecpus + onlinecpus / 4);
54 return z_erofs_workqueue ? 0 : -ENOMEM;
55}
56
57static void init_once(void *ptr)
58{
59 struct z_erofs_pcluster *pcl = ptr;
60 struct z_erofs_collection *cl = z_erofs_primarycollection(pcl);
61 unsigned int i;
62
63 mutex_init(&cl->lock);
64 cl->nr_pages = 0;
65 cl->vcnt = 0;
66 for (i = 0; i < Z_EROFS_CLUSTER_MAX_PAGES; ++i)
67 pcl->compressed_pages[i] = NULL;
68}
69
70static void init_always(struct z_erofs_pcluster *pcl)
71{
72 struct z_erofs_collection *cl = z_erofs_primarycollection(pcl);
73
74 atomic_set(&pcl->obj.refcount, 1);
75
76 DBG_BUGON(cl->nr_pages);
77 DBG_BUGON(cl->vcnt);
78}
79
80int __init z_erofs_init_zip_subsystem(void)
81{
82 pcluster_cachep = kmem_cache_create("erofs_compress",
83 Z_EROFS_WORKGROUP_SIZE, 0,
84 SLAB_RECLAIM_ACCOUNT, init_once);
85 if (pcluster_cachep) {
86 if (!init_unzip_workqueue())
87 return 0;
88
89 kmem_cache_destroy(pcluster_cachep);
90 }
91 return -ENOMEM;
92}
93
94enum z_erofs_collectmode {
95 COLLECT_SECONDARY,
96 COLLECT_PRIMARY,
97 /*
98 * The current collection was the tail of an existing chain, and
99 * the previously processed chained collections have all been decided
100 * to be hooked up to it.
101 * A new chain will be created for the remaining collections which are
102 * not processed yet; therefore, unlike COLLECT_PRIMARY_FOLLOWED,
103 * the next collection cannot reuse the whole page safely in
104 * the following scenario:
105 * ________________________________________________________________
106 * | tail (partial) page | head (partial) page |
107 * | (belongs to the next cl) | (belongs to the current cl) |
108 * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________|
109 */
110 COLLECT_PRIMARY_HOOKED,
111 COLLECT_PRIMARY_FOLLOWED_NOINPLACE,
112 /*
113 * The current collection has been linked with the owned chain, and
114 * could also be linked with the remaining collections, which means
115 * that if the page being processed is the tail page of the collection,
116 * the current collection can safely use the whole page (since
117 * the previous collection is under control) for in-place I/O, as
118 * illustrated below:
119 * ________________________________________________________________
120 * | tail (partial) page | head (partial) page |
121 * | (of the current cl) | (of the previous collection) |
122 * | PRIMARY_FOLLOWED or | |
123 * |_____PRIMARY_HOOKED___|____________PRIMARY_FOLLOWED____________|
124 *
125 * [ (*) the above page can be used as inplace I/O. ]
126 */
127 COLLECT_PRIMARY_FOLLOWED,
128};
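/*
 * Editor's note: the enum order above is meaningful -- later code compares
 * modes numerically (e.g. "clt->mode < COLLECT_PRIMARY_FOLLOWED" in
 * preload_compressed_pages() and "clt->mode >= COLLECT_PRIMARY_HOOKED" in
 * z_erofs_do_read_page()), so any new mode must preserve this ordering.
 */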
129
130struct z_erofs_collector {
131 struct z_erofs_pagevec_ctor vector;
132
133 struct z_erofs_pcluster *pcl, *tailpcl;
134 struct z_erofs_collection *cl;
135 struct page **compressedpages;
136 z_erofs_next_pcluster_t owned_head;
137
138 enum z_erofs_collectmode mode;
139};
140
141struct z_erofs_decompress_frontend {
142 struct inode *const inode;
143
144 struct z_erofs_collector clt;
145 struct erofs_map_blocks map;
146
147 /* used for applying cache strategy on the fly */
148 bool backmost;
149 erofs_off_t headoffset;
150};
151
152#define COLLECTOR_INIT() { \
153 .owned_head = Z_EROFS_PCLUSTER_TAIL, \
154 .mode = COLLECT_PRIMARY_FOLLOWED }
155
156#define DECOMPRESS_FRONTEND_INIT(__i) { \
157 .inode = __i, .clt = COLLECTOR_INIT(), \
158 .backmost = true, }
159
160static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES];
161static DEFINE_MUTEX(z_pagemap_global_lock);
162
163static void preload_compressed_pages(struct z_erofs_collector *clt,
164 struct address_space *mc,
165 enum z_erofs_cache_alloctype type,
166 struct list_head *pagepool)
167{
168 const struct z_erofs_pcluster *pcl = clt->pcl;
169 const unsigned int clusterpages = BIT(pcl->clusterbits);
170 struct page **pages = clt->compressedpages;
171 pgoff_t index = pcl->obj.index + (pages - pcl->compressed_pages);
172 bool standalone = true;
173
174 if (clt->mode < COLLECT_PRIMARY_FOLLOWED)
175 return;
176
177 for (; pages < pcl->compressed_pages + clusterpages; ++pages) {
178 struct page *page;
179 compressed_page_t t;
180
181 /* the compressed page was loaded before */
182 if (READ_ONCE(*pages))
183 continue;
184
185 page = find_get_page(mc, index);
186
187 if (page) {
188 t = tag_compressed_page_justfound(page);
189 } else if (type == DELAYEDALLOC) {
190 t = tagptr_init(compressed_page_t, PAGE_UNALLOCATED);
191 } else { /* DONTALLOC */
192 if (standalone)
193 clt->compressedpages = pages;
194 standalone = false;
195 continue;
196 }
197
198 if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t)))
199 continue;
200
201 if (page)
202 put_page(page);
203 }
204
205 if (standalone) /* downgrade to PRIMARY_FOLLOWED_NOINPLACE */
206 clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE;
207}
208
209/* called by erofs_shrinker to get rid of all compressed_pages */
210int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi,
211 struct erofs_workgroup *grp)
212{
213 struct z_erofs_pcluster *const pcl =
214 container_of(grp, struct z_erofs_pcluster, obj);
215 struct address_space *const mapping = MNGD_MAPPING(sbi);
216 const unsigned int clusterpages = BIT(pcl->clusterbits);
217 int i;
218
219 /*
220	 * the workgroup's refcount is now frozen at 1,
221 * therefore no need to worry about available decompression users.
222 */
223 for (i = 0; i < clusterpages; ++i) {
224 struct page *page = pcl->compressed_pages[i];
225
226 if (!page)
227 continue;
228
229 /* block other users from reclaiming or migrating the page */
230 if (!trylock_page(page))
231 return -EBUSY;
232
233 if (unlikely(page->mapping != mapping))
234 continue;
235
236 /* barrier is implied in the following 'unlock_page' */
237 WRITE_ONCE(pcl->compressed_pages[i], NULL);
238 set_page_private(page, 0);
239 ClearPagePrivate(page);
240
241 unlock_page(page);
242 put_page(page);
243 }
244 return 0;
245}
246
247int erofs_try_to_free_cached_page(struct address_space *mapping,
248 struct page *page)
249{
250 struct z_erofs_pcluster *const pcl = (void *)page_private(page);
251 const unsigned int clusterpages = BIT(pcl->clusterbits);
252 int ret = 0; /* 0 - busy */
253
254 if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) {
255 unsigned int i;
256
257 for (i = 0; i < clusterpages; ++i) {
258 if (pcl->compressed_pages[i] == page) {
259 WRITE_ONCE(pcl->compressed_pages[i], NULL);
260 ret = 1;
261 break;
262 }
263 }
264 erofs_workgroup_unfreeze(&pcl->obj, 1);
265
266 if (ret) {
267 ClearPagePrivate(page);
268 put_page(page);
269 }
270 }
271 return ret;
272}
273
274/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */
275static inline bool try_inplace_io(struct z_erofs_collector *clt,
276 struct page *page)
277{
278 struct z_erofs_pcluster *const pcl = clt->pcl;
279 const unsigned int clusterpages = BIT(pcl->clusterbits);
280
281 while (clt->compressedpages < pcl->compressed_pages + clusterpages) {
282 if (!cmpxchg(clt->compressedpages++, NULL, page))
283 return true;
284 }
285 return false;
286}
287
288 /* callers must hold the collection lock */
289static int z_erofs_attach_page(struct z_erofs_collector *clt,
290 struct page *page,
291 enum z_erofs_page_type type)
292{
293 int ret;
294 bool occupied;
295
296	/* give priority to in-place I/O */
297 if (clt->mode >= COLLECT_PRIMARY &&
298 type == Z_EROFS_PAGE_TYPE_EXCLUSIVE &&
299 try_inplace_io(clt, page))
300 return 0;
301
302 ret = z_erofs_pagevec_enqueue(&clt->vector,
303 page, type, &occupied);
304 clt->cl->vcnt += (unsigned int)ret;
305
306 return ret ? 0 : -EAGAIN;
307}
308
309static enum z_erofs_collectmode
310try_to_claim_pcluster(struct z_erofs_pcluster *pcl,
311 z_erofs_next_pcluster_t *owned_head)
312{
313	/* let's claim the following types of pclusters */
314retry:
315 if (pcl->next == Z_EROFS_PCLUSTER_NIL) {
316 /* type 1, nil pcluster */
317 if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL,
318 *owned_head) != Z_EROFS_PCLUSTER_NIL)
319 goto retry;
320
321 *owned_head = &pcl->next;
322 /* lucky, I am the followee :) */
323 return COLLECT_PRIMARY_FOLLOWED;
324 } else if (pcl->next == Z_EROFS_PCLUSTER_TAIL) {
325 /*
326		 * type 2, link to the end of an existing open chain,
327 * be careful that its submission itself is governed
328 * by the original owned chain.
329 */
330 if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
331 *owned_head) != Z_EROFS_PCLUSTER_TAIL)
332 goto retry;
333 *owned_head = Z_EROFS_PCLUSTER_TAIL;
334 return COLLECT_PRIMARY_HOOKED;
335 }
336 return COLLECT_PRIMARY; /* :( better luck next time */
337}
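/*
 * Editor's note: a minimal userspace analogue (not part of this file) of the
 * lockless claiming above, using C11 atomics in place of the kernel's
 * cmpxchg(); the demo_* names are made up and only the compare-and-swap
 * retry shape is the point.
 */
#include <stdatomic.h>

struct demo_pcluster {
	_Atomic(void *) next;
};

/* link @new_head in only if @pcl->next still holds @expected */
static int demo_try_claim(struct demo_pcluster *pcl,
			  void *expected, void *new_head)
{
	void *old = expected;

	/* nonzero only if nobody raced with us; callers retry otherwise */
	return atomic_compare_exchange_strong(&pcl->next, &old, new_head);
}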
338
339static struct z_erofs_collection *cllookup(struct z_erofs_collector *clt,
340 struct inode *inode,
341 struct erofs_map_blocks *map)
342{
343 struct erofs_workgroup *grp;
344 struct z_erofs_pcluster *pcl;
345 struct z_erofs_collection *cl;
346 unsigned int length;
347 bool tag;
348
349 grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT, &tag);
350 if (!grp)
351 return NULL;
352
353 pcl = container_of(grp, struct z_erofs_pcluster, obj);
354 if (clt->owned_head == &pcl->next || pcl == clt->tailpcl) {
355 DBG_BUGON(1);
356 erofs_workgroup_put(grp);
357 return ERR_PTR(-EFSCORRUPTED);
358 }
359
360 cl = z_erofs_primarycollection(pcl);
361 if (unlikely(cl->pageofs != (map->m_la & ~PAGE_MASK))) {
362 DBG_BUGON(1);
363 erofs_workgroup_put(grp);
364 return ERR_PTR(-EFSCORRUPTED);
365 }
366
367 length = READ_ONCE(pcl->length);
368 if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) {
369 if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) {
370 DBG_BUGON(1);
371 erofs_workgroup_put(grp);
372 return ERR_PTR(-EFSCORRUPTED);
373 }
374 } else {
375 unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT;
376
377 if (map->m_flags & EROFS_MAP_FULL_MAPPED)
378 llen |= Z_EROFS_PCLUSTER_FULL_LENGTH;
379
380 while (llen > length &&
381 length != cmpxchg_relaxed(&pcl->length, length, llen)) {
382 cpu_relax();
383 length = READ_ONCE(pcl->length);
384 }
385 }
386 mutex_lock(&cl->lock);
387 /* used to check tail merging loop due to corrupted images */
388 if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
389 clt->tailpcl = pcl;
390 clt->mode = try_to_claim_pcluster(pcl, &clt->owned_head);
391 /* clean tailpcl if the current owned_head is Z_EROFS_PCLUSTER_TAIL */
392 if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
393 clt->tailpcl = NULL;
394 clt->pcl = pcl;
395 clt->cl = cl;
396 return cl;
397}
398
399static struct z_erofs_collection *clregister(struct z_erofs_collector *clt,
400 struct inode *inode,
401 struct erofs_map_blocks *map)
402{
403 struct z_erofs_pcluster *pcl;
404 struct z_erofs_collection *cl;
405 int err;
406
407 /* no available workgroup, let's allocate one */
408 pcl = kmem_cache_alloc(pcluster_cachep, GFP_NOFS);
409 if (unlikely(!pcl))
410 return ERR_PTR(-ENOMEM);
411
412 init_always(pcl);
413 pcl->obj.index = map->m_pa >> PAGE_SHIFT;
414
415 pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) |
416 (map->m_flags & EROFS_MAP_FULL_MAPPED ?
417 Z_EROFS_PCLUSTER_FULL_LENGTH : 0);
418
419 if (map->m_flags & EROFS_MAP_ZIPPED)
420 pcl->algorithmformat = Z_EROFS_COMPRESSION_LZ4;
421 else
422 pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED;
423
424 pcl->clusterbits = EROFS_V(inode)->z_physical_clusterbits[0];
425 pcl->clusterbits -= PAGE_SHIFT;
426
427 /* new pclusters should be claimed as type 1, primary and followed */
428 pcl->next = clt->owned_head;
429 clt->mode = COLLECT_PRIMARY_FOLLOWED;
430
431 cl = z_erofs_primarycollection(pcl);
432 cl->pageofs = map->m_la & ~PAGE_MASK;
433
434 /*
435	 * lock all primary followed works before they become visible to others
436 * and mutex_trylock *never* fails for a new pcluster.
437 */
438 mutex_trylock(&cl->lock);
439
440 err = erofs_register_workgroup(inode->i_sb, &pcl->obj, 0);
441 if (err) {
442 mutex_unlock(&cl->lock);
443 kmem_cache_free(pcluster_cachep, pcl);
444 return ERR_PTR(-EAGAIN);
445 }
446 /* used to check tail merging loop due to corrupted images */
447 if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL)
448 clt->tailpcl = pcl;
449 clt->owned_head = &pcl->next;
450 clt->pcl = pcl;
451 clt->cl = cl;
452 return cl;
453}
454
455static int z_erofs_collector_begin(struct z_erofs_collector *clt,
456 struct inode *inode,
457 struct erofs_map_blocks *map)
458{
459 struct z_erofs_collection *cl;
460
461 DBG_BUGON(clt->cl);
462
463 /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous collection */
464 DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_NIL);
465 DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
466
467 if (!PAGE_ALIGNED(map->m_pa)) {
468 DBG_BUGON(1);
469 return -EINVAL;
470 }
471
472repeat:
473 cl = cllookup(clt, inode, map);
474 if (!cl) {
475 cl = clregister(clt, inode, map);
476
477 if (unlikely(cl == ERR_PTR(-EAGAIN)))
478 goto repeat;
479 }
480
481 if (IS_ERR(cl))
482 return PTR_ERR(cl);
483
484 z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS,
485 cl->pagevec, cl->vcnt);
486
487 clt->compressedpages = clt->pcl->compressed_pages;
488 if (clt->mode <= COLLECT_PRIMARY) /* cannot do in-place I/O */
489 clt->compressedpages += Z_EROFS_CLUSTER_MAX_PAGES;
490 return 0;
491}
492
493/*
494 * keep in mind that referenced pclusters will be freed
495 * only after an RCU grace period.
496 */
497static void z_erofs_rcu_callback(struct rcu_head *head)
498{
499 struct z_erofs_collection *const cl =
500 container_of(head, struct z_erofs_collection, rcu);
501
502 kmem_cache_free(pcluster_cachep,
503 container_of(cl, struct z_erofs_pcluster,
504 primary_collection));
505}
506
507void erofs_workgroup_free_rcu(struct erofs_workgroup *grp)
508{
509 struct z_erofs_pcluster *const pcl =
510 container_of(grp, struct z_erofs_pcluster, obj);
511 struct z_erofs_collection *const cl = z_erofs_primarycollection(pcl);
512
513 call_rcu(&cl->rcu, z_erofs_rcu_callback);
514}
515
516static void z_erofs_collection_put(struct z_erofs_collection *cl)
517{
518 struct z_erofs_pcluster *const pcl =
519 container_of(cl, struct z_erofs_pcluster, primary_collection);
520
521 erofs_workgroup_put(&pcl->obj);
522}
523
524static bool z_erofs_collector_end(struct z_erofs_collector *clt)
525{
526 struct z_erofs_collection *cl = clt->cl;
527
528 if (!cl)
529 return false;
530
531 z_erofs_pagevec_ctor_exit(&clt->vector, false);
532 mutex_unlock(&cl->lock);
533
534 /*
535	 * if all pending pages have been added, don't hold the collection's
536	 * reference any longer if the pcluster isn't hosted by ourselves.
537 */
538 if (clt->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE)
539 z_erofs_collection_put(cl);
540
541 clt->cl = NULL;
542 return true;
543}
544
545static inline struct page *__stagingpage_alloc(struct list_head *pagepool,
546 gfp_t gfp)
547{
548 struct page *page = erofs_allocpage(pagepool, gfp, true);
549
550 page->mapping = Z_EROFS_MAPPING_STAGING;
551 return page;
552}
553
554static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe,
555 unsigned int cachestrategy,
556 erofs_off_t la)
557{
558 if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED)
559 return false;
560
561 if (fe->backmost)
562 return true;
563
564 return cachestrategy >= EROFS_ZIP_CACHE_READAROUND &&
565 la < fe->headoffset;
566}
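/*
 * Editor's note: a worked example of the policy above -- with
 * EROFS_ZIP_CACHE_READAROUND and a read starting at headoffset == 1MiB,
 * a pcluster at la == 512KiB gets cached pages (DELAYEDALLOC) while one at
 * la == 2MiB does not, unless fe->backmost is still true.
 */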
567
568static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe,
569 struct page *page,
570 struct list_head *pagepool)
571{
572 struct inode *const inode = fe->inode;
573 struct erofs_sb_info *const sbi __maybe_unused = EROFS_I_SB(inode);
574 struct erofs_map_blocks *const map = &fe->map;
575 struct z_erofs_collector *const clt = &fe->clt;
576 const loff_t offset = page_offset(page);
577 bool tight = (clt->mode >= COLLECT_PRIMARY_HOOKED);
578
579 enum z_erofs_cache_alloctype cache_strategy;
580 enum z_erofs_page_type page_type;
581 unsigned int cur, end, spiltted, index;
582 int err = 0;
583
584 /* register locked file pages as online pages in pack */
585 z_erofs_onlinepage_init(page);
586
587 spiltted = 0;
588 end = PAGE_SIZE;
589repeat:
590 cur = end - 1;
591
592 /* lucky, within the range of the current map_blocks */
593 if (offset + cur >= map->m_la &&
594 offset + cur < map->m_la + map->m_llen) {
595 /* didn't get a valid collection previously (very rare) */
596 if (!clt->cl)
597 goto restart_now;
598 goto hitted;
599 }
600
601	/* go ahead to the next map_blocks */
602 debugln("%s: [out-of-range] pos %llu", __func__, offset + cur);
603
604 if (z_erofs_collector_end(clt))
605 fe->backmost = false;
606
607 map->m_la = offset + cur;
608 map->m_llen = 0;
609 err = z_erofs_map_blocks_iter(inode, map, 0);
610 if (unlikely(err))
611 goto err_out;
612
613restart_now:
614 if (unlikely(!(map->m_flags & EROFS_MAP_MAPPED)))
615 goto hitted;
616
617 err = z_erofs_collector_begin(clt, inode, map);
618 if (unlikely(err))
619 goto err_out;
620
621 /* preload all compressed pages (maybe downgrade role if necessary) */
622 if (should_alloc_managed_pages(fe, sbi->cache_strategy, map->m_la))
623 cache_strategy = DELAYEDALLOC;
624 else
625 cache_strategy = DONTALLOC;
626
627 preload_compressed_pages(clt, MNGD_MAPPING(sbi),
628 cache_strategy, pagepool);
629
630 tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED);
631hitted:
632 cur = end - min_t(unsigned int, offset + end - map->m_la, end);
633 if (unlikely(!(map->m_flags & EROFS_MAP_MAPPED))) {
634 zero_user_segment(page, cur, end);
635 goto next_part;
636 }
637
638 /* let's derive page type */
639 page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD :
640 (!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
641 (tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE :
642 Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED));
643
644 if (cur)
645 tight &= (clt->mode >= COLLECT_PRIMARY_FOLLOWED);
646
647retry:
648 err = z_erofs_attach_page(clt, page, page_type);
649 /* should allocate an additional staging page for pagevec */
650 if (err == -EAGAIN) {
651 struct page *const newpage =
652 __stagingpage_alloc(pagepool, GFP_NOFS);
653
654 err = z_erofs_attach_page(clt, newpage,
655 Z_EROFS_PAGE_TYPE_EXCLUSIVE);
656 if (likely(!err))
657 goto retry;
658 }
659
660 if (unlikely(err))
661 goto err_out;
662
663 index = page->index - (map->m_la >> PAGE_SHIFT);
664
665 z_erofs_onlinepage_fixup(page, index, true);
666
667 /* bump up the number of spiltted parts of a page */
668 ++spiltted;
669 /* also update nr_pages */
670 clt->cl->nr_pages = max_t(pgoff_t, clt->cl->nr_pages, index + 1);
671next_part:
672 /* can be used for verification */
673 map->m_llen = offset + cur - map->m_la;
674
675 end = cur;
676 if (end > 0)
677 goto repeat;
678
679out:
680 z_erofs_onlinepage_endio(page);
681
682 debugln("%s, finish page: %pK spiltted: %u map->m_llen %llu",
683 __func__, page, spiltted, map->m_llen);
684 return err;
685
686 /* if some error occurred while processing this page */
687err_out:
688 SetPageError(page);
689 goto out;
690}
691
692static void z_erofs_vle_unzip_kickoff(void *ptr, int bios)
693{
694 tagptr1_t t = tagptr_init(tagptr1_t, ptr);
695 struct z_erofs_unzip_io *io = tagptr_unfold_ptr(t);
696 bool background = tagptr_unfold_tags(t);
697
698 if (!background) {
699 unsigned long flags;
700
701 spin_lock_irqsave(&io->u.wait.lock, flags);
702 if (!atomic_add_return(bios, &io->pending_bios))
703 wake_up_locked(&io->u.wait);
704 spin_unlock_irqrestore(&io->u.wait.lock, flags);
705 return;
706 }
707
708 if (!atomic_add_return(bios, &io->pending_bios))
709 queue_work(z_erofs_workqueue, &io->u.work);
710}
711
712static inline void z_erofs_vle_read_endio(struct bio *bio)
713{
714 struct erofs_sb_info *sbi = NULL;
715 blk_status_t err = bio->bi_status;
716 struct bio_vec *bvec;
717 struct bvec_iter_all iter_all;
718
719 bio_for_each_segment_all(bvec, bio, iter_all) {
720 struct page *page = bvec->bv_page;
721 bool cachemngd = false;
722
723 DBG_BUGON(PageUptodate(page));
724 DBG_BUGON(!page->mapping);
725
726 if (unlikely(!sbi && !z_erofs_page_is_staging(page))) {
727 sbi = EROFS_SB(page->mapping->host->i_sb);
728
729 if (time_to_inject(sbi, FAULT_READ_IO)) {
730 erofs_show_injection_info(FAULT_READ_IO);
731 err = BLK_STS_IOERR;
732 }
733 }
734
735		/* sbi should already have been set if the page is managed */
736 if (sbi)
737 cachemngd = erofs_page_is_managed(sbi, page);
738
739 if (unlikely(err))
740 SetPageError(page);
741 else if (cachemngd)
742 SetPageUptodate(page);
743
744 if (cachemngd)
745 unlock_page(page);
746 }
747
748 z_erofs_vle_unzip_kickoff(bio->bi_private, -1);
749 bio_put(bio);
750}
751
752static int z_erofs_decompress_pcluster(struct super_block *sb,
753 struct z_erofs_pcluster *pcl,
754 struct list_head *pagepool)
755{
756 struct erofs_sb_info *const sbi = EROFS_SB(sb);
757 const unsigned int clusterpages = BIT(pcl->clusterbits);
758 struct z_erofs_pagevec_ctor ctor;
759 unsigned int i, outputsize, llen, nr_pages;
760 struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES];
761 struct page **pages, **compressed_pages, *page;
762
763 enum z_erofs_page_type page_type;
764 bool overlapped, partial;
765 struct z_erofs_collection *cl;
766 int err;
767
768 might_sleep();
769 cl = z_erofs_primarycollection(pcl);
770 DBG_BUGON(!READ_ONCE(cl->nr_pages));
771
772 mutex_lock(&cl->lock);
773 nr_pages = cl->nr_pages;
774
775 if (likely(nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES)) {
776 pages = pages_onstack;
777 } else if (nr_pages <= Z_EROFS_VMAP_GLOBAL_PAGES &&
778 mutex_trylock(&z_pagemap_global_lock)) {
779 pages = z_pagemap_global;
780 } else {
781 gfp_t gfp_flags = GFP_KERNEL;
782
783 if (nr_pages > Z_EROFS_VMAP_GLOBAL_PAGES)
784 gfp_flags |= __GFP_NOFAIL;
785
786 pages = kvmalloc_array(nr_pages, sizeof(struct page *),
787 gfp_flags);
788
789 /* fallback to global pagemap for the lowmem scenario */
790 if (unlikely(!pages)) {
791 mutex_lock(&z_pagemap_global_lock);
792 pages = z_pagemap_global;
793 }
794 }
795
796 for (i = 0; i < nr_pages; ++i)
797 pages[i] = NULL;
798
799 err = 0;
800 z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS,
801 cl->pagevec, 0);
802
803 for (i = 0; i < cl->vcnt; ++i) {
804 unsigned int pagenr;
805
806 page = z_erofs_pagevec_dequeue(&ctor, &page_type);
807
808 /* all pages in pagevec ought to be valid */
809 DBG_BUGON(!page);
810 DBG_BUGON(!page->mapping);
811
812 if (z_erofs_put_stagingpage(pagepool, page))
813 continue;
814
815 if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD)
816 pagenr = 0;
817 else
818 pagenr = z_erofs_onlinepage_index(page);
819
820 DBG_BUGON(pagenr >= nr_pages);
821
822 /*
823		 * currently EROFS doesn't support multiref (dedup),
824		 * so error out here if a multiref page is found.
825 */
826 if (unlikely(pages[pagenr])) {
827 DBG_BUGON(1);
828 SetPageError(pages[pagenr]);
829 z_erofs_onlinepage_endio(pages[pagenr]);
830 err = -EFSCORRUPTED;
831 }
832 pages[pagenr] = page;
833 }
834 z_erofs_pagevec_ctor_exit(&ctor, true);
835
836 overlapped = false;
837 compressed_pages = pcl->compressed_pages;
838
839 for (i = 0; i < clusterpages; ++i) {
840 unsigned int pagenr;
841
842 page = compressed_pages[i];
843
844 /* all compressed pages ought to be valid */
845 DBG_BUGON(!page);
846 DBG_BUGON(!page->mapping);
847
848 if (!z_erofs_page_is_staging(page)) {
849 if (erofs_page_is_managed(sbi, page)) {
850 if (unlikely(!PageUptodate(page)))
851 err = -EIO;
852 continue;
853 }
854
855 /*
856			 * only non-head pages can be selected
857			 * for in-place decompression
858 */
859 pagenr = z_erofs_onlinepage_index(page);
860
861 DBG_BUGON(pagenr >= nr_pages);
862 if (unlikely(pages[pagenr])) {
863 DBG_BUGON(1);
864 SetPageError(pages[pagenr]);
865 z_erofs_onlinepage_endio(pages[pagenr]);
866 err = -EFSCORRUPTED;
867 }
868 pages[pagenr] = page;
869
870 overlapped = true;
871 }
872
873		/* PG_error needs checking for in-place I/O and staging pages */
874 if (unlikely(PageError(page))) {
875 DBG_BUGON(PageUptodate(page));
876 err = -EIO;
877 }
878 }
879
880 if (unlikely(err))
881 goto out;
882
883 llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT;
884 if (nr_pages << PAGE_SHIFT >= cl->pageofs + llen) {
885 outputsize = llen;
886 partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH);
887 } else {
888 outputsize = (nr_pages << PAGE_SHIFT) - cl->pageofs;
889 partial = true;
890 }
891
892 err = z_erofs_decompress(&(struct z_erofs_decompress_req) {
893 .sb = sb,
894 .in = compressed_pages,
895 .out = pages,
896 .pageofs_out = cl->pageofs,
897 .inputsize = PAGE_SIZE,
898 .outputsize = outputsize,
899 .alg = pcl->algorithmformat,
900 .inplace_io = overlapped,
901 .partial_decoding = partial
902 }, pagepool);
903
904out:
905	/* must handle all compressed pages before ending the file pages */
906 for (i = 0; i < clusterpages; ++i) {
907 page = compressed_pages[i];
908
909 if (erofs_page_is_managed(sbi, page))
910 continue;
911
912 /* recycle all individual staging pages */
913 (void)z_erofs_put_stagingpage(pagepool, page);
914
915 WRITE_ONCE(compressed_pages[i], NULL);
916 }
917
918 for (i = 0; i < nr_pages; ++i) {
919 page = pages[i];
920 if (!page)
921 continue;
922
923 DBG_BUGON(!page->mapping);
924
925 /* recycle all individual staging pages */
926 if (z_erofs_put_stagingpage(pagepool, page))
927 continue;
928
929 if (unlikely(err < 0))
930 SetPageError(page);
931
932 z_erofs_onlinepage_endio(page);
933 }
934
935 if (pages == z_pagemap_global)
936 mutex_unlock(&z_pagemap_global_lock);
937 else if (unlikely(pages != pages_onstack))
938 kvfree(pages);
939
940 cl->nr_pages = 0;
941 cl->vcnt = 0;
942
943 /* all cl locks MUST be taken before the following line */
944 WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL);
945
946 /* all cl locks SHOULD be released right now */
947 mutex_unlock(&cl->lock);
948
949 z_erofs_collection_put(cl);
950 return err;
951}
952
953static void z_erofs_vle_unzip_all(struct super_block *sb,
954 struct z_erofs_unzip_io *io,
955 struct list_head *pagepool)
956{
957 z_erofs_next_pcluster_t owned = io->head;
958
959 while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) {
960 struct z_erofs_pcluster *pcl;
961
962		/* it's impossible that 'owned' equals Z_EROFS_PCLUSTER_TAIL */
963 DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL);
964
965		/* it's impossible that 'owned' equals NULL */
966 DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL);
967
968 pcl = container_of(owned, struct z_erofs_pcluster, next);
969 owned = READ_ONCE(pcl->next);
970
971 z_erofs_decompress_pcluster(sb, pcl, pagepool);
972 }
973}
974
975static void z_erofs_vle_unzip_wq(struct work_struct *work)
976{
977 struct z_erofs_unzip_io_sb *iosb =
978 container_of(work, struct z_erofs_unzip_io_sb, io.u.work);
979 LIST_HEAD(pagepool);
980
981 DBG_BUGON(iosb->io.head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
982 z_erofs_vle_unzip_all(iosb->sb, &iosb->io, &pagepool);
983
984 put_pages_list(&pagepool);
985 kvfree(iosb);
986}
987
988static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl,
989 unsigned int nr,
990 struct list_head *pagepool,
991 struct address_space *mc,
992 gfp_t gfp)
993{
994 /* determined at compile time to avoid too many #ifdefs */
995 const bool nocache = __builtin_constant_p(mc) ? !mc : false;
996 const pgoff_t index = pcl->obj.index;
997 bool tocache = false;
998
999 struct address_space *mapping;
1000 struct page *oldpage, *page;
1001
1002 compressed_page_t t;
1003 int justfound;
1004
1005repeat:
1006 page = READ_ONCE(pcl->compressed_pages[nr]);
1007 oldpage = page;
1008
1009 if (!page)
1010 goto out_allocpage;
1011
1012 /*
1013 * the cached page has not been allocated and
1014	 * a placeholder is out there, prepare it now.
1015 */
1016 if (!nocache && page == PAGE_UNALLOCATED) {
1017 tocache = true;
1018 goto out_allocpage;
1019 }
1020
1021 /* process the target tagged pointer */
1022 t = tagptr_init(compressed_page_t, page);
1023 justfound = tagptr_unfold_tags(t);
1024 page = tagptr_unfold_ptr(t);
1025
1026 mapping = READ_ONCE(page->mapping);
1027
1028 /*
1029	 * if managed cache is disabled, there is no way to
1030 * get such a cached-like page.
1031 */
1032 if (nocache) {
1033		/* if managed cache is disabled, `justfound' is impossible */
1034 DBG_BUGON(justfound);
1035
1036 /* and it should be locked, not uptodate, and not truncated */
1037 DBG_BUGON(!PageLocked(page));
1038 DBG_BUGON(PageUptodate(page));
1039 DBG_BUGON(!mapping);
1040 goto out;
1041 }
1042
1043 /*
1044 * unmanaged (file) pages are all locked solidly,
1045 * therefore it is impossible for `mapping' to be NULL.
1046 */
1047 if (mapping && mapping != mc)
1048 /* ought to be unmanaged pages */
1049 goto out;
1050
1051 lock_page(page);
1052
1053 /* only true if page reclaim goes wrong, should never happen */
1054 DBG_BUGON(justfound && PagePrivate(page));
1055
1056	/* the page is still in the managed cache */
1057 if (page->mapping == mc) {
1058 WRITE_ONCE(pcl->compressed_pages[nr], page);
1059
1060 ClearPageError(page);
1061 if (!PagePrivate(page)) {
1062 /*
1063 * impossible to be !PagePrivate(page) for
1064 * the current restriction as well if
1065 * the page is already in compressed_pages[].
1066 */
1067 DBG_BUGON(!justfound);
1068
1069 justfound = 0;
1070 set_page_private(page, (unsigned long)pcl);
1071 SetPagePrivate(page);
1072 }
1073
1074 /* no need to submit io if it is already up-to-date */
1075 if (PageUptodate(page)) {
1076 unlock_page(page);
1077 page = NULL;
1078 }
1079 goto out;
1080 }
1081
1082 /*
1083	 * the managed page has been truncated; it's unsafe to
1084	 * reuse it, so let's allocate a new cache-managed page.
1085 */
1086 DBG_BUGON(page->mapping);
1087 DBG_BUGON(!justfound);
1088
1089 tocache = true;
1090 unlock_page(page);
1091 put_page(page);
1092out_allocpage:
1093 page = __stagingpage_alloc(pagepool, gfp);
1094 if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) {
1095 list_add(&page->lru, pagepool);
1096 cpu_relax();
1097 goto repeat;
1098 }
1099 if (nocache || !tocache)
1100 goto out;
1101 if (add_to_page_cache_lru(page, mc, index + nr, gfp)) {
1102 page->mapping = Z_EROFS_MAPPING_STAGING;
1103 goto out;
1104 }
1105
1106 set_page_private(page, (unsigned long)pcl);
1107 SetPagePrivate(page);
1108out: /* the only exit (for tracing and debugging) */
1109 return page;
1110}
1111
1112static struct z_erofs_unzip_io *jobqueue_init(struct super_block *sb,
1113 struct z_erofs_unzip_io *io,
1114 bool foreground)
1115{
1116 struct z_erofs_unzip_io_sb *iosb;
1117
1118 if (foreground) {
1119 /* waitqueue available for foreground io */
1120 DBG_BUGON(!io);
1121
1122 init_waitqueue_head(&io->u.wait);
1123 atomic_set(&io->pending_bios, 0);
1124 goto out;
1125 }
1126
1127 iosb = kvzalloc(sizeof(*iosb), GFP_KERNEL | __GFP_NOFAIL);
1128 DBG_BUGON(!iosb);
1129
1130 /* initialize fields in the allocated descriptor */
1131 io = &iosb->io;
1132 iosb->sb = sb;
1133 INIT_WORK(&io->u.work, z_erofs_vle_unzip_wq);
1134out:
1135 io->head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
1136 return io;
1137}
1138
1139/* define decompression jobqueue types */
1140enum {
1141 JQ_BYPASS,
1142 JQ_SUBMIT,
1143 NR_JOBQUEUES,
1144};
1145
1146static void *jobqueueset_init(struct super_block *sb,
1147 z_erofs_next_pcluster_t qtail[],
1148 struct z_erofs_unzip_io *q[],
1149 struct z_erofs_unzip_io *fgq,
1150 bool forcefg)
1151{
1152 /*
1153	 * if managed cache is enabled, a bypass jobqueue is needed;
1154	 * there is no need to read from the device for pclusters in this queue.
1155 */
1156 q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, true);
1157 qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head;
1158
1159 q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, forcefg);
1160 qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head;
1161
1162 return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], !forcefg));
1163}
1164
1165static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl,
1166 z_erofs_next_pcluster_t qtail[],
1167 z_erofs_next_pcluster_t owned_head)
1168{
1169 z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT];
1170 z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS];
1171
1172 DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
1173 if (owned_head == Z_EROFS_PCLUSTER_TAIL)
1174 owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED;
1175
1176 WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED);
1177
1178 WRITE_ONCE(*submit_qtail, owned_head);
1179 WRITE_ONCE(*bypass_qtail, &pcl->next);
1180
1181 qtail[JQ_BYPASS] = &pcl->next;
1182}
1183
1184static bool postsubmit_is_all_bypassed(struct z_erofs_unzip_io *q[],
1185 unsigned int nr_bios,
1186 bool force_fg)
1187{
1188 /*
1189	 * although background is preferred, nothing is pending for submission;
1190	 * don't issue a workqueue for decompression, just drop it directly instead.
1191 */
1192 if (force_fg || nr_bios)
1193 return false;
1194
1195 kvfree(container_of(q[JQ_SUBMIT], struct z_erofs_unzip_io_sb, io));
1196 return true;
1197}
1198
1199static bool z_erofs_vle_submit_all(struct super_block *sb,
1200 z_erofs_next_pcluster_t owned_head,
1201 struct list_head *pagepool,
1202 struct z_erofs_unzip_io *fgq,
1203 bool force_fg)
1204{
1205 struct erofs_sb_info *const sbi __maybe_unused = EROFS_SB(sb);
1206 z_erofs_next_pcluster_t qtail[NR_JOBQUEUES];
1207 struct z_erofs_unzip_io *q[NR_JOBQUEUES];
1208 struct bio *bio;
1209 void *bi_private;
1210 /* since bio will be NULL, no need to initialize last_index */
1211 pgoff_t uninitialized_var(last_index);
1212 bool force_submit = false;
1213 unsigned int nr_bios;
1214
1215 if (unlikely(owned_head == Z_EROFS_PCLUSTER_TAIL))
1216 return false;
1217
1218 force_submit = false;
1219 bio = NULL;
1220 nr_bios = 0;
1221 bi_private = jobqueueset_init(sb, qtail, q, fgq, force_fg);
1222
1223 /* by default, all need io submission */
1224 q[JQ_SUBMIT]->head = owned_head;
1225
1226 do {
1227 struct z_erofs_pcluster *pcl;
1228 unsigned int clusterpages;
1229 pgoff_t first_index;
1230 struct page *page;
1231 unsigned int i = 0, bypass = 0;
1232 int err;
1233
1234		/* it's impossible that 'owned_head' equals the following */
1235 DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED);
1236 DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_NIL);
1237
1238 pcl = container_of(owned_head, struct z_erofs_pcluster, next);
1239
1240 clusterpages = BIT(pcl->clusterbits);
1241
1242 /* close the main owned chain at first */
1243 owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL,
1244 Z_EROFS_PCLUSTER_TAIL_CLOSED);
1245
1246 first_index = pcl->obj.index;
1247 force_submit |= (first_index != last_index + 1);
1248
1249repeat:
1250 page = pickup_page_for_submission(pcl, i, pagepool,
1251 MNGD_MAPPING(sbi),
1252 GFP_NOFS);
1253 if (!page) {
1254 force_submit = true;
1255 ++bypass;
1256 goto skippage;
1257 }
1258
1259 if (bio && force_submit) {
1260submit_bio_retry:
1261 __submit_bio(bio, REQ_OP_READ, 0);
1262 bio = NULL;
1263 }
1264
1265 if (!bio) {
1266 bio = erofs_grab_bio(sb, first_index + i,
1267 BIO_MAX_PAGES, bi_private,
1268 z_erofs_vle_read_endio, true);
1269 ++nr_bios;
1270 }
1271
1272 err = bio_add_page(bio, page, PAGE_SIZE, 0);
1273 if (err < PAGE_SIZE)
1274 goto submit_bio_retry;
1275
1276 force_submit = false;
1277 last_index = first_index + i;
1278skippage:
1279 if (++i < clusterpages)
1280 goto repeat;
1281
1282 if (bypass < clusterpages)
1283 qtail[JQ_SUBMIT] = &pcl->next;
1284 else
1285 move_to_bypass_jobqueue(pcl, qtail, owned_head);
1286 } while (owned_head != Z_EROFS_PCLUSTER_TAIL);
1287
1288 if (bio)
1289 __submit_bio(bio, REQ_OP_READ, 0);
1290
1291 if (postsubmit_is_all_bypassed(q, nr_bios, force_fg))
1292 return true;
1293
1294 z_erofs_vle_unzip_kickoff(bi_private, nr_bios);
1295 return true;
1296}
1297
1298static void z_erofs_submit_and_unzip(struct super_block *sb,
1299 struct z_erofs_collector *clt,
1300 struct list_head *pagepool,
1301 bool force_fg)
1302{
1303 struct z_erofs_unzip_io io[NR_JOBQUEUES];
1304
1305 if (!z_erofs_vle_submit_all(sb, clt->owned_head,
1306 pagepool, io, force_fg))
1307 return;
1308
1309	/* decompress the no-I/O (bypassed) pclusters immediately */
1310 z_erofs_vle_unzip_all(sb, &io[JQ_BYPASS], pagepool);
1311
1312 if (!force_fg)
1313 return;
1314
1315 /* wait until all bios are completed */
1316 wait_event(io[JQ_SUBMIT].u.wait,
1317 !atomic_read(&io[JQ_SUBMIT].pending_bios));
1318
1319	/* let's do synchronous decompression */
1320 z_erofs_vle_unzip_all(sb, &io[JQ_SUBMIT], pagepool);
1321}
1322
1323static int z_erofs_vle_normalaccess_readpage(struct file *file,
1324 struct page *page)
1325{
1326 struct inode *const inode = page->mapping->host;
1327 struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
1328 int err;
1329 LIST_HEAD(pagepool);
1330
1331 trace_erofs_readpage(page, false);
1332
1333 f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT;
1334
1335 err = z_erofs_do_read_page(&f, page, &pagepool);
1336 (void)z_erofs_collector_end(&f.clt);
1337
1338	/* if some compressed clusters are ready, submit them anyway */
1339 z_erofs_submit_and_unzip(inode->i_sb, &f.clt, &pagepool, true);
1340
1341 if (err)
1342 errln("%s, failed to read, err [%d]", __func__, err);
1343
1344 if (f.map.mpage)
1345 put_page(f.map.mpage);
1346
1347 /* clean up the remaining free pages */
1348 put_pages_list(&pagepool);
1349 return err;
1350}
1351
1352static bool should_decompress_synchronously(struct erofs_sb_info *sbi,
1353 unsigned int nr)
1354{
1355 return nr <= sbi->max_sync_decompress_pages;
1356}
1357
1358static int z_erofs_vle_normalaccess_readpages(struct file *filp,
1359 struct address_space *mapping,
1360 struct list_head *pages,
1361 unsigned int nr_pages)
1362{
1363 struct inode *const inode = mapping->host;
1364 struct erofs_sb_info *const sbi = EROFS_I_SB(inode);
1365
1366 bool sync = should_decompress_synchronously(sbi, nr_pages);
1367 struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode);
1368 gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL);
1369 struct page *head = NULL;
1370 LIST_HEAD(pagepool);
1371
1372 trace_erofs_readpages(mapping->host, lru_to_page(pages),
1373 nr_pages, false);
1374
1375 f.headoffset = (erofs_off_t)lru_to_page(pages)->index << PAGE_SHIFT;
1376
1377 for (; nr_pages; --nr_pages) {
1378 struct page *page = lru_to_page(pages);
1379
1380 prefetchw(&page->flags);
1381 list_del(&page->lru);
1382
1383 /*
1384 * A pure asynchronous readahead is indicated if
1385		 * a PG_readahead-marked page is hit first.
1386 * Let's also do asynchronous decompression for this case.
1387 */
1388 sync &= !(PageReadahead(page) && !head);
1389
1390 if (add_to_page_cache_lru(page, mapping, page->index, gfp)) {
1391 list_add(&page->lru, &pagepool);
1392 continue;
1393 }
1394
1395 set_page_private(page, (unsigned long)head);
1396 head = page;
1397 }
1398
1399 while (head) {
1400 struct page *page = head;
1401 int err;
1402
1403 /* traversal in reverse order */
1404 head = (void *)page_private(page);
1405
1406 err = z_erofs_do_read_page(&f, page, &pagepool);
1407 if (err) {
1408 struct erofs_vnode *vi = EROFS_V(inode);
1409
1410 errln("%s, readahead error at page %lu of nid %llu",
1411 __func__, page->index, vi->nid);
1412 }
1413 put_page(page);
1414 }
1415
1416 (void)z_erofs_collector_end(&f.clt);
1417
1418 z_erofs_submit_and_unzip(inode->i_sb, &f.clt, &pagepool, sync);
1419
1420 if (f.map.mpage)
1421 put_page(f.map.mpage);
1422
1423 /* clean up the remaining free pages */
1424 put_pages_list(&pagepool);
1425 return 0;
1426}
1427
1428const struct address_space_operations z_erofs_vle_normalaccess_aops = {
1429 .readpage = z_erofs_vle_normalaccess_readpage,
1430 .readpages = z_erofs_vle_normalaccess_readpages,
1431};
1432
diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h
new file mode 100644
index 000000000000..4fc547bc01f9
--- /dev/null
+++ b/fs/erofs/zdata.h
@@ -0,0 +1,193 @@
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * Copyright (C) 2018 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#ifndef __EROFS_FS_ZDATA_H
8#define __EROFS_FS_ZDATA_H
9
10#include "internal.h"
11#include "zpvec.h"
12
13#define Z_EROFS_NR_INLINE_PAGEVECS 3
14
15/*
16 * Structure fields follow one of the following exclusion rules.
17 *
18 * I: Modifiable by initialization/destruction paths and read-only
19 * for everyone else;
20 *
21 * L: Field should be protected by pageset lock;
22 *
23 * A: Field should be accessed / updated in atomic for parallelized code.
24 */
25struct z_erofs_collection {
26 struct mutex lock;
27
28 /* I: page offset of start position of decompression */
29 unsigned short pageofs;
30
31 /* L: maximum relative page index in pagevec[] */
32 unsigned short nr_pages;
33
34 /* L: total number of pages in pagevec[] */
35 unsigned int vcnt;
36
37 union {
38 /* L: inline a certain number of pagevecs for bootstrap */
39 erofs_vtptr_t pagevec[Z_EROFS_NR_INLINE_PAGEVECS];
40
41 /* I: can be used to free the pcluster by RCU. */
42 struct rcu_head rcu;
43 };
44};
45
46#define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001
47#define Z_EROFS_PCLUSTER_LENGTH_BIT 1
48
49/*
50 * let's leave a type here in case of introducing
51 * another tagged pointer later.
52 */
53typedef void *z_erofs_next_pcluster_t;
54
55struct z_erofs_pcluster {
56 struct erofs_workgroup obj;
57 struct z_erofs_collection primary_collection;
58
59 /* A: point to next chained pcluster or TAILs */
60 z_erofs_next_pcluster_t next;
61
62 /* A: compressed pages (including multi-usage pages) */
63 struct page *compressed_pages[Z_EROFS_CLUSTER_MAX_PAGES];
64
65 /* A: lower limit of decompressed length and if full length or not */
66 unsigned int length;
67
68 /* I: compression algorithm format */
69 unsigned char algorithmformat;
70 /* I: bit shift of physical cluster size */
71 unsigned char clusterbits;
72};
73
74#define z_erofs_primarycollection(pcluster) (&(pcluster)->primary_collection)
75
76/* let's avoid the valid 32-bit kernel addresses */
77
78/* the chained workgroup hasn't submitted io (still open) */
79#define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE)
80/* the chained workgroup has already submitted io */
81#define Z_EROFS_PCLUSTER_TAIL_CLOSED ((void *)0x5F0EDEAD)
82
83#define Z_EROFS_PCLUSTER_NIL (NULL)
84
85#define Z_EROFS_WORKGROUP_SIZE sizeof(struct z_erofs_pcluster)
86
87struct z_erofs_unzip_io {
88 atomic_t pending_bios;
89 z_erofs_next_pcluster_t head;
90
91 union {
92 wait_queue_head_t wait;
93 struct work_struct work;
94 } u;
95};
96
97struct z_erofs_unzip_io_sb {
98 struct z_erofs_unzip_io io;
99 struct super_block *sb;
100};
101
102#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping)
103static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi,
104 struct page *page)
105{
106 return page->mapping == MNGD_MAPPING(sbi);
107}
108
109#define Z_EROFS_ONLINEPAGE_COUNT_BITS 2
110#define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
111#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS)
112
113/*
114 * waiters (aka. ongoing_packs): # of refs left before the page unlocks
115 * sub-index: 0 for a partial page, >= 1 for a full-page sub-index
116 */
117typedef atomic_t z_erofs_onlinepage_t;
118
119/* type punning */
120union z_erofs_onlinepage_converter {
121 z_erofs_onlinepage_t *o;
122 unsigned long *v;
123};
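/*
 * Editor's note: an illustrative value for the scheme above -- with the
 * 2-bit count defined above, page_private == (5 << 2) | 1 encodes sub-index
 * 5 with one remaining waiter, so a single further
 * z_erofs_onlinepage_endio() call unlocks the page.
 */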
124
125static inline unsigned int z_erofs_onlinepage_index(struct page *page)
126{
127 union z_erofs_onlinepage_converter u;
128
129 DBG_BUGON(!PagePrivate(page));
130 u.v = &page_private(page);
131
132 return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
133}
134
135static inline void z_erofs_onlinepage_init(struct page *page)
136{
137 union {
138 z_erofs_onlinepage_t o;
139 unsigned long v;
140 /* keep from being unlocked in advance */
141 } u = { .o = ATOMIC_INIT(1) };
142
143 set_page_private(page, u.v);
144 smp_wmb();
145 SetPagePrivate(page);
146}
147
148static inline void z_erofs_onlinepage_fixup(struct page *page,
149 uintptr_t index, bool down)
150{
151 unsigned long *p, o, v, id;
152repeat:
153 p = &page_private(page);
154 o = READ_ONCE(*p);
155
156 id = o >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
157 if (id) {
158 if (!index)
159 return;
160
161 DBG_BUGON(id != index);
162 }
163
164 v = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
165 ((o & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down);
166 if (cmpxchg(p, o, v) != o)
167 goto repeat;
168}
169
170static inline void z_erofs_onlinepage_endio(struct page *page)
171{
172 union z_erofs_onlinepage_converter u;
173 unsigned int v;
174
175 DBG_BUGON(!PagePrivate(page));
176 u.v = &page_private(page);
177
178 v = atomic_dec_return(u.o);
179 if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) {
180 ClearPagePrivate(page);
181 if (!PageError(page))
182 SetPageUptodate(page);
183 unlock_page(page);
184 }
185 debugln("%s, page %p value %x", __func__, page, atomic_read(u.o));
186}
187
188#define Z_EROFS_VMAP_ONSTACK_PAGES \
189 min_t(unsigned int, THREAD_SIZE / 8 / sizeof(struct page *), 96U)
190#define Z_EROFS_VMAP_GLOBAL_PAGES 2048
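/*
 * Editor's note: a worked example for the on-stack bound above -- assuming a
 * 16KiB kernel stack and 8-byte page pointers, THREAD_SIZE / 8 / 8 == 256,
 * so the min_t() caps the on-stack array at 96 pointers (768 bytes).
 */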
191
192#endif
193
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
new file mode 100644
index 000000000000..4dc9cec01297
--- /dev/null
+++ b/fs/erofs/zmap.c
@@ -0,0 +1,466 @@
1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (C) 2018-2019 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#include "internal.h"
8#include <asm/unaligned.h>
9#include <trace/events/erofs.h>
10
11int z_erofs_fill_inode(struct inode *inode)
12{
13 struct erofs_vnode *const vi = EROFS_V(inode);
14
15 if (vi->datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY) {
16 vi->z_advise = 0;
17 vi->z_algorithmtype[0] = 0;
18 vi->z_algorithmtype[1] = 0;
19 vi->z_logical_clusterbits = LOG_BLOCK_SIZE;
20 vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits;
21 vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits;
22 set_bit(EROFS_V_Z_INITED_BIT, &vi->flags);
23 }
24
25 inode->i_mapping->a_ops = &z_erofs_vle_normalaccess_aops;
26 return 0;
27}
28
29static int fill_inode_lazy(struct inode *inode)
30{
31 struct erofs_vnode *const vi = EROFS_V(inode);
32 struct super_block *const sb = inode->i_sb;
33 int err;
34 erofs_off_t pos;
35 struct page *page;
36 void *kaddr;
37 struct z_erofs_map_header *h;
38
39 if (test_bit(EROFS_V_Z_INITED_BIT, &vi->flags))
40 return 0;
41
42 if (wait_on_bit_lock(&vi->flags, EROFS_V_BL_Z_BIT, TASK_KILLABLE))
43 return -ERESTARTSYS;
44
45 err = 0;
46 if (test_bit(EROFS_V_Z_INITED_BIT, &vi->flags))
47 goto out_unlock;
48
49 DBG_BUGON(vi->datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY);
50
51 pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize +
52 vi->xattr_isize, 8);
53 page = erofs_get_meta_page(sb, erofs_blknr(pos), false);
54 if (IS_ERR(page)) {
55 err = PTR_ERR(page);
56 goto out_unlock;
57 }
58
59 kaddr = kmap_atomic(page);
60
61 h = kaddr + erofs_blkoff(pos);
62 vi->z_advise = le16_to_cpu(h->h_advise);
63 vi->z_algorithmtype[0] = h->h_algorithmtype & 15;
64 vi->z_algorithmtype[1] = h->h_algorithmtype >> 4;
65
66 if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX) {
67 errln("unknown compression format %u for nid %llu, please upgrade kernel",
68 vi->z_algorithmtype[0], vi->nid);
69 err = -EOPNOTSUPP;
70 goto unmap_done;
71 }
72
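 /*
  * h_clusterbits packs three fields: bits 0-2 give the logical cluster
  * size as a shift above the block size, while bits 3-4 and bits 5-7
  * give the two physical cluster sizes as shifts above the logical
  * cluster size.
  */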
73 vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7);
74 vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits +
75 ((h->h_clusterbits >> 3) & 3);
76
77 if (vi->z_physical_clusterbits[0] != LOG_BLOCK_SIZE) {
78 errln("unsupported physical clusterbits %u for nid %llu, please upgrade kernel",
79 vi->z_physical_clusterbits[0], vi->nid);
80 err = -EOPNOTSUPP;
81 goto unmap_done;
82 }
83
84 vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits +
85 ((h->h_clusterbits >> 5) & 7);
86 set_bit(EROFS_V_Z_INITED_BIT, &vi->flags);
87unmap_done:
88 kunmap_atomic(kaddr);
89 unlock_page(page);
90 put_page(page);
91out_unlock:
92 clear_and_wake_up_bit(EROFS_V_BL_Z_BIT, &vi->flags);
93 return err;
94}
95
96struct z_erofs_maprecorder {
97 struct inode *inode;
98 struct erofs_map_blocks *map;
99 void *kaddr;
100
101 unsigned long lcn;
102 /* compression extent information gathered for the lcluster just loaded */
103 u8 type;
104 u16 clusterofs;
105 u16 delta[2];
106 erofs_blk_t pblk;
107};
108
109static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m,
110 erofs_blk_t eblk)
111{
112 struct super_block *const sb = m->inode->i_sb;
113 struct erofs_map_blocks *const map = m->map;
114 struct page *mpage = map->mpage;
115
116 if (mpage) {
117 if (mpage->index == eblk) {
118 if (!m->kaddr)
119 m->kaddr = kmap_atomic(mpage);
120 return 0;
121 }
122
123 if (m->kaddr) {
124 kunmap_atomic(m->kaddr);
125 m->kaddr = NULL;
126 }
127 put_page(mpage);
128 }
129
130 mpage = erofs_get_meta_page(sb, eblk, false);
131 if (IS_ERR(mpage)) {
132 map->mpage = NULL;
133 return PTR_ERR(mpage);
134 }
135 m->kaddr = kmap_atomic(mpage);
136 unlock_page(mpage);
137 map->mpage = mpage;
138 return 0;
139}
140
141static int vle_legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
142 unsigned long lcn)
143{
144 struct inode *const inode = m->inode;
145 struct erofs_vnode *const vi = EROFS_V(inode);
146 const erofs_off_t ibase = iloc(EROFS_I_SB(inode), vi->nid);
147 const erofs_off_t pos =
148 Z_EROFS_VLE_LEGACY_INDEX_ALIGN(ibase + vi->inode_isize +
149 vi->xattr_isize) +
150 lcn * sizeof(struct z_erofs_vle_decompressed_index);
151 struct z_erofs_vle_decompressed_index *di;
152 unsigned int advise, type;
153 int err;
154
155 err = z_erofs_reload_indexes(m, erofs_blknr(pos));
156 if (err)
157 return err;
158
159 m->lcn = lcn;
160 di = m->kaddr + erofs_blkoff(pos);
161
162 advise = le16_to_cpu(di->di_advise);
163 type = (advise >> Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT) &
164 ((1 << Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) - 1);
165 switch (type) {
166 case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
167 m->clusterofs = 1 << vi->z_logical_clusterbits;
168 m->delta[0] = le16_to_cpu(di->di_u.delta[0]);
169 m->delta[1] = le16_to_cpu(di->di_u.delta[1]);
170 break;
171 case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
172 case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
173 m->clusterofs = le16_to_cpu(di->di_clusterofs);
174 m->pblk = le32_to_cpu(di->di_u.blkaddr);
175 break;
176 default:
177 DBG_BUGON(1);
178 return -EOPNOTSUPP;
179 }
180 m->type = type;
181 return 0;
182}
183
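/*
 * Each compacted index entry is a little-endian bitfield: "lobits" low
 * bits (clusterofs for HEAD/PLAIN lclusters, delta for NONHEAD ones)
 * followed by a 2-bit cluster type.  "pos" is the bit offset of the
 * entry within the pack, decoded with one unaligned 32-bit load.
 */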
184static unsigned int decode_compactedbits(unsigned int lobits,
185 unsigned int lomask,
186 u8 *in, unsigned int pos, u8 *type)
187{
188 const unsigned int v = get_unaligned_le32(in + pos / 8) >> (pos & 7);
189 const unsigned int lo = v & lomask;
190
191 *type = (v >> lobits) & 3;
192 return lo;
193}
194
195static int unpack_compacted_index(struct z_erofs_maprecorder *m,
196 unsigned int amortizedshift,
197 unsigned int eofs)
198{
199 struct erofs_vnode *const vi = EROFS_V(m->inode);
200 const unsigned int lclusterbits = vi->z_logical_clusterbits;
201 const unsigned int lomask = (1 << lclusterbits) - 1;
202 unsigned int vcnt, base, lo, encodebits, nblk;
203 int i;
204 u8 *in, type;
205
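 /*
  * Each pack shares one __le32 base blkaddr among vcnt entries:
  * 4-byte amortization packs 2 entries into 8 bytes (16 bits per
  * entry), while 2-byte amortization packs 16 entries into 32 bytes
  * (14 bits per entry, lclusterbits == 12 only).
  */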
206 if (1 << amortizedshift == 4)
207 vcnt = 2;
208 else if (1 << amortizedshift == 2 && lclusterbits == 12)
209 vcnt = 16;
210 else
211 return -EOPNOTSUPP;
212
213 encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt;
214 base = round_down(eofs, vcnt << amortizedshift);
215 in = m->kaddr + base;
216
217 i = (eofs - base) >> amortizedshift;
218
219 lo = decode_compactedbits(lclusterbits, lomask,
220 in, encodebits * i, &type);
221 m->type = type;
222 if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) {
223 m->clusterofs = 1 << lclusterbits;
224 if (i + 1 != vcnt) {
225 m->delta[0] = lo;
226 return 0;
227 }
228 /*
229 * the last lcluster in the pack is special: its lo field stores
230 * delta[1] rather than delta[0], so derive delta[0] indirectly
231 * from the previous lcluster instead.
232 */
233 lo = decode_compactedbits(lclusterbits, lomask,
234 in, encodebits * (i - 1), &type);
235 if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
236 lo = 0;
237 m->delta[0] = lo + 1;
238 return 0;
239 }
240 m->clusterofs = lo;
241 m->delta[0] = 0;
242 /* figure out blkaddr (pblk) for HEAD lclusters */
243 nblk = 1;
244 while (i > 0) {
245 --i;
246 lo = decode_compactedbits(lclusterbits, lomask,
247 in, encodebits * i, &type);
248 if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
249 i -= lo;
250
251 if (i >= 0)
252 ++nblk;
253 }
254 in += (vcnt << amortizedshift) - sizeof(__le32);
255 m->pblk = le32_to_cpu(*(__le32 *)in) + nblk;
256 return 0;
257}
258
259static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m,
260 unsigned long lcn)
261{
262 struct inode *const inode = m->inode;
263 struct erofs_vnode *const vi = EROFS_V(inode);
264 const unsigned int lclusterbits = vi->z_logical_clusterbits;
265 const erofs_off_t ebase = ALIGN(iloc(EROFS_I_SB(inode), vi->nid) +
266 vi->inode_isize + vi->xattr_isize, 8) +
267 sizeof(struct z_erofs_map_header);
268 const unsigned int totalidx = DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ);
269 unsigned int compacted_4b_initial, compacted_2b;
270 unsigned int amortizedshift;
271 erofs_off_t pos;
272 int err;
273
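 /*
  * On-disk compacted indexes are laid out as: an optional run of
  * 4-byte-amortized lclusters that brings the stream to 32-byte
  * alignment (e.g. ebase % 32 == 8 gives compacted_4b_initial == 6),
  * then, if the COMPACTED_2B advise bit is set, 2-byte-amortized
  * lclusters in multiples of 16, then 4-byte amortization again up to
  * the end; the code below walks these regions to turn lcn into a
  * byte position.
  */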
274 if (lclusterbits != 12)
275 return -EOPNOTSUPP;
276
277 if (lcn >= totalidx)
278 return -EINVAL;
279
280 m->lcn = lcn;
281 /* number of 4-byte-amortized lclusters needed to reach 32-byte (compacted_2b) alignment */
282 compacted_4b_initial = (32 - ebase % 32) / 4;
283 if (compacted_4b_initial == 32 / 4)
284 compacted_4b_initial = 0;
285
286 if (vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B)
287 compacted_2b = rounddown(totalidx - compacted_4b_initial, 16);
288 else
289 compacted_2b = 0;
290
291 pos = ebase;
292 if (lcn < compacted_4b_initial) {
293 amortizedshift = 2;
294 goto out;
295 }
296 pos += compacted_4b_initial * 4;
297 lcn -= compacted_4b_initial;
298
299 if (lcn < compacted_2b) {
300 amortizedshift = 1;
301 goto out;
302 }
303 pos += compacted_2b * 2;
304 lcn -= compacted_2b;
305 amortizedshift = 2;
306out:
307 pos += lcn * (1 << amortizedshift);
308 err = z_erofs_reload_indexes(m, erofs_blknr(pos));
309 if (err)
310 return err;
311 return unpack_compacted_index(m, amortizedshift, erofs_blkoff(pos));
312}
313
314static int vle_load_cluster_from_disk(struct z_erofs_maprecorder *m,
315 unsigned int lcn)
316{
317 const unsigned int datamode = EROFS_V(m->inode)->datamode;
318
319 if (datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY)
320 return vle_legacy_load_cluster_from_disk(m, lcn);
321
322 if (datamode == EROFS_INODE_FLAT_COMPRESSION)
323 return compacted_load_cluster_from_disk(m, lcn);
324
325 return -EINVAL;
326}
327
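/*
 * A NONHEAD lcluster records in delta[0] how many lclusters back its
 * HEAD (or PLAIN) lcluster lies; walk back by that distance, recursing
 * while the landing lcluster is itself NONHEAD, until map->m_la can be
 * filled in from the extent head.
 */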
328static int vle_extent_lookback(struct z_erofs_maprecorder *m,
329 unsigned int lookback_distance)
330{
331 struct erofs_vnode *const vi = EROFS_V(m->inode);
332 struct erofs_map_blocks *const map = m->map;
333 const unsigned int lclusterbits = vi->z_logical_clusterbits;
334 unsigned long lcn = m->lcn;
335 int err;
336
337 if (lcn < lookback_distance) {
338 errln("bogus lookback distance @ nid %llu", vi->nid);
339 DBG_BUGON(1);
340 return -EFSCORRUPTED;
341 }
342
343 /* load extent head logical cluster if needed */
344 lcn -= lookback_distance;
345 err = vle_load_cluster_from_disk(m, lcn);
346 if (err)
347 return err;
348
349 switch (m->type) {
350 case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
351 if (unlikely(!m->delta[0])) {
352 errln("invalid lookback distance 0 at nid %llu",
353 vi->nid);
354 DBG_BUGON(1);
355 return -EFSCORRUPTED;
356 }
357 return vle_extent_lookback(m, m->delta[0]);
358 case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
359 map->m_flags &= ~EROFS_MAP_ZIPPED;
360 /* fallthrough */
361 case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
362 map->m_la = (lcn << lclusterbits) | m->clusterofs;
363 break;
364 default:
365 errln("unknown type %u at lcn %lu of nid %llu",
366 m->type, lcn, vi->nid);
367 DBG_BUGON(1);
368 return -EOPNOTSUPP;
369 }
370 return 0;
371}
372
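/*
 * Map the logical extent containing map->m_la: m_la/m_llen describe the
 * decompressed (logical) range while m_pa/m_plen describe the on-disk
 * pcluster; EROFS_MAP_ZIPPED is cleared when the cluster is stored
 * uncompressed (PLAIN).
 */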
373int z_erofs_map_blocks_iter(struct inode *inode,
374 struct erofs_map_blocks *map,
375 int flags)
376{
377 struct erofs_vnode *const vi = EROFS_V(inode);
378 struct z_erofs_maprecorder m = {
379 .inode = inode,
380 .map = map,
381 };
382 int err = 0;
383 unsigned int lclusterbits, endoff;
384 unsigned long long ofs, end;
385
386 trace_z_erofs_map_blocks_iter_enter(inode, map, flags);
387
388 /* when trying to read beyond EOF, leave it unmapped */
389 if (unlikely(map->m_la >= inode->i_size)) {
390 map->m_llen = map->m_la + 1 - inode->i_size;
391 map->m_la = inode->i_size;
392 map->m_flags = 0;
393 goto out;
394 }
395
396 err = fill_inode_lazy(inode);
397 if (err)
398 goto out;
399
400 lclusterbits = vi->z_logical_clusterbits;
401 ofs = map->m_la;
402 m.lcn = ofs >> lclusterbits;
403 endoff = ofs & ((1 << lclusterbits) - 1);
404
405 err = vle_load_cluster_from_disk(&m, m.lcn);
406 if (err)
407 goto unmap_out;
408
409 map->m_flags = EROFS_MAP_ZIPPED; /* by default, compressed */
410 end = (m.lcn + 1ULL) << lclusterbits;
411
412 switch (m.type) {
413 case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
414 if (endoff >= m.clusterofs)
415 map->m_flags &= ~EROFS_MAP_ZIPPED;
416 /* fallthrough */
417 case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
418 if (endoff >= m.clusterofs) {
419 map->m_la = (m.lcn << lclusterbits) | m.clusterofs;
420 break;
421 }
422 /* m.lcn should be >= 1 if endoff < m.clusterofs */
423 if (unlikely(!m.lcn)) {
424 errln("invalid logical cluster 0 at nid %llu",
425 vi->nid);
426 err = -EFSCORRUPTED;
427 goto unmap_out;
428 }
429 end = (m.lcn << lclusterbits) | m.clusterofs;
430 map->m_flags |= EROFS_MAP_FULL_MAPPED;
431 m.delta[0] = 1;
432 /* fallthrough */
433 case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
434 /* get the corresponding first chunk */
435 err = vle_extent_lookback(&m, m.delta[0]);
436 if (unlikely(err))
437 goto unmap_out;
438 break;
439 default:
440 errln("unknown type %u at offset %llu of nid %llu",
441 m.type, ofs, vi->nid);
442 err = -EOPNOTSUPP;
443 goto unmap_out;
444 }
445
446 map->m_llen = end - map->m_la;
447 map->m_plen = 1 << lclusterbits;
448 map->m_pa = blknr_to_addr(m.pblk);
449 map->m_flags |= EROFS_MAP_MAPPED;
450
451unmap_out:
452 if (m.kaddr)
453 kunmap_atomic(m.kaddr);
454
455out:
456 debugln("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o",
457 __func__, map->m_la, map->m_pa,
458 map->m_llen, map->m_plen, map->m_flags);
459
460 trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err);
461
462 /* aggressively BUG_ON iff CONFIG_EROFS_FS_DEBUG is on */
463 DBG_BUGON(err < 0 && err != -ENOMEM);
464 return err;
465}
466
diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h
new file mode 100644
index 000000000000..bd3cee16491c
--- /dev/null
+++ b/fs/erofs/zpvec.h
@@ -0,0 +1,157 @@
1/* SPDX-License-Identifier: GPL-2.0-only */
2/*
3 * Copyright (C) 2018 HUAWEI, Inc.
4 * http://www.huawei.com/
5 * Created by Gao Xiang <gaoxiang25@huawei.com>
6 */
7#ifndef __EROFS_FS_ZPVEC_H
8#define __EROFS_FS_ZPVEC_H
9
10#include "tagptr.h"
11
12/* page type in pagevec for decompress subsystem */
13enum z_erofs_page_type {
14 /* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */
15 Z_EROFS_PAGE_TYPE_EXCLUSIVE,
16
17 Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED,
18
19 Z_EROFS_VLE_PAGE_TYPE_HEAD,
20 Z_EROFS_VLE_PAGE_TYPE_MAX
21};
22
23extern void __compiletime_error("Z_EROFS_PAGE_TYPE_EXCLUSIVE != 0")
24 __bad_page_type_exclusive(void);
25
26/* pagevec tagged pointer */
27typedef tagptr2_t erofs_vtptr_t;
28
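/*
 * Each pagevec slot is a tagged pointer: a page pointer plus a 2-bit
 * z_erofs_page_type.  A slot tagged EXCLUSIVE (0) may later be reused
 * as the backing page of the next pagevec, so the vector can grow
 * without extra allocations.
 */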
29/* pagevec collector */
30struct z_erofs_pagevec_ctor {
31 struct page *curr, *next;
32 erofs_vtptr_t *pages;
33
34 unsigned int nr, index;
35};
36
37static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor,
38 bool atomic)
39{
40 if (!ctor->curr)
41 return;
42
43 if (atomic)
44 kunmap_atomic(ctor->pages);
45 else
46 kunmap(ctor->curr);
47}
48
49static inline struct page *
50z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor,
51 unsigned int nr)
52{
53 unsigned int index;
54
55 /* reuse the reserved next page if any; otherwise scan for an unoccupied (EXCLUSIVE) page */
56 if (ctor->next)
57 return ctor->next;
58
59 for (index = 0; index < nr; ++index) {
60 const erofs_vtptr_t t = ctor->pages[index];
61 const unsigned int tags = tagptr_unfold_tags(t);
62
63 if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE)
64 return tagptr_unfold_ptr(t);
65 }
66 DBG_BUGON(nr >= ctor->nr);
67 return NULL;
68}
69
70static inline void
71z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor,
72 bool atomic)
73{
74 struct page *next = z_erofs_pagevec_ctor_next_page(ctor, ctor->nr);
75
76 z_erofs_pagevec_ctor_exit(ctor, atomic);
77
78 ctor->curr = next;
79 ctor->next = NULL;
80 ctor->pages = atomic ?
81 kmap_atomic(ctor->curr) : kmap(ctor->curr);
82
83 ctor->nr = PAGE_SIZE / sizeof(struct page *);
84 ctor->index = 0;
85}
86
87static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor,
88 unsigned int nr,
89 erofs_vtptr_t *pages,
90 unsigned int i)
91{
92 ctor->nr = nr;
93 ctor->curr = ctor->next = NULL;
94 ctor->pages = pages;
95
96 if (i >= nr) {
97 i -= nr;
98 z_erofs_pagevec_ctor_pagedown(ctor, false);
99 while (i > ctor->nr) {
100 i -= ctor->nr;
101 z_erofs_pagevec_ctor_pagedown(ctor, false);
102 }
103 }
104 ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i);
105 ctor->index = i;
106}
107
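/*
 * Queue one page together with its type tag.  If no continuation page
 * has been reserved yet and this page cannot serve as one (its type is
 * not EXCLUSIVE), refuse to fill the final slot so the caller can flush
 * first; *occupied reports whether the page now backs the next pagevec.
 */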
108static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor,
109 struct page *page,
110 enum z_erofs_page_type type,
111 bool *occupied)
112{
113 *occupied = false;
114 if (unlikely(!ctor->next && type))
115 if (ctor->index + 1 == ctor->nr)
116 return false;
117
118 if (unlikely(ctor->index >= ctor->nr))
119 z_erofs_pagevec_ctor_pagedown(ctor, false);
120
121 /* exclusive page type must be 0 */
122 if (Z_EROFS_PAGE_TYPE_EXCLUSIVE != (uintptr_t)NULL)
123 __bad_page_type_exclusive();
124
125 /* note that ctor->next, a page pointer, can never equal 1 or 2 */
126 if (type == (uintptr_t)ctor->next) {
127 ctor->next = page;
128 *occupied = true;
129 }
130 ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, page, type);
131 return true;
132}
133
134static inline struct page *
135z_erofs_pagevec_dequeue(struct z_erofs_pagevec_ctor *ctor,
136 enum z_erofs_page_type *type)
137{
138 erofs_vtptr_t t;
139
140 if (unlikely(ctor->index >= ctor->nr)) {
141 DBG_BUGON(!ctor->next);
142 z_erofs_pagevec_ctor_pagedown(ctor, true);
143 }
144
145 t = ctor->pages[ctor->index];
146
147 *type = tagptr_unfold_tags(t);
148
149 /* note that ctor->next, a page pointer, can never equal 1 or 2 */
150 if (*type == (uintptr_t)ctor->next)
151 ctor->next = tagptr_unfold_ptr(t);
152
153 ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, NULL, 0);
154 return tagptr_unfold_ptr(t);
155}
156#endif
157