From cc83dfc7cea40b7fc76c7a4d432e7a29991ae9e8 Mon Sep 17 00:00:00 2001
From: Gao Xiang
Date: Tue, 30 Oct 2018 15:07:28 -0700
Subject: [PATCH 01/34] lib/lz4: update LZ4 decompressor module

Update the LZ4 compression module based on LZ4 v1.8.3, so that the
erofs file system can use the newest LZ4_decompress_safe_partial(),
which can now decode exactly the number of bytes requested [1], to
replace the open-coded hack in the erofs file system itself.
Currently, apart from the erofs file system, there are no other users
of LZ4_decompress_safe_partial(), so the interface change is not a
concern.

In addition, LZ4 v1.8.x boosts decompression speed compared to the
current code, which is based on LZ4 v1.7.3, mainly due to a shortcut
optimization for common LZ4 sequences [2].

lzbench test data (tested on kirin710, 8 cores, 4 big cores at
2189 MHz, 2GB DDR RAM at 1622 MHz, with the enwik8 test data [3]):

Compressor name      Compress.  Decompress.  Compr. size   Ratio Filename
memcpy               5004 MB/s   4924 MB/s     100000000  100.00 enwik8
lz4hc 1.7.3 -9         12 MB/s    653 MB/s      42203253   42.20 enwik8
lz4hc 1.8.0 -9         12 MB/s    908 MB/s      42203096   42.20 enwik8
lz4hc 1.8.3 -9         11 MB/s    965 MB/s      42203094   42.20 enwik8

[1] https://github.com/lz4/lz4/issues/566
    https://github.com/lz4/lz4/commit/08d347b5b217b011ff7487130b79480d8cfdaeb8
[2] v1.8.1 perf: slightly faster compression and decompression speed
    https://github.com/lz4/lz4/commit/a31b7058cb97e4393da55e78a77a1c6f0c9ae038
    v1.8.2 perf: slightly faster HC compression and decompression speed
    https://github.com/lz4/lz4/commit/45f8603aae389d34c689d3ff7427b314071ccd2c
    https://github.com/lz4/lz4/commit/1a191b3f8d26b50a7c1d41590b529ec308d768cd
[3] http://mattmahoney.net/dc/textdata.html
    http://mattmahoney.net/dc/enwik8.zip

Link: http://lkml.kernel.org/r/1537181207-21932-1-git-send-email-gaoxiang25@huawei.com
Signed-off-by: Gao Xiang
Tested-by: Guo Xuenan
Cc: Colin Ian King
Cc: Yann Collet
Cc: Greg Kroah-Hartman
Cc: Fang Wei
Cc: Chao Yu
Cc: Miao Xie
Cc: Sven Schmidt <4sschmid@informatik.uni-hamburg.de>
Cc: Kyungsik Lee
Cc:
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 lib/lz4/lz4_decompress.c | 495 +++++++++++++++++++++++++++------------
 lib/lz4/lz4defs.h        |   9 +-
 2 files changed, 356 insertions(+), 148 deletions(-)

diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
index 141734d255e4..0c9d3ad17e0f 100644
--- a/lib/lz4/lz4_decompress.c
+++ b/lib/lz4/lz4_decompress.c
@@ -43,30 +43,36 @@
 /*-*****************************
  * Decompression functions
  *******************************/
-/* LZ4_decompress_generic() :
- * This generic decompression function cover all use cases.
- * It shall be instantiated several times, using different sets of directives
- * Note that it is important this generic function is really inlined,
+
+#define DEBUGLOG(l, ...) {}	/* disabled */
+
+#ifndef assert
+#define assert(condition) ((void)0)
+#endif
+
+/*
+ * LZ4_decompress_generic() :
+ * This generic decompression function covers all use cases.
+ * It shall be instantiated several times, using different sets of directives.
+ * Note that it is important for performance that this function really get inlined,
  * in order to remove useless branches during compilation optimization.
  */
 static FORCE_INLINE int LZ4_decompress_generic(
-	 const char * const source,
-	 char * const dest,
-	 int inputSize,
+	 const char * const src,
+	 char * const dst,
+	 int srcSize,
		/*
		 * If endOnInput == endOnInputSize,
-		 * this value is the max size of Output Buffer.
+ * this value is `dstCapacity` */ int outputSize, /* endOnOutputSize, endOnInputSize */ - int endOnInput, + endCondition_directive endOnInput, /* full, partial */ - int partialDecoding, - /* only used if partialDecoding == partial */ - int targetOutputSize, + earlyEnd_directive partialDecoding, /* noDict, withPrefix64k, usingExtDict */ - int dict, - /* == dest when no prefix */ + dict_directive dict, + /* always <= dst, == dst when no prefix */ const BYTE * const lowPrefix, /* only if dict == usingExtDict */ const BYTE * const dictStart, @@ -74,35 +80,43 @@ static FORCE_INLINE int LZ4_decompress_generic( const size_t dictSize ) { - /* Local Variables */ - const BYTE *ip = (const BYTE *) source; - const BYTE * const iend = ip + inputSize; + const BYTE *ip = (const BYTE *) src; + const BYTE * const iend = ip + srcSize; - BYTE *op = (BYTE *) dest; + BYTE *op = (BYTE *) dst; BYTE * const oend = op + outputSize; BYTE *cpy; - BYTE *oexit = op + targetOutputSize; - const BYTE * const lowLimit = lowPrefix - dictSize; const BYTE * const dictEnd = (const BYTE *)dictStart + dictSize; - static const unsigned int dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; - static const int dec64table[] = { 0, 0, 0, -1, 0, 1, 2, 3 }; + static const unsigned int inc32table[8] = {0, 1, 2, 1, 0, 4, 4, 4}; + static const int dec64table[8] = {0, 0, 0, -1, -4, 1, 2, 3}; const int safeDecode = (endOnInput == endOnInputSize); const int checkOffset = ((safeDecode) && (dictSize < (int)(64 * KB))); + /* Set up the "end" pointers for the shortcut. */ + const BYTE *const shortiend = iend - + (endOnInput ? 14 : 8) /*maxLL*/ - 2 /*offset*/; + const BYTE *const shortoend = oend - + (endOnInput ? 14 : 8) /*maxLL*/ - 18 /*maxML*/; + + DEBUGLOG(5, "%s (srcSize:%i, dstSize:%i)", __func__, + srcSize, outputSize); + /* Special cases */ - /* targetOutputSize too high => decode everything */ - if ((partialDecoding) && (oexit > oend - MFLIMIT)) - oexit = oend - MFLIMIT; + assert(lowPrefix <= op); + assert(src != NULL); /* Empty output buffer */ if ((endOnInput) && (unlikely(outputSize == 0))) - return ((inputSize == 1) && (*ip == 0)) ? 0 : -1; + return ((srcSize == 1) && (*ip == 0)) ? 0 : -1; if ((!endOnInput) && (unlikely(outputSize == 0))) return (*ip == 0 ? 1 : -1); + if ((endOnInput) && unlikely(srcSize == 0)) + return -1; + /* Main Loop : decode sequences */ while (1) { size_t length; @@ -111,12 +125,74 @@ static FORCE_INLINE int LZ4_decompress_generic( /* get literal length */ unsigned int const token = *ip++; - length = token>>ML_BITS; + /* ip < iend before the increment */ + assert(!endOnInput || ip <= iend); + + /* + * A two-stage shortcut for the most common case: + * 1) If the literal length is 0..14, and there is enough + * space, enter the shortcut and copy 16 bytes on behalf + * of the literals (in the fast mode, only 8 bytes can be + * safely copied this way). + * 2) Further if the match length is 4..18, copy 18 bytes + * in a similar manner; but we ensure that there's enough + * space in the output for those 18 bytes earlier, upon + * entering the shortcut (in other words, there is a + * combined check for both stages). + */ + if ((endOnInput ? length != RUN_MASK : length <= 8) + /* + * strictly "less than" on input, to re-enter + * the loop with at least one byte + */ + && likely((endOnInput ? ip < shortiend : 1) & + (op <= shortoend))) { + /* Copy the literals */ + memcpy(op, ip, endOnInput ? 16 : 8); + op += length; ip += length; + + /* + * The second stage: + * prepare for match copying, decode full info. 
+ * If it doesn't work out, the info won't be wasted. + */ + length = token & ML_MASK; /* match length */ + offset = LZ4_readLE16(ip); + ip += 2; + match = op - offset; + assert(match <= op); /* check overflow */ + + /* Do not deal with overlapping matches. */ + if ((length != ML_MASK) && + (offset >= 8) && + (dict == withPrefix64k || match >= lowPrefix)) { + /* Copy the match. */ + memcpy(op + 0, match + 0, 8); + memcpy(op + 8, match + 8, 8); + memcpy(op + 16, match + 16, 2); + op += length + MINMATCH; + /* Both stages worked, load the next token. */ + continue; + } + + /* + * The second stage didn't work out, but the info + * is ready. Propel it right to the point of match + * copying. + */ + goto _copy_match; + } + + /* decode literal length */ if (length == RUN_MASK) { unsigned int s; + if (unlikely(endOnInput ? ip >= iend - RUN_MASK : 0)) { + /* overflow detection */ + goto _output_error; + } do { s = *ip++; length += s; @@ -125,14 +201,14 @@ static FORCE_INLINE int LZ4_decompress_generic( : 1) & (s == 255)); if ((safeDecode) - && unlikely( - (size_t)(op + length) < (size_t)(op))) { + && unlikely((uptrval)(op) + + length < (uptrval)(op))) { /* overflow detection */ goto _output_error; } if ((safeDecode) - && unlikely( - (size_t)(ip + length) < (size_t)(ip))) { + && unlikely((uptrval)(ip) + + length < (uptrval)(ip))) { /* overflow detection */ goto _output_error; } @@ -140,16 +216,19 @@ static FORCE_INLINE int LZ4_decompress_generic( /* copy literals */ cpy = op + length; - if (((endOnInput) && ((cpy > (partialDecoding ? oexit : oend - MFLIMIT)) + LZ4_STATIC_ASSERT(MFLIMIT >= WILDCOPYLENGTH); + + if (((endOnInput) && ((cpy > oend - MFLIMIT) || (ip + length > iend - (2 + 1 + LASTLITERALS)))) || ((!endOnInput) && (cpy > oend - WILDCOPYLENGTH))) { if (partialDecoding) { if (cpy > oend) { /* - * Error : - * write attempt beyond end of output buffer + * Partial decoding : + * stop in the middle of literal segment */ - goto _output_error; + cpy = oend; + length = oend - op; } if ((endOnInput) && (ip + length > iend)) { @@ -184,29 +263,43 @@ static FORCE_INLINE int LZ4_decompress_generic( memcpy(op, ip, length); ip += length; op += length; - /* Necessarily EOF, due to parsing restrictions */ - break; - } - LZ4_wildCopy(op, ip, cpy); - ip += length; - op = cpy; + /* Necessarily EOF, due to parsing restrictions */ + if (!partialDecoding || (cpy == oend)) + break; + } else { + /* may overwrite up to WILDCOPYLENGTH beyond cpy */ + LZ4_wildCopy(op, ip, cpy); + ip += length; + op = cpy; + } /* get offset */ offset = LZ4_readLE16(ip); ip += 2; match = op - offset; - if ((checkOffset) && (unlikely(match < lowLimit))) { + /* get matchlength */ + length = token & ML_MASK; + +_copy_match: + if ((checkOffset) && (unlikely(match + dictSize < lowPrefix))) { /* Error : offset outside buffers */ goto _output_error; } /* costs ~1%; silence an msan warning when offset == 0 */ - LZ4_write32(op, (U32)offset); + /* + * note : when partialDecoding, there is no guarantee that + * at least 4 bytes remain available in output buffer + */ + if (!partialDecoding) { + assert(oend > op); + assert(oend - op >= 4); + + LZ4_write32(op, (U32)offset); + } - /* get matchlength */ - length = token & ML_MASK; if (length == ML_MASK) { unsigned int s; @@ -221,7 +314,7 @@ static FORCE_INLINE int LZ4_decompress_generic( if ((safeDecode) && unlikely( - (size_t)(op + length) < (size_t)op)) { + (uptrval)(op) + length < (uptrval)op)) { /* overflow detection */ goto _output_error; } @@ -229,24 +322,26 @@ static FORCE_INLINE int 
LZ4_decompress_generic( length += MINMATCH; - /* check external dictionary */ + /* match starting within external dictionary */ if ((dict == usingExtDict) && (match < lowPrefix)) { if (unlikely(op + length > oend - LASTLITERALS)) { /* doesn't respect parsing restriction */ - goto _output_error; + if (!partialDecoding) + goto _output_error; + length = min(length, (size_t)(oend - op)); } if (length <= (size_t)(lowPrefix - match)) { /* - * match can be copied as a single segment - * from external dictionary + * match fits entirely within external + * dictionary : just copy */ memmove(op, dictEnd - (lowPrefix - match), length); op += length; } else { /* - * match encompass external + * match stretches into both external * dictionary and current block */ size_t const copySize = (size_t)(lowPrefix - match); @@ -254,7 +349,6 @@ static FORCE_INLINE int LZ4_decompress_generic( memcpy(op, dictEnd - copySize, copySize); op += copySize; - if (restSize > (size_t)(op - lowPrefix)) { /* overlap copy */ BYTE * const endOfMatch = op + restSize; @@ -267,23 +361,44 @@ static FORCE_INLINE int LZ4_decompress_generic( op += restSize; } } - continue; } /* copy match within block */ cpy = op + length; - if (unlikely(offset < 8)) { - const int dec64 = dec64table[offset]; + /* + * partialDecoding : + * may not respect endBlock parsing restrictions + */ + assert(op <= oend); + if (partialDecoding && + (cpy > oend - MATCH_SAFEGUARD_DISTANCE)) { + size_t const mlen = min(length, (size_t)(oend - op)); + const BYTE * const matchEnd = match + mlen; + BYTE * const copyEnd = op + mlen; + if (matchEnd > op) { + /* overlap copy */ + while (op < copyEnd) + *op++ = *match++; + } else { + memcpy(op, match, mlen); + } + op = copyEnd; + if (op == oend) + break; + continue; + } + + if (unlikely(offset < 8)) { op[0] = match[0]; op[1] = match[1]; op[2] = match[2]; op[3] = match[3]; - match += dec32table[offset]; + match += inc32table[offset]; memcpy(op + 4, match, 4); - match -= dec64; + match -= dec64table[offset]; } else { LZ4_copy8(op, match); match += 8; @@ -291,7 +406,7 @@ static FORCE_INLINE int LZ4_decompress_generic( op += 8; - if (unlikely(cpy > oend - 12)) { + if (unlikely(cpy > oend - MATCH_SAFEGUARD_DISTANCE)) { BYTE * const oCopyLimit = oend - (WILDCOPYLENGTH - 1); if (cpy > oend - LASTLITERALS) { @@ -307,60 +422,139 @@ static FORCE_INLINE int LZ4_decompress_generic( match += oCopyLimit - op; op = oCopyLimit; } - while (op < cpy) *op++ = *match++; } else { LZ4_copy8(op, match); - if (length > 16) LZ4_wildCopy(op + 8, match + 8, cpy); } - - op = cpy; /* correction */ + op = cpy; /* wildcopy correction */ } /* end of decoding */ if (endOnInput) { /* Nb of output bytes decoded */ - return (int) (((char *)op) - dest); + return (int) (((char *)op) - dst); } else { /* Nb of input bytes read */ - return (int) (((const char *)ip) - source); + return (int) (((const char *)ip) - src); } /* Overflow error detected */ _output_error: - return -1; + return (int) (-(((const char *)ip) - src)) - 1; } int LZ4_decompress_safe(const char *source, char *dest, int compressedSize, int maxDecompressedSize) { - return LZ4_decompress_generic(source, dest, compressedSize, - maxDecompressedSize, endOnInputSize, full, 0, - noDict, (BYTE *)dest, NULL, 0); + return LZ4_decompress_generic(source, dest, + compressedSize, maxDecompressedSize, + endOnInputSize, decode_full_block, + noDict, (BYTE *)dest, NULL, 0); } -int LZ4_decompress_safe_partial(const char *source, char *dest, - int compressedSize, int targetOutputSize, int maxDecompressedSize) +int 
LZ4_decompress_safe_partial(const char *src, char *dst, + int compressedSize, int targetOutputSize, int dstCapacity) { - return LZ4_decompress_generic(source, dest, compressedSize, - maxDecompressedSize, endOnInputSize, partial, - targetOutputSize, noDict, (BYTE *)dest, NULL, 0); + dstCapacity = min(targetOutputSize, dstCapacity); + return LZ4_decompress_generic(src, dst, compressedSize, dstCapacity, + endOnInputSize, partial_decode, + noDict, (BYTE *)dst, NULL, 0); } int LZ4_decompress_fast(const char *source, char *dest, int originalSize) { return LZ4_decompress_generic(source, dest, 0, originalSize, - endOnOutputSize, full, 0, withPrefix64k, - (BYTE *)(dest - 64 * KB), NULL, 64 * KB); + endOnOutputSize, decode_full_block, + withPrefix64k, + (BYTE *)dest - 64 * KB, NULL, 0); } +/* ===== Instantiate a few more decoding cases, used more than once. ===== */ + +int LZ4_decompress_safe_withPrefix64k(const char *source, char *dest, + int compressedSize, int maxOutputSize) +{ + return LZ4_decompress_generic(source, dest, + compressedSize, maxOutputSize, + endOnInputSize, decode_full_block, + withPrefix64k, + (BYTE *)dest - 64 * KB, NULL, 0); +} + +static int LZ4_decompress_safe_withSmallPrefix(const char *source, char *dest, + int compressedSize, + int maxOutputSize, + size_t prefixSize) +{ + return LZ4_decompress_generic(source, dest, + compressedSize, maxOutputSize, + endOnInputSize, decode_full_block, + noDict, + (BYTE *)dest - prefixSize, NULL, 0); +} + +int LZ4_decompress_safe_forceExtDict(const char *source, char *dest, + int compressedSize, int maxOutputSize, + const void *dictStart, size_t dictSize) +{ + return LZ4_decompress_generic(source, dest, + compressedSize, maxOutputSize, + endOnInputSize, decode_full_block, + usingExtDict, (BYTE *)dest, + (const BYTE *)dictStart, dictSize); +} + +static int LZ4_decompress_fast_extDict(const char *source, char *dest, + int originalSize, + const void *dictStart, size_t dictSize) +{ + return LZ4_decompress_generic(source, dest, + 0, originalSize, + endOnOutputSize, decode_full_block, + usingExtDict, (BYTE *)dest, + (const BYTE *)dictStart, dictSize); +} + +/* + * The "double dictionary" mode, for use with e.g. ring buffers: the first part + * of the dictionary is passed as prefix, and the second via dictStart + dictSize. + * These routines are used only once, in LZ4_decompress_*_continue(). 
+ */ +static FORCE_INLINE +int LZ4_decompress_safe_doubleDict(const char *source, char *dest, + int compressedSize, int maxOutputSize, + size_t prefixSize, + const void *dictStart, size_t dictSize) +{ + return LZ4_decompress_generic(source, dest, + compressedSize, maxOutputSize, + endOnInputSize, decode_full_block, + usingExtDict, (BYTE *)dest - prefixSize, + (const BYTE *)dictStart, dictSize); +} + +static FORCE_INLINE +int LZ4_decompress_fast_doubleDict(const char *source, char *dest, + int originalSize, size_t prefixSize, + const void *dictStart, size_t dictSize) +{ + return LZ4_decompress_generic(source, dest, + 0, originalSize, + endOnOutputSize, decode_full_block, + usingExtDict, (BYTE *)dest - prefixSize, + (const BYTE *)dictStart, dictSize); +} + +/* ===== streaming decompression functions ===== */ + int LZ4_setStreamDecode(LZ4_streamDecode_t *LZ4_streamDecode, const char *dictionary, int dictSize) { - LZ4_streamDecode_t_internal *lz4sd = (LZ4_streamDecode_t_internal *) LZ4_streamDecode; + LZ4_streamDecode_t_internal *lz4sd = + &LZ4_streamDecode->internal_donotuse; lz4sd->prefixSize = (size_t) dictSize; lz4sd->prefixEnd = (const BYTE *) dictionary + dictSize; @@ -382,35 +576,51 @@ int LZ4_setStreamDecode(LZ4_streamDecode_t *LZ4_streamDecode, int LZ4_decompress_safe_continue(LZ4_streamDecode_t *LZ4_streamDecode, const char *source, char *dest, int compressedSize, int maxOutputSize) { - LZ4_streamDecode_t_internal *lz4sd = &LZ4_streamDecode->internal_donotuse; + LZ4_streamDecode_t_internal *lz4sd = + &LZ4_streamDecode->internal_donotuse; int result; - if (lz4sd->prefixEnd == (BYTE *)dest) { - result = LZ4_decompress_generic(source, dest, - compressedSize, - maxOutputSize, - endOnInputSize, full, 0, - usingExtDict, lz4sd->prefixEnd - lz4sd->prefixSize, - lz4sd->externalDict, - lz4sd->extDictSize); - + if (lz4sd->prefixSize == 0) { + /* The first call, no dictionary yet. */ + assert(lz4sd->extDictSize == 0); + result = LZ4_decompress_safe(source, dest, + compressedSize, maxOutputSize); + if (result <= 0) + return result; + lz4sd->prefixSize = result; + lz4sd->prefixEnd = (BYTE *)dest + result; + } else if (lz4sd->prefixEnd == (BYTE *)dest) { + /* They're rolling the current segment. */ + if (lz4sd->prefixSize >= 64 * KB - 1) + result = LZ4_decompress_safe_withPrefix64k(source, dest, + compressedSize, maxOutputSize); + else if (lz4sd->extDictSize == 0) + result = LZ4_decompress_safe_withSmallPrefix(source, + dest, compressedSize, maxOutputSize, + lz4sd->prefixSize); + else + result = LZ4_decompress_safe_doubleDict(source, dest, + compressedSize, maxOutputSize, + lz4sd->prefixSize, + lz4sd->externalDict, lz4sd->extDictSize); if (result <= 0) return result; - lz4sd->prefixSize += result; - lz4sd->prefixEnd += result; + lz4sd->prefixEnd += result; } else { + /* + * The buffer wraps around, or they're + * switching to another buffer. 
+ */ lz4sd->extDictSize = lz4sd->prefixSize; lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; - result = LZ4_decompress_generic(source, dest, + result = LZ4_decompress_safe_forceExtDict(source, dest, compressedSize, maxOutputSize, - endOnInputSize, full, 0, - usingExtDict, (BYTE *)dest, lz4sd->externalDict, lz4sd->extDictSize); if (result <= 0) return result; lz4sd->prefixSize = result; - lz4sd->prefixEnd = (BYTE *)dest + result; + lz4sd->prefixEnd = (BYTE *)dest + result; } return result; @@ -422,75 +632,66 @@ int LZ4_decompress_fast_continue(LZ4_streamDecode_t *LZ4_streamDecode, LZ4_streamDecode_t_internal *lz4sd = &LZ4_streamDecode->internal_donotuse; int result; - if (lz4sd->prefixEnd == (BYTE *)dest) { - result = LZ4_decompress_generic(source, dest, 0, originalSize, - endOnOutputSize, full, 0, - usingExtDict, - lz4sd->prefixEnd - lz4sd->prefixSize, - lz4sd->externalDict, lz4sd->extDictSize); - - if (result <= 0) - return result; - - lz4sd->prefixSize += originalSize; - lz4sd->prefixEnd += originalSize; - } else { - lz4sd->extDictSize = lz4sd->prefixSize; - lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; - result = LZ4_decompress_generic(source, dest, 0, originalSize, - endOnOutputSize, full, 0, - usingExtDict, (BYTE *)dest, - lz4sd->externalDict, lz4sd->extDictSize); + if (lz4sd->prefixSize == 0) { + assert(lz4sd->extDictSize == 0); + result = LZ4_decompress_fast(source, dest, originalSize); if (result <= 0) return result; lz4sd->prefixSize = originalSize; - lz4sd->prefixEnd = (BYTE *)dest + originalSize; + lz4sd->prefixEnd = (BYTE *)dest + originalSize; + } else if (lz4sd->prefixEnd == (BYTE *)dest) { + if (lz4sd->prefixSize >= 64 * KB - 1 || + lz4sd->extDictSize == 0) + result = LZ4_decompress_fast(source, dest, + originalSize); + else + result = LZ4_decompress_fast_doubleDict(source, dest, + originalSize, lz4sd->prefixSize, + lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) + return result; + lz4sd->prefixSize += originalSize; + lz4sd->prefixEnd += originalSize; + } else { + lz4sd->extDictSize = lz4sd->prefixSize; + lz4sd->externalDict = lz4sd->prefixEnd - lz4sd->extDictSize; + result = LZ4_decompress_fast_extDict(source, dest, + originalSize, lz4sd->externalDict, lz4sd->extDictSize); + if (result <= 0) + return result; + lz4sd->prefixSize = originalSize; + lz4sd->prefixEnd = (BYTE *)dest + originalSize; } - return result; } -/* - * Advanced decoding functions : - * *_usingDict() : - * These decoding functions work the same as "_continue" ones, - * the dictionary must be explicitly provided within parameters - */ -static FORCE_INLINE int LZ4_decompress_usingDict_generic(const char *source, - char *dest, int compressedSize, int maxOutputSize, int safe, - const char *dictStart, int dictSize) +int LZ4_decompress_safe_usingDict(const char *source, char *dest, + int compressedSize, int maxOutputSize, + const char *dictStart, int dictSize) { if (dictSize == 0) - return LZ4_decompress_generic(source, dest, - compressedSize, maxOutputSize, safe, full, 0, - noDict, (BYTE *)dest, NULL, 0); - if (dictStart + dictSize == dest) { - if (dictSize >= (int)(64 * KB - 1)) - return LZ4_decompress_generic(source, dest, - compressedSize, maxOutputSize, safe, full, 0, - withPrefix64k, (BYTE *)dest - 64 * KB, NULL, 0); - return LZ4_decompress_generic(source, dest, compressedSize, - maxOutputSize, safe, full, 0, noDict, - (BYTE *)dest - dictSize, NULL, 0); + return LZ4_decompress_safe(source, dest, + compressedSize, maxOutputSize); + if (dictStart+dictSize == dest) { + 
+		if (dictSize >= 64 * KB - 1)
+			return LZ4_decompress_safe_withPrefix64k(source, dest,
+				compressedSize, maxOutputSize);
+		return LZ4_decompress_safe_withSmallPrefix(source, dest,
+			compressedSize, maxOutputSize, dictSize);
 	}
-	return LZ4_decompress_generic(source, dest, compressedSize,
-		maxOutputSize, safe, full, 0, usingExtDict,
-		(BYTE *)dest, (const BYTE *)dictStart, dictSize);
-}
-
-int LZ4_decompress_safe_usingDict(const char *source, char *dest,
-	int compressedSize, int maxOutputSize,
-	const char *dictStart, int dictSize)
-{
-	return LZ4_decompress_usingDict_generic(source, dest,
-		compressedSize, maxOutputSize, 1, dictStart, dictSize);
+	return LZ4_decompress_safe_forceExtDict(source, dest,
+		compressedSize, maxOutputSize, dictStart, dictSize);
 }

 int LZ4_decompress_fast_usingDict(const char *source, char *dest,
-	int originalSize, const char *dictStart, int dictSize)
+	int originalSize,
+	const char *dictStart, int dictSize)
 {
-	return LZ4_decompress_usingDict_generic(source, dest, 0,
-		originalSize, 0, dictStart, dictSize);
+	if (dictSize == 0 || dictStart + dictSize == dest)
+		return LZ4_decompress_fast(source, dest, originalSize);
+
+	return LZ4_decompress_fast_extDict(source, dest, originalSize,
+		dictStart, dictSize);
 }

 #ifndef STATIC
diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h
index 00a0b58a0871..1a7fa9d9170f 100644
--- a/lib/lz4/lz4defs.h
+++ b/lib/lz4/lz4defs.h
@@ -75,6 +75,11 @@ typedef uintptr_t uptrval;
 #define WILDCOPYLENGTH 8
 #define LASTLITERALS 5
 #define MFLIMIT (WILDCOPYLENGTH + MINMATCH)
+/*
+ * ensure it's possible to write 2 x wildcopyLength
+ * without overflowing output buffer
+ */
+#define MATCH_SAFEGUARD_DISTANCE ((2 * WILDCOPYLENGTH) - MINMATCH)

 /* Increase this value ==> compression run slower on incompressible data */
 #define LZ4_SKIPTRIGGER 6
@@ -222,6 +227,8 @@ typedef enum { noDict = 0, withPrefix64k, usingExtDict } dict_directive;
 typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive;

 typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive;
-typedef enum { full = 0, partial = 1 } earlyEnd_directive;
+typedef enum { decode_full_block = 0, partial_decode = 1 } earlyEnd_directive;
+
+#define LZ4_STATIC_ASSERT(c)	BUILD_BUG_ON(!(c))

 #endif

From ba653b10fde5207f13c19b903a78cd02ebf1caa3 Mon Sep 17 00:00:00 2001
From: Nick Terrell
Date: Fri, 14 Aug 2020 17:30:10 -0700
Subject: [PATCH 02/34] lz4: fix kernel decompression speed

This patch replaces all memcpy() calls with LZ4_memcpy(), which calls
__builtin_memcpy() so the compiler can inline it.

LZ4 relies heavily on memcpy() with a constant size being inlined. In
x86 and i386 pre-boot environments memcpy() cannot be inlined because
memcpy() doesn't get defined as __builtin_memcpy().

An equivalent patch has been applied upstream so that the next import
won't lose this change [1].

I've measured the kernel decompression speed using QEMU before and
after this patch for the x86_64 and i386 architectures. The speed-up
is about 10x as shown below.

Code   Arch    Kernel Size  Time    Speed
v5.8   x86_64  11504832 B   148 ms   79 MB/s
patch  x86_64  11503872 B    13 ms  885 MB/s
v5.8   i386     9621216 B    91 ms  106 MB/s
patch  i386     9620224 B    10 ms  962 MB/s

I also measured the time to decompress the initramfs on x86_64, i386,
and arm. All three show the same decompression speed before and after,
as expected.
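To illustrate the mechanism (a minimal sketch, not part of the kernel
change; the copy8() helper below is hypothetical): in a freestanding
pre-boot build, a plain memcpy() call stays an opaque external call,
while __builtin_memcpy() with a constant size can be lowered directly
to a few word-sized moves.

	#include <stdint.h>

	/* mirrors the wrapper introduced by this patch */
	#define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size)

	/*
	 * hypothetical helper: even with -ffreestanding, the constant-size
	 * copy below compiles down to one or two register moves instead of
	 * a call into an out-of-line memcpy() implementation
	 */
	static inline void copy8(uint8_t *dst, const uint8_t *src)
	{
		LZ4_memcpy(dst, src, 8);
	}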
[1] https://github.com/lz4/lz4/pull/890

Signed-off-by: Nick Terrell
Signed-off-by: Andrew Morton
Cc: Yann Collet
Cc: Gao Xiang
Cc: Sven Schmidt <4sschmid@informatik.uni-hamburg.de>
Cc: Greg Kroah-Hartman
Cc: Ingo Molnar
Cc: Arvind Sankar
Link: http://lkml.kernel.org/r/20200803194022.2966806-1-nickrterrell@gmail.com
Signed-off-by: Linus Torvalds
---
 lib/lz4/lz4_compress.c   |  4 ++--
 lib/lz4/lz4_decompress.c | 18 +++++++++---------
 lib/lz4/lz4defs.h        | 10 ++++++++++
 lib/lz4/lz4hc_compress.c |  2 +-
 4 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/lib/lz4/lz4_compress.c b/lib/lz4/lz4_compress.c
index cc7b6d4cc7c7..90bb67994688 100644
--- a/lib/lz4/lz4_compress.c
+++ b/lib/lz4/lz4_compress.c
@@ -446,7 +446,7 @@ static FORCE_INLINE int LZ4_compress_generic(
 		*op++ = (BYTE)(lastRun << ML_BITS);
 	}
-	memcpy(op, anchor, lastRun);
+	LZ4_memcpy(op, anchor, lastRun);
 	op += lastRun;
 }
@@ -708,7 +708,7 @@ static int LZ4_compress_destSize_generic(
 	} else {
 		*op++ = (BYTE)(lastRunSize<<ML_BITS);
 	}
-	memcpy(op, anchor, lastRunSize);
+	LZ4_memcpy(op, anchor, lastRunSize);
 	op += lastRunSize;
 }
diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c
index 0c9d3ad17e0f..fea6ad57cab7 100644
--- a/lib/lz4/lz4_decompress.c
+++ b/lib/lz4/lz4_decompress.c
@@ -141,10 +141,10 @@ static FORCE_INLINE int LZ4_decompress_generic(
 			if ((length != ML_MASK) &&
 			    (offset >= 8) &&
 			    (dict == withPrefix64k || match >= lowPrefix)) {
 				/* Copy the match. */
-				memcpy(op + 0, match + 0, 8);
-				memcpy(op + 8, match + 8, 8);
-				memcpy(op + 16, match + 16, 2);
+				LZ4_memcpy(op + 0, match + 0, 8);
+				LZ4_memcpy(op + 8, match + 8, 8);
+				LZ4_memcpy(op + 16, match + 16, 2);
 				op += length + MINMATCH;
 				/* Both stages worked, load the next token. */
 				continue;
@@ -260,7 +260,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
 			}
 		}

-		memcpy(op, ip, length);
+		LZ4_memcpy(op, ip, length);
 		ip += length;
 		op += length;
@@ -347,7 +347,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
 				size_t const copySize = (size_t)(lowPrefix - match);
 				size_t const restSize = length - copySize;

-				memcpy(op, dictEnd - copySize, copySize);
+				LZ4_memcpy(op, dictEnd - copySize, copySize);
 				op += copySize;
 				if (restSize > (size_t)(op - lowPrefix)) {
 					/* overlap copy */
@@ -357,7 +357,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
 					while (op < endOfMatch)
 						*op++ = *copyFrom++;
 				} else {
-					memcpy(op, lowPrefix, restSize);
+					LZ4_memcpy(op, lowPrefix, restSize);
 					op += restSize;
 				}
 			}
@@ -383,7 +383,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
 				while (op < copyEnd)
 					*op++ = *match++;
 			} else {
-				memcpy(op, match, mlen);
+				LZ4_memcpy(op, match, mlen);
 			}
 			op = copyEnd;
 			if (op == oend)
@@ -397,7 +397,7 @@ static FORCE_INLINE int LZ4_decompress_generic(
 			op[2] = match[2];
 			op[3] = match[3];
 			match += inc32table[offset];
-			memcpy(op + 4, match, 4);
+			LZ4_memcpy(op + 4, match, 4);
 			match -= dec64table[offset];
 		} else {
 			LZ4_copy8(op, match);
diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h
index 1a7fa9d9170f..c91dd96ef629 100644
--- a/lib/lz4/lz4defs.h
+++ b/lib/lz4/lz4defs.h
@@ -137,6 +137,16 @@ static FORCE_INLINE void LZ4_writeLE16(void *memPtr, U16 value)
 	return put_unaligned_le16(value, memPtr);
 }

+/*
+ * LZ4 relies on memcpy with a constant size being inlined. In freestanding
+ * environments, the compiler can't assume the implementation of memcpy() is
+ * standard compliant, so apply its specialized memcpy() inlining logic. When
+ * possible, use __builtin_memcpy() to tell the compiler to analyze memcpy()
+ * as-if it were standard compliant, so it can inline it in freestanding
+ * environments. This is needed when decompressing the Linux Kernel, for example.
+ */ +#define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size) + static FORCE_INLINE void LZ4_copy8(void *dst, const void *src) { #if LZ4_ARCH64 diff --git a/lib/lz4/lz4hc_compress.c b/lib/lz4/lz4hc_compress.c index 176f03b83e56..ccbf0e89d713 100644 --- a/lib/lz4/lz4hc_compress.c +++ b/lib/lz4/lz4hc_compress.c @@ -570,7 +570,7 @@ static int LZ4HC_compress_generic( *op++ = (BYTE) lastRun; } else *op++ = (BYTE)(lastRun< Date: Tue, 15 Dec 2020 20:44:03 -0800 Subject: [PATCH 03/34] lib/lz4: explicitly support in-place decompression LZ4 final literal copy could be overlapped when doing in-place decompression, so it's unsafe to just use memcpy() on an optimized memcpy approach but memmove() instead. Upstream LZ4 has updated this years ago [1] (and the impact is non-sensible [2] plus only a few bytes remain), this commit just synchronizes LZ4 upstream code to the kernel side as well. It can be observed as EROFS in-place decompression failure on specific files when X86_FEATURE_ERMS is unsupported, memcpy() optimization of commit 59daa706fbec ("x86, mem: Optimize memcpy by avoiding memory false dependece") will be enabled then. Currently most modern x86-CPUs support ERMS, these CPUs just use "rep movsb" approach so no problem at all. However, it can still be verified with forcely disabling ERMS feature... arch/x86/lib/memcpy_64.S: ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \ - "jmp memcpy_erms", X86_FEATURE_ERMS + "jmp memcpy_orig", X86_FEATURE_ERMS We didn't observe any strange on arm64/arm/x86 platform before since most memcpy() would behave in an increasing address order ("copy upwards" [3]) and it's the correct order of in-place decompression but it really needs an update to memmove() for sure considering it's an undefined behavior according to the standard and some unique optimization already exists in the kernel. [1] https://github.com/lz4/lz4/commit/33cb8518ac385835cc17be9a770b27b40cd0e15b [2] https://github.com/lz4/lz4/pull/717#issuecomment-497818921 [3] https://sourceware.org/bugzilla/show_bug.cgi?id=12518 Link: https://lkml.kernel.org/r/20201122030749.2698994-1-hsiangkao@redhat.com Signed-off-by: Gao Xiang Reviewed-by: Nick Terrell Cc: Yann Collet Cc: Miao Xie Cc: Chao Yu Cc: Li Guifu Cc: Guo Xuenan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/lz4/lz4_decompress.c | 6 +++++- lib/lz4/lz4defs.h | 1 + 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/lib/lz4/lz4_decompress.c b/lib/lz4/lz4_decompress.c index fea6ad57cab7..cda8b794da04 100644 --- a/lib/lz4/lz4_decompress.c +++ b/lib/lz4/lz4_decompress.c @@ -260,7 +260,11 @@ static FORCE_INLINE int LZ4_decompress_generic( } } - LZ4_memcpy(op, ip, length); + /* + * supports overlapping memory regions; only matters + * for in-place decompression scenarios + */ + LZ4_memmove(op, ip, length); ip += length; op += length; diff --git a/lib/lz4/lz4defs.h b/lib/lz4/lz4defs.h index c91dd96ef629..673bd206aa98 100644 --- a/lib/lz4/lz4defs.h +++ b/lib/lz4/lz4defs.h @@ -146,6 +146,7 @@ static FORCE_INLINE void LZ4_writeLE16(void *memPtr, U16 value) * environments. This is needed when decompressing the Linux Kernel, for example. 
*/ #define LZ4_memcpy(dst, src, size) __builtin_memcpy(dst, src, size) +#define LZ4_memmove(dst, src, size) __builtin_memmove(dst, src, size) static FORCE_INLINE void LZ4_copy8(void *dst, const void *src) { From 4cbdeb6216e440bddb871d1bffd0d66f7752fd80 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 25 Aug 2021 11:42:03 +0800 Subject: [PATCH 04/34] erofs: sync up with kernel 5.10 Backport 5.10 LTS erofs to 4.19. Signed-off-by: Gao Xiang --- drivers/staging/erofs/Kconfig | 97 +- drivers/staging/erofs/Makefile | 10 +- drivers/staging/erofs/compress.h | 58 + drivers/staging/erofs/data.c | 224 +-- drivers/staging/erofs/decompressor.c | 344 ++++ drivers/staging/erofs/dir.c | 78 +- drivers/staging/erofs/erofs_fs.h | 278 +-- drivers/staging/erofs/inode.c | 415 ++-- drivers/staging/erofs/internal.h | 491 ++--- drivers/staging/erofs/lz4defs.h | 227 --- drivers/staging/erofs/namei.c | 82 +- drivers/staging/erofs/super.c | 592 +++--- .../erofs/{include/linux => }/tagptr.h | 15 +- drivers/staging/erofs/unzip_lz4.c | 251 --- drivers/staging/erofs/unzip_vle.c | 1695 ----------------- drivers/staging/erofs/unzip_vle_lz4.c | 210 -- drivers/staging/erofs/utils.c | 225 +-- drivers/staging/erofs/xattr.c | 254 ++- drivers/staging/erofs/xattr.h | 78 +- drivers/staging/erofs/zdata.c | 1370 +++++++++++++ drivers/staging/erofs/zdata.h | 188 ++ drivers/staging/erofs/zmap.c | 477 +++++ .../erofs/{unzip_pagevec.h => zpvec.h} | 71 +- .../include => include}/trace/events/erofs.h | 39 +- 24 files changed, 3827 insertions(+), 3942 deletions(-) create mode 100644 drivers/staging/erofs/compress.h create mode 100644 drivers/staging/erofs/decompressor.c delete mode 100644 drivers/staging/erofs/lz4defs.h rename drivers/staging/erofs/{include/linux => }/tagptr.h (93%) delete mode 100644 drivers/staging/erofs/unzip_lz4.c delete mode 100644 drivers/staging/erofs/unzip_vle.c delete mode 100644 drivers/staging/erofs/unzip_vle_lz4.c create mode 100644 drivers/staging/erofs/zdata.c create mode 100644 drivers/staging/erofs/zdata.h create mode 100644 drivers/staging/erofs/zmap.c rename drivers/staging/erofs/{unzip_pagevec.h => zpvec.h} (67%) rename {drivers/staging/erofs/include => include}/trace/events/erofs.h (85%) diff --git a/drivers/staging/erofs/Kconfig b/drivers/staging/erofs/Kconfig index 663b755bf2fb..74b0aaa7114c 100644 --- a/drivers/staging/erofs/Kconfig +++ b/drivers/staging/erofs/Kconfig @@ -1,19 +1,20 @@ -# SPDX-License-Identifier: GPL-2.0 +# SPDX-License-Identifier: GPL-2.0-only config EROFS_FS tristate "EROFS filesystem support" depends on BLOCK + select LIBCRC32C help - EROFS(Enhanced Read-Only File System) is a lightweight + EROFS (Enhanced Read-Only File System) is a lightweight read-only file system with modern designs (eg. page-sized blocks, inline xattrs/data, etc.) for scenarios which need - high-performance read-only requirements, eg. firmwares in - mobile phone or LIVECDs. + high-performance read-only requirements, e.g. Android OS + for mobile phones and LIVECDs. - It also provides VLE compression support, focusing on - random read improvements, keeping relatively lower - compression ratios, which is useful for high-performance - devices with limited memory and ROM space. + It also provides fixed-sized output compression support, + which improves storage density, keeps relatively higher + compression ratios, which is more useful to achieve high + performance for embedded devices with limited memory. If unsure, say N. 
@@ -21,8 +22,9 @@ config EROFS_FS_DEBUG bool "EROFS debugging feature" depends on EROFS_FS help - Print EROFS debugging messages and enable more BUG_ONs - which check the filesystem consistency aggressively. + Print debugging messages and enable more BUG_ONs which check + filesystem consistency and find potential issues aggressively, + which can be used for Android eng build, for example. For daily use, say N. @@ -54,6 +56,7 @@ config EROFS_FS_POSIX_ACL config EROFS_FS_SECURITY bool "EROFS Security Labels" depends on EROFS_FS_XATTR + default y help Security labels provide an access control facility to support Linux Security Models (LSMs) accepted by AppArmor, SELinux, Smack and TOMOYO @@ -63,29 +66,15 @@ config EROFS_FS_SECURITY If you are not using a security module, say N. -config EROFS_FS_USE_VM_MAP_RAM - bool "EROFS VM_MAP_RAM Support" - depends on EROFS_FS - help - use vm_map_ram/vm_unmap_ram instead of vmap/vunmap. - - If you don't know what these are, say N. - -config EROFS_FAULT_INJECTION - bool "EROFS fault injection facility" - depends on EROFS_FS - help - Test EROFS to inject faults such as ENOMEM, EIO, and so on. - If unsure, say N. - config EROFS_FS_ZIP - bool "EROFS Data Compresssion Support" + bool "EROFS Data Compression Support" depends on EROFS_FS + select LZ4_DECOMPRESS + default y help - Currently we support VLE Compression only. - Play at your own risk. + Enable fixed-sized output compression for EROFS. - If you don't want to use compression feature, say N. + If you don't want to enable compression feature, say N. config EROFS_FS_CLUSTER_PAGE_LIMIT int "EROFS Cluster Pages Hard Limit" @@ -93,49 +82,11 @@ config EROFS_FS_CLUSTER_PAGE_LIMIT range 1 256 default "1" help - Indicates VLE compressed pages hard limit of a - compressed cluster. + Indicates maximum # of pages of a compressed + physical cluster. - For example, if files of a image are compressed - into 8k-unit, the hard limit should not be less - than 2. Otherwise, the image cannot be mounted - correctly on this kernel. - -choice - prompt "EROFS VLE Data Decompression mode" - depends on EROFS_FS_ZIP - default EROFS_FS_ZIP_CACHE_BIPOLAR - help - EROFS supports three options for VLE decompression. - "In-place Decompression Only" consumes the minimum memory - with lowest random read. - - "Bipolar Cached Decompression" consumes the maximum memory - with highest random read. - - If unsure, select "Bipolar Cached Decompression" - -config EROFS_FS_ZIP_NO_CACHE - bool "In-place Decompression Only" - help - Read compressed data into page cache and do in-place - decompression directly. - -config EROFS_FS_ZIP_CACHE_UNIPOLAR - bool "Unipolar Cached Decompression" - help - For each request, it caches the last compressed page - for further reading. - It still decompresses in place for the rest compressed pages. - -config EROFS_FS_ZIP_CACHE_BIPOLAR - bool "Bipolar Cached Decompression" - help - For each request, it caches the both end compressed pages - for further reading. - It still decompresses in place for the rest compressed pages. - - Recommended for performance priority. - -endchoice + For example, if files in a image were compressed + into 8k-unit, hard limit should not be configured + less than 2. Otherwise, the image will be refused + to mount on this kernel. 
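For reference, a minimal sketch of a .config fragment that enables the
backported EROFS with compression on a 4.19 kernel (the chosen values
are illustrative assumptions, not part of this patch):

	CONFIG_EROFS_FS=y
	CONFIG_EROFS_FS_XATTR=y
	CONFIG_EROFS_FS_POSIX_ACL=y
	CONFIG_EROFS_FS_SECURITY=y
	CONFIG_EROFS_FS_ZIP=y
	CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT=1
	# CONFIG_EROFS_FS_DEBUG is not set

With EROFS_FS_ZIP enabled, LZ4_DECOMPRESS is selected automatically
per the Kconfig entry above.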
diff --git a/drivers/staging/erofs/Makefile b/drivers/staging/erofs/Makefile index 9a766eb7ed75..46f2aa4ba46c 100644 --- a/drivers/staging/erofs/Makefile +++ b/drivers/staging/erofs/Makefile @@ -1,13 +1,11 @@ -# SPDX-License-Identifier: GPL-2.0 +# SPDX-License-Identifier: GPL-2.0-only -EROFS_VERSION = "1.0pre1" +EROFS_VERSION = "1.0" -ccflags-y += -Wall -DEROFS_VERSION=\"$(EROFS_VERSION)\" +ccflags-y += -DEROFS_VERSION=\"$(EROFS_VERSION)\" obj-$(CONFIG_EROFS_FS) += erofs.o -# staging requirement: to be self-contained in its own directory -ccflags-y += -I$(src)/include erofs-objs := super.o inode.o data.o namei.o dir.o utils.o erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o -erofs-$(CONFIG_EROFS_FS_ZIP) += unzip_vle.o unzip_lz4.o unzip_vle_lz4.o +erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o diff --git a/drivers/staging/erofs/compress.h b/drivers/staging/erofs/compress.h new file mode 100644 index 000000000000..e9b1aa0afe0e --- /dev/null +++ b/drivers/staging/erofs/compress.h @@ -0,0 +1,58 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2019 HUAWEI, Inc. + * https://www.huawei.com/ + */ +#ifndef __EROFS_FS_COMPRESS_H +#define __EROFS_FS_COMPRESS_H + +#include "internal.h" + +enum { + Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX, + Z_EROFS_COMPRESSION_RUNTIME_MAX +}; + +struct z_erofs_decompress_req { + struct super_block *sb; + struct page **in, **out; + + unsigned short pageofs_out; + unsigned int inputsize, outputsize; + + /* indicate the algorithm will be used for decompression */ + unsigned int alg; + bool inplace_io, partial_decoding; +}; + +/* + * - 0x5A110C8D ('sallocated', Z_EROFS_MAPPING_STAGING) - + * used to mark temporary allocated pages from other + * file/cached pages and NULL mapping pages. + */ +#define Z_EROFS_MAPPING_STAGING ((void *)0x5A110C8D) + +/* check if a page is marked as staging */ +static inline bool z_erofs_page_is_staging(struct page *page) +{ + return page->mapping == Z_EROFS_MAPPING_STAGING; +} + +static inline bool z_erofs_put_stagingpage(struct list_head *pagepool, + struct page *page) +{ + if (!z_erofs_page_is_staging(page)) + return false; + + /* staging pages should not be used by others at the same time */ + if (page_ref_count(page) > 1) + put_page(page); + else + list_add(&page->lru, pagepool); + return true; +} + +int z_erofs_decompress(struct z_erofs_decompress_req *rq, + struct list_head *pagepool); + +#endif diff --git a/drivers/staging/erofs/data.c b/drivers/staging/erofs/data.c index 894e60ecebe2..4f305b68497e 100644 --- a/drivers/staging/erofs/data.c +++ b/drivers/staging/erofs/data.c @@ -1,21 +1,14 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: GPL-2.0-only /* - * linux/drivers/staging/erofs/data.c - * * Copyright (C) 2017-2018 HUAWEI, Inc. - * http://www.huawei.com/ - * Created by Gao Xiang - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of the Linux - * distribution for more details. 
+ * https://www.huawei.com/ */ #include "internal.h" #include #include -static inline void read_endio(struct bio *bio) +static void erofs_readendio(struct bio *bio) { int i; struct bio_vec *bvec; @@ -27,7 +20,7 @@ static inline void read_endio(struct bio *bio) /* page is already locked */ DBG_BUGON(PageUptodate(page)); - if (unlikely(err)) + if (err) SetPageError(page); else SetPageUptodate(page); @@ -38,70 +31,35 @@ static inline void read_endio(struct bio *bio) bio_put(bio); } -/* prio -- true is used for dir */ -struct page *erofs_get_meta_page(struct super_block *sb, - erofs_blk_t blkaddr, bool prio) +struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr) { - struct inode *bd_inode = sb->s_bdev->bd_inode; - struct address_space *mapping = bd_inode->i_mapping; + struct address_space *const mapping = sb->s_bdev->bd_inode->i_mapping; struct page *page; -repeat: - page = find_or_create_page(mapping, blkaddr, - /* - * Prefer looping in the allocator rather than here, - * at least that code knows what it's doing. - */ - mapping_gfp_constraint(mapping, ~__GFP_FS) | __GFP_NOFAIL); - - BUG_ON(!page || !PageLocked(page)); - - if (!PageUptodate(page)) { - struct bio *bio; - int err; - - bio = prepare_bio(sb, blkaddr, 1, read_endio); - err = bio_add_page(bio, page, PAGE_SIZE, 0); - BUG_ON(err != PAGE_SIZE); - - __submit_bio(bio, REQ_OP_READ, - REQ_META | (prio ? REQ_PRIO : 0)); - + page = read_cache_page_gfp(mapping, blkaddr, + mapping_gfp_constraint(mapping, ~__GFP_FS)); + /* should already be PageUptodate */ + if (!IS_ERR(page)) lock_page(page); - - /* the page has been truncated by others? */ - if (unlikely(page->mapping != mapping)) { - unlock_page(page); - put_page(page); - goto repeat; - } - - /* more likely a read error */ - if (unlikely(!PageUptodate(page))) { - unlock_page(page); - put_page(page); - - page = ERR_PTR(-EIO); - } - } return page; } static int erofs_map_blocks_flatmode(struct inode *inode, - struct erofs_map_blocks *map, - int flags) + struct erofs_map_blocks *map, + int flags) { int err = 0; erofs_blk_t nblocks, lastblk; u64 offset = map->m_la; - struct erofs_vnode *vi = EROFS_V(inode); + struct erofs_inode *vi = EROFS_I(inode); + bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE); trace_erofs_map_blocks_flatmode_enter(inode, map, flags); nblocks = DIV_ROUND_UP(inode->i_size, PAGE_SIZE); - lastblk = nblocks - is_inode_layout_inline(inode); + lastblk = nblocks - tailendpacking; - if (unlikely(offset >= inode->i_size)) { + if (offset >= inode->i_size) { /* leave out-of-bound access unmapped */ map->m_flags = 0; map->m_plen = 0; @@ -114,7 +72,7 @@ static int erofs_map_blocks_flatmode(struct inode *inode, if (offset < blknr_to_addr(lastblk)) { map->m_pa = blknr_to_addr(vi->raw_blkaddr) + map->m_la; map->m_plen = blknr_to_addr(lastblk) - offset; - } else if (is_inode_layout_inline(inode)) { + } else if (tailendpacking) { /* 2 - inode inline B: inode, [xattrs], inline last blk... 
*/ struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); @@ -122,17 +80,21 @@ static int erofs_map_blocks_flatmode(struct inode *inode, vi->xattr_isize + erofs_blkoff(map->m_la); map->m_plen = inode->i_size - offset; - /* inline data should locate in one meta block */ + /* inline data should be located in one meta block */ if (erofs_blkoff(map->m_pa) + map->m_plen > PAGE_SIZE) { + erofs_err(inode->i_sb, + "inline data cross block boundary @ nid %llu", + vi->nid); DBG_BUGON(1); - err = -EIO; + err = -EFSCORRUPTED; goto err_out; } map->m_flags |= EROFS_MAP_META; } else { - errln("internal error @ nid: %llu (size %llu), m_la 0x%llx", - vi->nid, inode->i_size, map->m_la); + erofs_err(inode->i_sb, + "internal error @ nid: %llu (size %llu), m_la 0x%llx", + vi->nid, inode->i_size, map->m_la); DBG_BUGON(1); err = -EIO; goto err_out; @@ -146,56 +108,30 @@ static int erofs_map_blocks_flatmode(struct inode *inode, return err; } -#ifdef CONFIG_EROFS_FS_ZIP -extern int z_erofs_map_blocks_iter(struct inode *, - struct erofs_map_blocks *, struct page **, int); -#endif - -int erofs_map_blocks_iter(struct inode *inode, - struct erofs_map_blocks *map, - struct page **mpage_ret, int flags) -{ - /* by default, reading raw data never use erofs_map_blocks_iter */ - if (unlikely(!is_inode_layout_compression(inode))) { - if (*mpage_ret != NULL) - put_page(*mpage_ret); - *mpage_ret = NULL; - - return erofs_map_blocks(inode, map, flags); - } - -#ifdef CONFIG_EROFS_FS_ZIP - return z_erofs_map_blocks_iter(inode, map, mpage_ret, flags); -#else - /* data compression is not available */ - return -ENOTSUPP; -#endif -} - int erofs_map_blocks(struct inode *inode, - struct erofs_map_blocks *map, int flags) + struct erofs_map_blocks *map, int flags) { - if (unlikely(is_inode_layout_compression(inode))) { - struct page *mpage = NULL; - int err; + if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) { + int err = z_erofs_map_blocks_iter(inode, map, flags); - err = erofs_map_blocks_iter(inode, map, &mpage, flags); - if (mpage != NULL) - put_page(mpage); + if (map->mpage) { + put_page(map->mpage); + map->mpage = NULL; + } return err; } return erofs_map_blocks_flatmode(inode, map, flags); } -static inline struct bio *erofs_read_raw_page( - struct bio *bio, - struct address_space *mapping, - struct page *page, - erofs_off_t *last_block, - unsigned nblocks, - bool ra) +static inline struct bio *erofs_read_raw_page(struct bio *bio, + struct address_space *mapping, + struct page *page, + erofs_off_t *last_block, + unsigned int nblocks, + bool ra) { - struct inode *inode = mapping->host; + struct inode *const inode = mapping->host; + struct super_block *const sb = inode->i_sb; erofs_off_t current_block = (erofs_off_t)page->index; int err; @@ -206,34 +142,28 @@ static inline struct bio *erofs_read_raw_page( goto has_updated; } - if (cleancache_get_page(page) == 0) { - err = 0; - SetPageUptodate(page); - goto has_updated; - } - /* note that for readpage case, bio also equals to NULL */ - if (bio != NULL && - /* not continuous */ - *last_block + 1 != current_block) { + if (bio && + /* not continuous */ + *last_block + 1 != current_block) { submit_bio_retry: - __submit_bio(bio, REQ_OP_READ, 0); + submit_bio(bio); bio = NULL; } - if (bio == NULL) { + if (!bio) { struct erofs_map_blocks map = { .m_la = blknr_to_addr(current_block), }; erofs_blk_t blknr; - unsigned blkoff; + unsigned int blkoff; err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); - if (unlikely(err)) + if (err) goto err_out; /* zero out the holed page */ - if 
(unlikely(!(map.m_flags & EROFS_MAP_MAPPED))) { + if (!(map.m_flags & EROFS_MAP_MAPPED)) { zero_user_segment(page, 0, PAGE_SIZE); SetPageUptodate(page); @@ -254,7 +184,7 @@ static inline struct bio *erofs_read_raw_page( DBG_BUGON(map.m_plen > PAGE_SIZE); - ipage = erofs_get_meta_page(inode->i_sb, blknr, 0); + ipage = erofs_get_meta_page(inode->i_sb, blknr); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); @@ -287,7 +217,13 @@ static inline struct bio *erofs_read_raw_page( if (nblocks > BIO_MAX_PAGES) nblocks = BIO_MAX_PAGES; - bio = prepare_bio(inode->i_sb, blknr, nblocks, read_endio); + bio = bio_alloc(GFP_NOIO, nblocks); + + bio->bi_end_io = erofs_readendio; + bio_set_dev(bio, sb->s_bdev); + bio->bi_iter.bi_sector = (sector_t)blknr << + LOG_SECTORS_PER_BLOCK; + bio->bi_opf = REQ_OP_READ | (ra ? REQ_RAHEAD : 0); } err = bio_add_page(bio, page, PAGE_SIZE, 0); @@ -298,7 +234,7 @@ static inline struct bio *erofs_read_raw_page( *last_block = current_block; /* shift in advance in case of it followed by too many gaps */ - if (unlikely(bio->bi_vcnt >= bio->bi_max_vecs)) { + if (bio->bi_iter.bi_size >= bio->bi_max_vecs * PAGE_SIZE) { /* err should reassign to 0 after submitting */ err = 0; goto submit_bio_out; @@ -316,11 +252,10 @@ static inline struct bio *erofs_read_raw_page( unlock_page(page); /* if updated manually, continuous pages has a gap */ - if (bio != NULL) + if (bio) submit_bio_out: - __submit_bio(bio, REQ_OP_READ, 0); - - return unlikely(err) ? ERR_PTR(err) : NULL; + submit_bio(bio); + return err ? ERR_PTR(err) : NULL; } /* @@ -335,7 +270,7 @@ static int erofs_raw_access_readpage(struct file *file, struct page *page) trace_erofs_readpage(page, true); bio = erofs_read_raw_page(NULL, page->mapping, - page, &last_block, 1, false); + page, &last_block, 1, false); if (IS_ERR(bio)) return PTR_ERR(bio); @@ -345,8 +280,9 @@ static int erofs_raw_access_readpage(struct file *file, struct page *page) } static int erofs_raw_access_readpages(struct file *filp, - struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) + struct address_space *mapping, + struct list_head *pages, + unsigned int nr_pages) { erofs_off_t last_block; struct bio *bio = NULL; @@ -363,13 +299,13 @@ static int erofs_raw_access_readpages(struct file *filp, if (!add_to_page_cache_lru(page, mapping, page->index, gfp)) { bio = erofs_read_raw_page(bio, mapping, page, - &last_block, nr_pages, true); + &last_block, nr_pages, true); /* all the page errors are ignored when readahead */ if (IS_ERR(bio)) { pr_err("%s, readahead error at page %lu of nid %llu\n", - __func__, page->index, - EROFS_V(mapping->host)->nid); + __func__, page->index, + EROFS_I(mapping->host)->nid); bio = NULL; } @@ -381,8 +317,28 @@ static int erofs_raw_access_readpages(struct file *filp, DBG_BUGON(!list_empty(pages)); /* the rare case (end in gaps) */ - if (unlikely(bio != NULL)) - __submit_bio(bio, REQ_OP_READ, 0); + if (bio) + submit_bio(bio); + return 0; +} + +static sector_t erofs_bmap(struct address_space *mapping, sector_t block) +{ + struct inode *inode = mapping->host; + struct erofs_map_blocks map = { + .m_la = blknr_to_addr(block), + }; + + if (EROFS_I(inode)->datalayout == EROFS_INODE_FLAT_INLINE) { + erofs_blk_t blks = i_size_read(inode) >> LOG_BLOCK_SIZE; + + if (block >> LOG_SECTORS_PER_BLOCK >= blks) + return 0; + } + + if (!erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW)) + return erofs_blknr(map.m_pa); + return 0; } @@ -390,5 +346,5 @@ static int erofs_raw_access_readpages(struct file *filp, const struct 
address_space_operations erofs_raw_access_aops = { .readpage = erofs_raw_access_readpage, .readpages = erofs_raw_access_readpages, + .bmap = erofs_bmap, }; - diff --git a/drivers/staging/erofs/decompressor.c b/drivers/staging/erofs/decompressor.c new file mode 100644 index 000000000000..7822ab4a8262 --- /dev/null +++ b/drivers/staging/erofs/decompressor.c @@ -0,0 +1,344 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2019 HUAWEI, Inc. + * https://www.huawei.com/ + */ +#include "compress.h" +#include +#include + +#ifndef LZ4_DISTANCE_MAX /* history window size */ +#define LZ4_DISTANCE_MAX 65535 /* set to maximum value by default */ +#endif + +#define LZ4_MAX_DISTANCE_PAGES (DIV_ROUND_UP(LZ4_DISTANCE_MAX, PAGE_SIZE) + 1) +#ifndef LZ4_DECOMPRESS_INPLACE_MARGIN +#define LZ4_DECOMPRESS_INPLACE_MARGIN(srcsize) (((srcsize) >> 8) + 32) +#endif + +struct z_erofs_decompressor { + /* + * if destpages have sparsed pages, fill them with bounce pages. + * it also check whether destpages indicate continuous physical memory. + */ + int (*prepare_destpages)(struct z_erofs_decompress_req *rq, + struct list_head *pagepool); + int (*decompress)(struct z_erofs_decompress_req *rq, u8 *out); + char *name; +}; + +static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, + struct list_head *pagepool) +{ + const unsigned int nr = + PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL }; + unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES, + BITS_PER_LONG)] = { 0 }; + void *kaddr = NULL; + unsigned int i, j, top; + + top = 0; + for (i = j = 0; i < nr; ++i, ++j) { + struct page *const page = rq->out[i]; + struct page *victim; + + if (j >= LZ4_MAX_DISTANCE_PAGES) + j = 0; + + /* 'valid' bounced can only be tested after a complete round */ + if (test_bit(j, bounced)) { + DBG_BUGON(i < LZ4_MAX_DISTANCE_PAGES); + DBG_BUGON(top >= LZ4_MAX_DISTANCE_PAGES); + availables[top++] = rq->out[i - LZ4_MAX_DISTANCE_PAGES]; + } + + if (page) { + __clear_bit(j, bounced); + if (kaddr) { + if (kaddr + PAGE_SIZE == page_address(page)) + kaddr += PAGE_SIZE; + else + kaddr = NULL; + } else if (!i) { + kaddr = page_address(page); + } + continue; + } + kaddr = NULL; + __set_bit(j, bounced); + + if (top) { + victim = availables[--top]; + get_page(victim); + } else { + victim = erofs_allocpage(pagepool, GFP_KERNEL); + if (!victim) + return -ENOMEM; + victim->mapping = Z_EROFS_MAPPING_STAGING; + } + rq->out[i] = victim; + } + return kaddr ? 1 : 0; +} + +static void *generic_copy_inplace_data(struct z_erofs_decompress_req *rq, + u8 *src, unsigned int pageofs_in) +{ + /* + * if in-place decompression is ongoing, those decompressed + * pages should be copied in order to avoid being overlapped. 
+ */ + struct page **in = rq->in; + u8 *const tmp = erofs_get_pcpubuf(0); + u8 *tmpp = tmp; + unsigned int inlen = rq->inputsize - pageofs_in; + unsigned int count = min_t(uint, inlen, PAGE_SIZE - pageofs_in); + + while (tmpp < tmp + inlen) { + if (!src) + src = kmap_atomic(*in); + memcpy(tmpp, src + pageofs_in, count); + kunmap_atomic(src); + src = NULL; + tmpp += count; + pageofs_in = 0; + count = PAGE_SIZE; + ++in; + } + return tmp; +} + +static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) +{ + unsigned int inputmargin, inlen; + u8 *src; + bool copied, support_0padding; + int ret; + + if (rq->inputsize > PAGE_SIZE) + return -EOPNOTSUPP; + + src = kmap_atomic(*rq->in); + inputmargin = 0; + support_0padding = false; + + /* decompression inplace is only safe when 0padding is enabled */ + if (EROFS_SB(rq->sb)->feature_incompat & + EROFS_FEATURE_INCOMPAT_LZ4_0PADDING) { + support_0padding = true; + + while (!src[inputmargin & ~PAGE_MASK]) + if (!(++inputmargin & ~PAGE_MASK)) + break; + + if (inputmargin >= rq->inputsize) { + kunmap_atomic(src); + return -EIO; + } + } + + copied = false; + inlen = rq->inputsize - inputmargin; + if (rq->inplace_io) { + const uint oend = (rq->pageofs_out + + rq->outputsize) & ~PAGE_MASK; + const uint nr = PAGE_ALIGN(rq->pageofs_out + + rq->outputsize) >> PAGE_SHIFT; + + if (rq->partial_decoding || !support_0padding || + rq->out[nr - 1] != rq->in[0] || + rq->inputsize - oend < + LZ4_DECOMPRESS_INPLACE_MARGIN(inlen)) { + src = generic_copy_inplace_data(rq, src, inputmargin); + inputmargin = 0; + copied = true; + } + } + + /* legacy format could compress extra data in a pcluster. */ + if (rq->partial_decoding || !support_0padding) + ret = LZ4_decompress_safe_partial(src + inputmargin, out, + inlen, rq->outputsize, + rq->outputsize); + else + ret = LZ4_decompress_safe(src + inputmargin, out, + inlen, rq->outputsize); + + if (ret != rq->outputsize) { + erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]", + ret, inlen, inputmargin, rq->outputsize); + + WARN_ON(1); + print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET, + 16, 1, src + inputmargin, inlen, true); + print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET, + 16, 1, out, rq->outputsize, true); + + if (ret >= 0) + memset(out + ret, 0, rq->outputsize - ret); + ret = -EIO; + } + + if (copied) + erofs_put_pcpubuf(src); + else + kunmap_atomic(src); + return ret; +} + +static struct z_erofs_decompressor decompressors[] = { + [Z_EROFS_COMPRESSION_SHIFTED] = { + .name = "shifted" + }, + [Z_EROFS_COMPRESSION_LZ4] = { + .prepare_destpages = z_erofs_lz4_prepare_destpages, + .decompress = z_erofs_lz4_decompress, + .name = "lz4" + }, +}; + +static void copy_from_pcpubuf(struct page **out, const char *dst, + unsigned short pageofs_out, + unsigned int outputsize) +{ + const char *end = dst + outputsize; + const unsigned int righthalf = PAGE_SIZE - pageofs_out; + const char *cur = dst - pageofs_out; + + while (cur < end) { + struct page *const page = *out++; + + if (page) { + char *buf = kmap_atomic(page); + + if (cur >= dst) { + memcpy(buf, cur, min_t(uint, PAGE_SIZE, + end - cur)); + } else { + memcpy(buf + pageofs_out, cur + pageofs_out, + min_t(uint, righthalf, end - cur)); + } + kunmap_atomic(buf); + } + cur += PAGE_SIZE; + } +} + +static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq, + struct list_head *pagepool) +{ + const unsigned int nrpages_out = + PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + const struct z_erofs_decompressor *alg = 
decompressors + rq->alg; + unsigned int dst_maptype; + void *dst; + int ret, i; + + if (nrpages_out == 1 && !rq->inplace_io) { + DBG_BUGON(!*rq->out); + dst = kmap_atomic(*rq->out); + dst_maptype = 0; + goto dstmap_out; + } + + /* + * For the case of small output size (especially much less + * than PAGE_SIZE), memcpy the decompressed data rather than + * compressed data is preferred. + */ + if (rq->outputsize <= PAGE_SIZE * 7 / 8) { + dst = erofs_get_pcpubuf(0); + if (IS_ERR(dst)) + return PTR_ERR(dst); + + rq->inplace_io = false; + ret = alg->decompress(rq, dst); + if (!ret) + copy_from_pcpubuf(rq->out, dst, rq->pageofs_out, + rq->outputsize); + + erofs_put_pcpubuf(dst); + return ret; + } + + ret = alg->prepare_destpages(rq, pagepool); + if (ret < 0) { + return ret; + } else if (ret) { + dst = page_address(*rq->out); + dst_maptype = 1; + goto dstmap_out; + } + + i = 0; + while (1) { + dst = vm_map_ram(rq->out, nrpages_out, -1, PAGE_KERNEL); + + /* retry two more times (totally 3 times) */ + if (dst || ++i >= 3) + break; + vm_unmap_aliases(); + } + + if (!dst) + return -ENOMEM; + + dst_maptype = 2; + +dstmap_out: + ret = alg->decompress(rq, dst + rq->pageofs_out); + + if (!dst_maptype) + kunmap_atomic(dst); + else if (dst_maptype == 2) + vm_unmap_ram(dst, nrpages_out); + return ret; +} + +static int z_erofs_shifted_transform(const struct z_erofs_decompress_req *rq, + struct list_head *pagepool) +{ + const unsigned int nrpages_out = + PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + const unsigned int righthalf = PAGE_SIZE - rq->pageofs_out; + unsigned char *src, *dst; + + if (nrpages_out > 2) { + DBG_BUGON(1); + return -EIO; + } + + if (rq->out[0] == *rq->in) { + DBG_BUGON(nrpages_out != 1); + return 0; + } + + src = kmap_atomic(*rq->in); + if (rq->out[0]) { + dst = kmap_atomic(rq->out[0]); + memcpy(dst + rq->pageofs_out, src, righthalf); + kunmap_atomic(dst); + } + + if (nrpages_out == 2) { + DBG_BUGON(!rq->out[1]); + if (rq->out[1] == *rq->in) { + memmove(src, src + righthalf, rq->pageofs_out); + } else { + dst = kmap_atomic(rq->out[1]); + memcpy(dst, src + righthalf, rq->pageofs_out); + kunmap_atomic(dst); + } + } + kunmap_atomic(src); + return 0; +} + +int z_erofs_decompress(struct z_erofs_decompress_req *rq, + struct list_head *pagepool) +{ + if (rq->alg == Z_EROFS_COMPRESSION_SHIFTED) + return z_erofs_shifted_transform(rq, pagepool); + return z_erofs_decompress_generic(rq, pagepool); +} diff --git a/drivers/staging/erofs/dir.c b/drivers/staging/erofs/dir.c index fe6683effd05..722b653c7a77 100644 --- a/drivers/staging/erofs/dir.c +++ b/drivers/staging/erofs/dir.c @@ -1,14 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: GPL-2.0-only /* - * linux/drivers/staging/erofs/dir.c - * * Copyright (C) 2017-2018 HUAWEI, Inc. - * http://www.huawei.com/ - * Created by Gao Xiang - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of the Linux - * distribution for more details. 
+ * https://www.huawei.com/ */ #include "internal.h" @@ -33,19 +26,18 @@ static void debug_one_dentry(unsigned char d_type, const char *de_name, memcpy(dbg_namebuf, de_name, de_namelen); dbg_namebuf[de_namelen] = '\0'; - debugln("found dirent %s de_len %u d_type %d", dbg_namebuf, - de_namelen, d_type); + erofs_dbg("found dirent %s de_len %u d_type %d", dbg_namebuf, + de_namelen, d_type); #endif } -static int erofs_fill_dentries(struct dir_context *ctx, - void *dentry_blk, unsigned *ofs, - unsigned nameoff, unsigned maxsize) +static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, + void *dentry_blk, unsigned int *ofs, + unsigned int nameoff, unsigned int maxsize) { - struct erofs_dirent *de = dentry_blk; + struct erofs_dirent *de = dentry_blk + *ofs; const struct erofs_dirent *end = dentry_blk + nameoff; - de = dentry_blk + *ofs; while (de < end) { const char *de_name; unsigned int de_namelen; @@ -66,16 +58,18 @@ static int erofs_fill_dentries(struct dir_context *ctx, de_namelen = le16_to_cpu(de[1].nameoff) - nameoff; /* a corrupted entry is found */ - if (unlikely(nameoff + de_namelen > maxsize || - de_namelen > EROFS_NAME_LEN)) { + if (nameoff + de_namelen > maxsize || + de_namelen > EROFS_NAME_LEN) { + erofs_err(dir->i_sb, "bogus dirent @ nid %llu", + EROFS_I(dir)->nid); DBG_BUGON(1); - return -EIO; + return -EFSCORRUPTED; } debug_one_dentry(d_type, de_name, de_namelen); if (!dir_emit(ctx, de_name, de_namelen, - le64_to_cpu(de->nid), d_type)) - /* stoped by some reason */ + le64_to_cpu(de->nid), d_type)) + /* stopped by some reason */ return 1; ++de; *ofs += sizeof(struct erofs_dirent); @@ -89,62 +83,63 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) struct inode *dir = file_inode(f); struct address_space *mapping = dir->i_mapping; const size_t dirsize = i_size_read(dir); - unsigned i = ctx->pos / EROFS_BLKSIZ; - unsigned ofs = ctx->pos % EROFS_BLKSIZ; + unsigned int i = ctx->pos / EROFS_BLKSIZ; + unsigned int ofs = ctx->pos % EROFS_BLKSIZ; int err = 0; bool initial = true; while (ctx->pos < dirsize) { struct page *dentry_page; struct erofs_dirent *de; - unsigned nameoff, maxsize; + unsigned int nameoff, maxsize; dentry_page = read_mapping_page(mapping, i, NULL); if (dentry_page == ERR_PTR(-ENOMEM)) { err = -ENOMEM; break; } else if (IS_ERR(dentry_page)) { - errln("fail to readdir of logical block %u of nid %llu", - i, EROFS_V(dir)->nid); - err = PTR_ERR(dentry_page); + erofs_err(dir->i_sb, + "fail to readdir of logical block %u of nid %llu", + i, EROFS_I(dir)->nid); + err = -EFSCORRUPTED; break; } - lock_page(dentry_page); de = (struct erofs_dirent *)kmap(dentry_page); nameoff = le16_to_cpu(de->nameoff); - if (unlikely(nameoff < sizeof(struct erofs_dirent) || - nameoff >= PAGE_SIZE)) { - errln("%s, invalid de[0].nameoff %u", - __func__, nameoff); - - err = -EIO; + if (nameoff < sizeof(struct erofs_dirent) || + nameoff >= PAGE_SIZE) { + erofs_err(dir->i_sb, + "invalid de[0].nameoff %u @ nid %llu", + nameoff, EROFS_I(dir)->nid); + err = -EFSCORRUPTED; goto skip_this; } - maxsize = min_t(unsigned, dirsize - ctx->pos + ofs, PAGE_SIZE); + maxsize = min_t(unsigned int, + dirsize - ctx->pos + ofs, PAGE_SIZE); /* search dirents at the arbitrary position */ - if (unlikely(initial)) { + if (initial) { initial = false; ofs = roundup(ofs, sizeof(struct erofs_dirent)); - if (unlikely(ofs >= nameoff)) + if (ofs >= nameoff) goto skip_this; } - err = erofs_fill_dentries(ctx, de, &ofs, nameoff, maxsize); + err = erofs_fill_dentries(dir, ctx, de, &ofs, + nameoff, 
maxsize); skip_this: kunmap(dentry_page); - unlock_page(dentry_page); put_page(dentry_page); ctx->pos = blknr_to_addr(i) + ofs; - if (unlikely(err)) + if (err) break; ++i; ofs = 0; @@ -155,6 +150,5 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) const struct file_operations erofs_dir_fops = { .llseek = generic_file_llseek, .read = generic_read_dir, - .iterate = erofs_readdir, + .iterate_shared = erofs_readdir, }; - diff --git a/drivers/staging/erofs/erofs_fs.h b/drivers/staging/erofs/erofs_fs.h index d8838ce66941..011c3e281f7f 100644 --- a/drivers/staging/erofs/erofs_fs.h +++ b/drivers/staging/erofs/erofs_fs.h @@ -1,115 +1,93 @@ -/* SPDX-License-Identifier: GPL-2.0 OR Apache-2.0 - * - * linux/drivers/staging/erofs/erofs_fs.h +/* SPDX-License-Identifier: GPL-2.0-only OR Apache-2.0 */ +/* + * EROFS (Enhanced ROM File System) on-disk format definition * * Copyright (C) 2017-2018 HUAWEI, Inc. - * http://www.huawei.com/ - * Created by Gao Xiang - * - * This file is dual-licensed; you may select either the GNU General Public - * License version 2 or Apache License, Version 2.0. See the file COPYING - * in the main directory of the Linux distribution for more details. + * https://www.huawei.com/ */ #ifndef __EROFS_FS_H #define __EROFS_FS_H -/* Enhanced(Extended) ROM File System */ -#define EROFS_SUPER_MAGIC_V1 0xE0F5E1E2 +#define EROFS_SUPER_MAGIC_V1 0xE0F5E1E2 #define EROFS_SUPER_OFFSET 1024 +#define EROFS_FEATURE_COMPAT_SB_CHKSUM 0x00000001 + /* - * Any bits that aren't in EROFS_ALL_REQUIREMENTS should be - * incompatible with this kernel version. + * Any bits that aren't in EROFS_ALL_FEATURE_INCOMPAT should + * be incompatible with this kernel version. */ -#define EROFS_ALL_REQUIREMENTS 0 +#define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING 0x00000001 +#define EROFS_ALL_FEATURE_INCOMPAT EROFS_FEATURE_INCOMPAT_LZ4_0PADDING +/* 128-byte erofs on-disk super block */ struct erofs_super_block { -/* 0 */__le32 magic; /* in the little endian */ -/* 4 */__le32 checksum; /* crc32c(super_block) */ -/* 8 */__le32 features; /* (aka. feature_compat) */ -/* 12 */__u8 blkszbits; /* support block_size == PAGE_SIZE only */ -/* 13 */__u8 reserved; + __le32 magic; /* file system magic number */ + __le32 checksum; /* crc32c(super_block) */ + __le32 feature_compat; + __u8 blkszbits; /* support block_size == PAGE_SIZE only */ + __u8 reserved; -/* 14 */__le16 root_nid; -/* 16 */__le64 inos; /* total valid ino # (== f_files - f_favail) */ + __le16 root_nid; /* nid of root directory */ + __le64 inos; /* total valid ino # (== f_files - f_favail) */ -/* 24 */__le64 build_time; /* inode v1 time derivation */ -/* 32 */__le32 build_time_nsec; -/* 36 */__le32 blocks; /* used for statfs */ -/* 40 */__le32 meta_blkaddr; -/* 44 */__le32 xattr_blkaddr; -/* 48 */__u8 uuid[16]; /* 128-bit uuid for volume */ -/* 64 */__u8 volume_name[16]; /* volume name */ -/* 80 */__le32 requirements; /* (aka. 
feature_incompat) */ - -/* 84 */__u8 reserved2[44]; -} __packed; /* 128 bytes */ - -#define __EROFS_BIT(_prefix, _cur, _pre) enum { \ - _prefix ## _cur ## _BIT = _prefix ## _pre ## _BIT + \ - _prefix ## _pre ## _BITS } + __le64 build_time; /* inode v1 time derivation */ + __le32 build_time_nsec; /* inode v1 time derivation in nano scale */ + __le32 blocks; /* used for statfs */ + __le32 meta_blkaddr; /* start block address of metadata area */ + __le32 xattr_blkaddr; /* start block address of shared xattr area */ + __u8 uuid[16]; /* 128-bit uuid for volume */ + __u8 volume_name[16]; /* volume name */ + __le32 feature_incompat; + __u8 reserved2[44]; +}; /* - * erofs inode data mapping: + * erofs inode datalayout (i_format in on-disk inode): * 0 - inode plain without inline data A: * inode, [xattrs], ... | ... | no-holed data - * 1 - inode VLE compression B: + * 1 - inode VLE compression B (legacy): * inode, [xattrs], extents ... | ... * 2 - inode plain with inline data C: * inode, [xattrs], last_inline_data, ... | ... | no-holed data - * 3~7 - reserved + * 3 - inode compression D: + * inode, [xattrs], map_header, extents ... | ... + * 4~7 - reserved */ enum { - EROFS_INODE_LAYOUT_PLAIN, - EROFS_INODE_LAYOUT_COMPRESSION, - EROFS_INODE_LAYOUT_INLINE, - EROFS_INODE_LAYOUT_MAX + EROFS_INODE_FLAT_PLAIN = 0, + EROFS_INODE_FLAT_COMPRESSION_LEGACY = 1, + EROFS_INODE_FLAT_INLINE = 2, + EROFS_INODE_FLAT_COMPRESSION = 3, + EROFS_INODE_DATALAYOUT_MAX }; + +static inline bool erofs_inode_is_data_compressed(unsigned int datamode) +{ + return datamode == EROFS_INODE_FLAT_COMPRESSION || + datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY; +} + +/* bit definitions of inode i_advise */ #define EROFS_I_VERSION_BITS 1 -#define EROFS_I_DATA_MAPPING_BITS 3 +#define EROFS_I_DATALAYOUT_BITS 3 #define EROFS_I_VERSION_BIT 0 -__EROFS_BIT(EROFS_I_, DATA_MAPPING, VERSION); +#define EROFS_I_DATALAYOUT_BIT 1 #define EROFS_I_ALL \ - ((1 << (EROFS_I_DATA_MAPPING_BIT + EROFS_I_DATA_MAPPING_BITS)) - 1) + ((1 << (EROFS_I_DATALAYOUT_BIT + EROFS_I_DATALAYOUT_BITS)) - 1) -struct erofs_inode_v1 { -/* 0 */__le16 i_advise; +/* 32-byte reduced form of an ondisk inode */ +struct erofs_inode_compact { + __le16 i_format; /* inode format hints */ /* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ -/* 2 */__le16 i_xattr_icount; -/* 4 */__le16 i_mode; -/* 6 */__le16 i_nlink; -/* 8 */__le32 i_size; -/* 12 */__le32 i_reserved; -/* 16 */union { - /* file total compressed blocks for data mapping 1 */ - __le32 compressed_blocks; - __le32 raw_blkaddr; - - /* for device files, used to indicate old/new device # */ - __le32 rdev; - } i_u __packed; -/* 20 */__le32 i_ino; /* only used for 32-bit stat compatibility */ -/* 24 */__le16 i_uid; -/* 26 */__le16 i_gid; -/* 28 */__le32 i_checksum; -} __packed; - -/* 32 bytes on-disk inode */ -#define EROFS_INODE_LAYOUT_V1 0 -/* 64 bytes on-disk inode */ -#define EROFS_INODE_LAYOUT_V2 1 - -struct erofs_inode_v2 { - __le16 i_advise; - - /* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ __le16 i_xattr_icount; __le16 i_mode; - __le16 i_reserved; /* 8 bytes */ - __le64 i_size; /* 16 bytes */ + __le16 i_nlink; + __le32 i_size; + __le32 i_reserved; union { /* file total compressed blocks for data mapping 1 */ __le32 compressed_blocks; @@ -117,19 +95,46 @@ struct erofs_inode_v2 { /* for device files, used to indicate old/new device # */ __le32 rdev; - } i_u __packed; + } i_u; + __le32 i_ino; /* only used for 32-bit stat compatibility */ + __le16 i_uid; + __le16 i_gid; + __le32 i_reserved2; +}; 
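+
+/*
+ * Worked example (illustrative only, not part of the format
+ * definition): i_format packs the on-disk inode version in bit 0 and
+ * the datalayout in bits 1-3, so a compact inode carrying inline data
+ * encodes i_format ==
+ * (EROFS_INODE_FLAT_INLINE << EROFS_I_DATALAYOUT_BIT) | 0 == 4,
+ * while an extended plain inode encodes
+ * (EROFS_INODE_FLAT_PLAIN << EROFS_I_DATALAYOUT_BIT) | 1 == 1.
+ */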
+ +/* 32 bytes on-disk inode */ +#define EROFS_INODE_LAYOUT_COMPACT 0 +/* 64 bytes on-disk inode */ +#define EROFS_INODE_LAYOUT_EXTENDED 1 + +/* 64-byte complete form of an ondisk inode */ +struct erofs_inode_extended { + __le16 i_format; /* inode format hints */ + +/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ + __le16 i_xattr_icount; + __le16 i_mode; + __le16 i_reserved; + __le64 i_size; + union { + /* file total compressed blocks for data mapping 1 */ + __le32 compressed_blocks; + __le32 raw_blkaddr; + + /* for device files, used to indicate old/new device # */ + __le32 rdev; + } i_u; /* only used for 32-bit stat compatibility */ - __le32 i_ino; /* 24 bytes */ + __le32 i_ino; __le32 i_uid; __le32 i_gid; - __le64 i_ctime; /* 32 bytes */ + __le64 i_ctime; __le32 i_ctime_nsec; __le32 i_nlink; - __u8 i_reserved2[12]; - __le32 i_checksum; /* 64 bytes */ -} __packed; + __u8 i_reserved2[16]; +}; #define EROFS_MAX_SHARED_XATTRS (128) /* h_shared_count between 129 ... 255 are special # */ @@ -147,11 +152,11 @@ struct erofs_inode_v2 { * for read-only fs, no need to introduce h_refcount */ struct erofs_xattr_ibody_header { - __le32 h_checksum; + __le32 h_reserved; __u8 h_shared_count; - __u8 h_reserved[7]; + __u8 h_reserved2[7]; __le32 h_shared_xattrs[0]; /* shared xattr id array */ -} __packed; +}; /* Name indexes */ #define EROFS_XATTR_INDEX_USER 1 @@ -168,27 +173,61 @@ struct erofs_xattr_entry { __le16 e_value_size; /* size of attribute value */ /* followed by e_name and e_value */ char e_name[0]; /* attribute name */ -} __packed; +}; -#define ondisk_xattr_ibody_size(count) ({\ - u32 __count = le16_to_cpu(count); \ - ((__count) == 0) ? 0 : \ - sizeof(struct erofs_xattr_ibody_header) + \ - sizeof(__u32) * ((__count) - 1); }) +static inline unsigned int erofs_xattr_ibody_size(__le16 i_xattr_icount) +{ + if (!i_xattr_icount) + return 0; + + return sizeof(struct erofs_xattr_ibody_header) + + sizeof(__u32) * (le16_to_cpu(i_xattr_icount) - 1); +} #define EROFS_XATTR_ALIGN(size) round_up(size, sizeof(struct erofs_xattr_entry)) -#define EROFS_XATTR_ENTRY_SIZE(entry) EROFS_XATTR_ALIGN( \ - sizeof(struct erofs_xattr_entry) + \ - (entry)->e_name_len + le16_to_cpu((entry)->e_value_size)) -/* have to be aligned with 8 bytes on disk */ -struct erofs_extent_header { - __le32 eh_checksum; - __le32 eh_reserved[3]; -} __packed; +static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e) +{ + return EROFS_XATTR_ALIGN(sizeof(struct erofs_xattr_entry) + + e->e_name_len + le16_to_cpu(e->e_value_size)); +} + +/* available compression algorithm types (for h_algorithmtype) */ +enum { + Z_EROFS_COMPRESSION_LZ4 = 0, + Z_EROFS_COMPRESSION_MAX +}; /* - * Z_EROFS Variable-sized Logical Extent cluster type: + * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on) + * e.g. for 4k logical cluster size, 4B if compacted 2B is off; + * (4B) + 2B + (4B) if compacted 2B is on. + */ +#define Z_EROFS_ADVISE_COMPACTED_2B_BIT 0 + +#define Z_EROFS_ADVISE_COMPACTED_2B (1 << Z_EROFS_ADVISE_COMPACTED_2B_BIT) + +struct z_erofs_map_header { + __le32 h_reserved1; + __le16 h_advise; + /* + * bit 0-3 : algorithm type of head 1 (logical cluster type 01); + * bit 4-7 : algorithm type of head 2 (logical cluster type 11). + */ + __u8 h_algorithmtype; + /* + * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096; + * bit 3-4 : (physical - logical) cluster bits of head 1: + * For example, if logical clustersize = 4096, 1 for 8192. + * bit 5-7 : (physical - logical) cluster bits of head 2. 
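+ * e.g. a 4096-byte logical cluster with a 16384-byte physical
+ * cluster for head 1 encodes bits 0-2 = 0 and bits 3-4 = 2
+ * (illustration following the rules above).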
+ */ + __u8 h_clusterbits; +}; + +#define Z_EROFS_VLE_LEGACY_HEADER_PADDING 8 + +/* + * Fixed-sized output compression ondisk Logical Extent cluster type: * 0 - literal (uncompressed) cluster * 1 - compressed cluster (for the head logical cluster) * 2 - compressed cluster (for the other logical clusters) @@ -212,6 +251,14 @@ struct erofs_extent_header { * di_u.delta[1] = distance to its corresponding tail cluster * (di_advise could be 0, 1 or 2) */ +enum { + Z_EROFS_VLE_CLUSTER_TYPE_PLAIN = 0, + Z_EROFS_VLE_CLUSTER_TYPE_HEAD = 1, + Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD = 2, + Z_EROFS_VLE_CLUSTER_TYPE_RESERVED = 3, + Z_EROFS_VLE_CLUSTER_TYPE_MAX +}; + #define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS 2 #define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT 0 @@ -230,18 +277,19 @@ struct z_erofs_vle_decompressed_index { * [1] - pointing to the tail cluster */ __le16 delta[2]; - } di_u __packed; /* 8 bytes */ -} __packed; + } di_u; +}; -#define Z_EROFS_VLE_EXTENT_ALIGN(size) round_up(size, \ - sizeof(struct z_erofs_vle_decompressed_index)) +#define Z_EROFS_VLE_LEGACY_INDEX_ALIGN(size) \ + (round_up(size, sizeof(struct z_erofs_vle_decompressed_index)) + \ + sizeof(struct z_erofs_map_header) + Z_EROFS_VLE_LEGACY_HEADER_PADDING) /* dirent sorts in alphabet order, thus we can do binary search */ struct erofs_dirent { - __le64 nid; /* 0, node number */ - __le16 nameoff; /* 8, start offset of file name */ - __u8 file_type; /* 10, file type */ - __u8 reserved; /* 11, reserved */ + __le64 nid; /* node number */ + __le16 nameoff; /* start offset of file name */ + __u8 file_type; /* file type */ + __u8 reserved; /* reserved */ } __packed; /* file types used in inode_info->flags */ @@ -263,14 +311,16 @@ enum { static inline void erofs_check_ondisk_layout_definitions(void) { BUILD_BUG_ON(sizeof(struct erofs_super_block) != 128); - BUILD_BUG_ON(sizeof(struct erofs_inode_v1) != 32); - BUILD_BUG_ON(sizeof(struct erofs_inode_v2) != 64); + BUILD_BUG_ON(sizeof(struct erofs_inode_compact) != 32); + BUILD_BUG_ON(sizeof(struct erofs_inode_extended) != 64); BUILD_BUG_ON(sizeof(struct erofs_xattr_ibody_header) != 12); BUILD_BUG_ON(sizeof(struct erofs_xattr_entry) != 4); - BUILD_BUG_ON(sizeof(struct erofs_extent_header) != 16); + BUILD_BUG_ON(sizeof(struct z_erofs_map_header) != 8); BUILD_BUG_ON(sizeof(struct z_erofs_vle_decompressed_index) != 8); BUILD_BUG_ON(sizeof(struct erofs_dirent) != 12); + + BUILD_BUG_ON(BIT(Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) < + Z_EROFS_VLE_CLUSTER_TYPE_MAX - 1); } #endif - diff --git a/drivers/staging/erofs/inode.c b/drivers/staging/erofs/inode.c index 02398c7eb4a4..5f9c6fb79ac0 100644 --- a/drivers/staging/erofs/inode.c +++ b/drivers/staging/erofs/inode.c @@ -1,14 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: GPL-2.0-only /* - * linux/drivers/staging/erofs/inode.c - * * Copyright (C) 2017-2018 HUAWEI, Inc. - * http://www.huawei.com/ - * Created by Gao Xiang - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of the Linux - * distribution for more details. + * https://www.huawei.com/ */ #include "xattr.h" @@ -19,56 +12,59 @@ * the inode payload page if it's an extended inode) in order to fill * inline data if possible. 
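 * (If an extended inode straddles a block boundary, its head is first
 * copied out, the next meta page is read, and the tail is stitched
 * into a temporary buffer before parsing; see the 'copied' path below.)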
*/ -static struct page *read_inode(struct inode *inode, unsigned int *ofs) +static struct page *erofs_read_inode(struct inode *inode, + unsigned int *ofs) { struct super_block *sb = inode->i_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); - struct erofs_vnode *vi = EROFS_V(inode); + struct erofs_inode *vi = EROFS_I(inode); const erofs_off_t inode_loc = iloc(sbi, vi->nid); - erofs_blk_t blkaddr; + + erofs_blk_t blkaddr, nblks = 0; struct page *page; - struct erofs_inode_v1 *v1; - struct erofs_inode_v2 *v2, *copied = NULL; + struct erofs_inode_compact *dic; + struct erofs_inode_extended *die, *copied = NULL; unsigned int ifmt; int err; blkaddr = erofs_blknr(inode_loc); *ofs = erofs_blkoff(inode_loc); - debugln("%s, reading inode nid %llu at %u of blkaddr %u", - __func__, vi->nid, *ofs, blkaddr); + erofs_dbg("%s, reading inode nid %llu at %u of blkaddr %u", + __func__, vi->nid, *ofs, blkaddr); - page = erofs_get_meta_page(sb, blkaddr, false); + page = erofs_get_meta_page(sb, blkaddr); if (IS_ERR(page)) { - errln("failed to get inode (nid: %llu) page, err %ld", - vi->nid, PTR_ERR(page)); + erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld", + vi->nid, PTR_ERR(page)); return page; } - v1 = page_address(page) + *ofs; - ifmt = le16_to_cpu(v1->i_advise); + dic = page_address(page) + *ofs; + ifmt = le16_to_cpu(dic->i_format); if (ifmt & ~EROFS_I_ALL) { - errln("unsupported i_format %u of nid %llu", ifmt, vi->nid); + erofs_err(inode->i_sb, "unsupported i_format %u of nid %llu", + ifmt, vi->nid); err = -EOPNOTSUPP; goto err_out; } - vi->data_mapping_mode = __inode_data_mapping(ifmt); - if (unlikely(vi->data_mapping_mode >= EROFS_INODE_LAYOUT_MAX)) { - errln("unknown data mapping mode %u of nid %llu", - vi->data_mapping_mode, vi->nid); + vi->datalayout = erofs_inode_datalayout(ifmt); + if (vi->datalayout >= EROFS_INODE_DATALAYOUT_MAX) { + erofs_err(inode->i_sb, "unsupported datalayout %u of nid %llu", + vi->datalayout, vi->nid); err = -EOPNOTSUPP; goto err_out; } - switch (__inode_version(ifmt)) { - case EROFS_INODE_LAYOUT_V2: - vi->inode_isize = sizeof(struct erofs_inode_v2); + switch (erofs_inode_version(ifmt)) { + case EROFS_INODE_LAYOUT_EXTENDED: + vi->inode_isize = sizeof(struct erofs_inode_extended); /* check if the inode acrosses page boundary */ if (*ofs + vi->inode_isize <= PAGE_SIZE) { *ofs += vi->inode_isize; - v2 = (struct erofs_inode_v2 *)v1; + die = (struct erofs_inode_extended *)dic; } else { const unsigned int gotten = PAGE_SIZE - *ofs; @@ -77,78 +73,98 @@ static struct page *read_inode(struct inode *inode, unsigned int *ofs) err = -ENOMEM; goto err_out; } - memcpy(copied, v1, gotten); + memcpy(copied, dic, gotten); unlock_page(page); put_page(page); - page = erofs_get_meta_page(sb, blkaddr + 1, false); + page = erofs_get_meta_page(sb, blkaddr + 1); if (IS_ERR(page)) { - errln("failed to get inode payload page (nid: %llu), err %ld", - vi->nid, PTR_ERR(page)); + erofs_err(sb, "failed to get inode payload page (nid: %llu), err %ld", + vi->nid, PTR_ERR(page)); kfree(copied); return page; } *ofs = vi->inode_isize - gotten; memcpy((u8 *)copied + gotten, page_address(page), *ofs); - v2 = copied; + die = copied; } - vi->xattr_isize = ondisk_xattr_ibody_size(v2->i_xattr_icount); + vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount); - inode->i_mode = le16_to_cpu(v2->i_mode); - if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode)) { - vi->raw_blkaddr = le32_to_cpu(v2->i_u.raw_blkaddr); - } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) 
{ + inode->i_mode = le16_to_cpu(die->i_mode); + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + vi->raw_blkaddr = le32_to_cpu(die->i_u.raw_blkaddr); + break; + case S_IFCHR: + case S_IFBLK: inode->i_rdev = - new_decode_dev(le32_to_cpu(v2->i_u.rdev)); - } else if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + new_decode_dev(le32_to_cpu(die->i_u.rdev)); + break; + case S_IFIFO: + case S_IFSOCK: inode->i_rdev = 0; - } else { + break; + default: goto bogusimode; } - - i_uid_write(inode, le32_to_cpu(v2->i_uid)); - i_gid_write(inode, le32_to_cpu(v2->i_gid)); - set_nlink(inode, le32_to_cpu(v2->i_nlink)); + i_uid_write(inode, le32_to_cpu(die->i_uid)); + i_gid_write(inode, le32_to_cpu(die->i_gid)); + set_nlink(inode, le32_to_cpu(die->i_nlink)); /* extended inode has its own timestamp */ - inode->i_ctime.tv_sec = le64_to_cpu(v2->i_ctime); - inode->i_ctime.tv_nsec = le32_to_cpu(v2->i_ctime_nsec); + inode->i_ctime.tv_sec = le64_to_cpu(die->i_ctime); + inode->i_ctime.tv_nsec = le32_to_cpu(die->i_ctime_nsec); + + inode->i_size = le64_to_cpu(die->i_size); + + /* total blocks for compressed files */ + if (erofs_inode_is_data_compressed(vi->datalayout)) + nblks = le32_to_cpu(die->i_u.compressed_blocks); - inode->i_size = le64_to_cpu(v2->i_size); kfree(copied); break; - case EROFS_INODE_LAYOUT_V1: - vi->inode_isize = sizeof(struct erofs_inode_v1); + case EROFS_INODE_LAYOUT_COMPACT: + vi->inode_isize = sizeof(struct erofs_inode_compact); *ofs += vi->inode_isize; - vi->xattr_isize = ondisk_xattr_ibody_size(v1->i_xattr_icount); + vi->xattr_isize = erofs_xattr_ibody_size(dic->i_xattr_icount); - inode->i_mode = le16_to_cpu(v1->i_mode); - if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode)) { - vi->raw_blkaddr = le32_to_cpu(v1->i_u.raw_blkaddr); - } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { + inode->i_mode = le16_to_cpu(dic->i_mode); + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + case S_IFDIR: + case S_IFLNK: + vi->raw_blkaddr = le32_to_cpu(dic->i_u.raw_blkaddr); + break; + case S_IFCHR: + case S_IFBLK: inode->i_rdev = - new_decode_dev(le32_to_cpu(v1->i_u.rdev)); - } else if (S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + new_decode_dev(le32_to_cpu(dic->i_u.rdev)); + break; + case S_IFIFO: + case S_IFSOCK: inode->i_rdev = 0; - } else { + break; + default: goto bogusimode; } - - i_uid_write(inode, le16_to_cpu(v1->i_uid)); - i_gid_write(inode, le16_to_cpu(v1->i_gid)); - set_nlink(inode, le16_to_cpu(v1->i_nlink)); + i_uid_write(inode, le16_to_cpu(dic->i_uid)); + i_gid_write(inode, le16_to_cpu(dic->i_gid)); + set_nlink(inode, le16_to_cpu(dic->i_nlink)); /* use build time for compact inodes */ inode->i_ctime.tv_sec = sbi->build_time; inode->i_ctime.tv_nsec = sbi->build_time_nsec; - inode->i_size = le32_to_cpu(v1->i_size); + inode->i_size = le32_to_cpu(dic->i_size); + if (erofs_inode_is_data_compressed(vi->datalayout)) + nblks = le32_to_cpu(dic->i_u.compressed_blocks); break; default: - errln("unsupported on-disk inode version %u of nid %llu", - __inode_version(ifmt), vi->nid); + erofs_err(inode->i_sb, + "unsupported on-disk inode version %u of nid %llu", + erofs_inode_version(ifmt), vi->nid); err = -EOPNOTSUPP; goto err_out; } @@ -158,12 +174,17 @@ static struct page *read_inode(struct inode *inode, unsigned int *ofs) inode->i_mtime.tv_nsec = inode->i_ctime.tv_nsec; inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec; - /* measure inode.i_blocks as the generic filesystem */ - inode->i_blocks = ((inode->i_size - 1) 
>> 9) + 1; + if (!nblks) + /* measure inode.i_blocks as generic filesystems */ + inode->i_blocks = roundup(inode->i_size, EROFS_BLKSIZ) >> 9; + else + inode->i_blocks = nblks << LOG_SECTORS_PER_BLOCK; return page; + bogusimode: - errln("bogus i_mode (%o) @ nid %llu", inode->i_mode, vi->nid); - err = -EIO; + erofs_err(inode->i_sb, "bogus i_mode (%o) @ nid %llu", + inode->i_mode, vi->nid); + err = -EFSCORRUPTED; err_out: DBG_BUGON(1); kfree(copied); @@ -172,56 +193,45 @@ static struct page *read_inode(struct inode *inode, unsigned int *ofs) return ERR_PTR(err); } -/* - * try_lock can be required since locking order is: - * file data(fs_inode) - * meta(bd_inode) - * but the majority of the callers is "iget", - * in that case we are pretty sure no deadlock since - * no data operations exist. However I tend to - * try_lock since it takes no much overhead and - * will success immediately. - */ -static int fill_inline_data(struct inode *inode, void *data, unsigned m_pofs) +static int erofs_fill_symlink(struct inode *inode, void *data, + unsigned int m_pofs) { - struct erofs_vnode *vi = EROFS_V(inode); - struct erofs_sb_info *sbi = EROFS_I_SB(inode); - int mode = vi->data_mapping_mode; + struct erofs_inode *vi = EROFS_I(inode); + char *lnk; - DBG_BUGON(mode >= EROFS_INODE_LAYOUT_MAX); - - /* should be inode inline C */ - if (mode != EROFS_INODE_LAYOUT_INLINE) + /* if it cannot be handled with fast symlink scheme */ + if (vi->datalayout != EROFS_INODE_FLAT_INLINE || + inode->i_size >= PAGE_SIZE) { + inode->i_op = &erofs_symlink_iops; return 0; - - /* fast symlink (following ext4) */ - if (S_ISLNK(inode->i_mode) && inode->i_size < PAGE_SIZE) { - char *lnk = erofs_kmalloc(sbi, inode->i_size + 1, GFP_KERNEL); - - if (unlikely(lnk == NULL)) - return -ENOMEM; - - m_pofs += vi->xattr_isize; - - /* inline symlink data shouldn't across page boundary as well */ - if (unlikely(m_pofs + inode->i_size > PAGE_SIZE)) { - DBG_BUGON(1); - kfree(lnk); - return -EIO; - } - - /* get in-page inline data */ - memcpy(lnk, data + m_pofs, inode->i_size); - lnk[inode->i_size] = '\0'; - - inode->i_link = lnk; - set_inode_fast_symlink(inode); } - return -EAGAIN; + + lnk = kmalloc(inode->i_size + 1, GFP_KERNEL); + if (!lnk) + return -ENOMEM; + + m_pofs += vi->xattr_isize; + /* inline symlink data shouldn't cross page boundary as well */ + if (m_pofs + inode->i_size > PAGE_SIZE) { + kfree(lnk); + erofs_err(inode->i_sb, + "inline data cross block boundary @ nid %llu", + vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + + memcpy(lnk, data + m_pofs, inode->i_size); + lnk[inode->i_size] = '\0'; + + inode->i_link = lnk; + inode->i_op = &erofs_fast_symlink_iops; + return 0; } -static int fill_inode(struct inode *inode, int isdir) +static int erofs_fill_inode(struct inode *inode, int isdir) { + struct erofs_inode *vi = EROFS_I(inode); struct page *page; unsigned int ofs; int err = 0; @@ -229,81 +239,95 @@ static int fill_inode(struct inode *inode, int isdir) trace_erofs_fill_inode(inode, isdir); /* read inode base data from disk */ - page = read_inode(inode, &ofs); - if (IS_ERR(page)) { + page = erofs_read_inode(inode, &ofs); + if (IS_ERR(page)) return PTR_ERR(page); - } else { - /* setup the new inode */ - if (S_ISREG(inode->i_mode)) { -#ifdef CONFIG_EROFS_FS_XATTR - inode->i_op = &erofs_generic_xattr_iops; -#endif - inode->i_fop = &generic_ro_fops; - } else if (S_ISDIR(inode->i_mode)) { - inode->i_op = -#ifdef CONFIG_EROFS_FS_XATTR - &erofs_dir_xattr_iops; -#else - &erofs_dir_iops; -#endif - inode->i_fop = &erofs_dir_fops; - 
} else if (S_ISLNK(inode->i_mode)) { - /* by default, page_get_link is used for symlink */ - inode->i_op = -#ifdef CONFIG_EROFS_FS_XATTR - &erofs_symlink_xattr_iops, -#else - &page_symlink_inode_operations; -#endif - inode_nohighmem(inode); - } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || - S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { -#ifdef CONFIG_EROFS_FS_XATTR - inode->i_op = &erofs_special_inode_operations; -#endif - init_special_inode(inode, inode->i_mode, inode->i_rdev); - } else { - err = -EIO; + + /* setup the new inode */ + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_op = &erofs_generic_iops; + inode->i_fop = &generic_ro_fops; + break; + case S_IFDIR: + inode->i_op = &erofs_dir_iops; + inode->i_fop = &erofs_dir_fops; + break; + case S_IFLNK: + err = erofs_fill_symlink(inode, page_address(page), ofs); + if (err) goto out_unlock; - } - - if (is_inode_layout_compression(inode)) { -#ifdef CONFIG_EROFS_FS_ZIP - inode->i_mapping->a_ops = - &z_erofs_vle_normalaccess_aops; -#else - err = -ENOTSUPP; -#endif - goto out_unlock; - } - - inode->i_mapping->a_ops = &erofs_raw_access_aops; - - /* fill last page if inline data is available */ - fill_inline_data(inode, page_address(page), ofs); + inode_nohighmem(inode); + break; + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + inode->i_op = &erofs_generic_iops; + init_special_inode(inode, inode->i_mode, inode->i_rdev); + goto out_unlock; + default: + err = -EFSCORRUPTED; + goto out_unlock; } + if (erofs_inode_is_data_compressed(vi->datalayout)) { + err = z_erofs_fill_inode(inode); + goto out_unlock; + } + inode->i_mapping->a_ops = &erofs_raw_access_aops; + out_unlock: unlock_page(page); put_page(page); return err; } -struct inode *erofs_iget(struct super_block *sb, - erofs_nid_t nid, bool isdir) +/* + * erofs nid is 64bits, but i_ino is 'unsigned long', therefore + * we should do more for 32-bit platform to find the right inode. 
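+ * The scheme below hashes the nid into an unsigned long for
+ * iget5_locked() (on 32-bit: high 32 bits XOR low 32 bits), and the
+ * test actor re-checks the full 64-bit nid, so hash collisions can
+ * never return the wrong inode.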
+ */ +static int erofs_ilookup_test_actor(struct inode *inode, void *opaque) { - struct inode *inode = iget_locked(sb, nid); + const erofs_nid_t nid = *(erofs_nid_t *)opaque; - if (unlikely(inode == NULL)) + return EROFS_I(inode)->nid == nid; +} + +static int erofs_iget_set_actor(struct inode *inode, void *opaque) +{ + const erofs_nid_t nid = *(erofs_nid_t *)opaque; + + inode->i_ino = erofs_inode_hash(nid); + return 0; +} + +static inline struct inode *erofs_iget_locked(struct super_block *sb, + erofs_nid_t nid) +{ + const unsigned long hashval = erofs_inode_hash(nid); + + return iget5_locked(sb, hashval, erofs_ilookup_test_actor, + erofs_iget_set_actor, &nid); +} + +struct inode *erofs_iget(struct super_block *sb, + erofs_nid_t nid, + bool isdir) +{ + struct inode *inode = erofs_iget_locked(sb, nid); + + if (!inode) return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { int err; - struct erofs_vnode *vi = EROFS_V(inode); + struct erofs_inode *vi = EROFS_I(inode); + vi->nid = nid; - err = fill_inode(inode, isdir); - if (likely(!err)) + err = erofs_fill_inode(inode, isdir); + if (!err) unlock_new_inode(inode); else { iget_failed(inode); @@ -313,29 +337,38 @@ struct inode *erofs_iget(struct super_block *sb, return inode; } -#ifdef CONFIG_EROFS_FS_XATTR -const struct inode_operations erofs_generic_xattr_iops = { - .listxattr = erofs_listxattr, -}; -#endif +int erofs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *const inode = d_inode(path->dentry); -#ifdef CONFIG_EROFS_FS_XATTR -const struct inode_operations erofs_symlink_xattr_iops = { + if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) + stat->attributes |= STATX_ATTR_COMPRESSED; + + stat->attributes |= STATX_ATTR_IMMUTABLE; + stat->attributes_mask |= (STATX_ATTR_COMPRESSED | + STATX_ATTR_IMMUTABLE); + + generic_fillattr(inode, stat); + return 0; +} + +const struct inode_operations erofs_generic_iops = { + .getattr = erofs_getattr, + .listxattr = erofs_listxattr, + .get_acl = erofs_get_acl, +}; + +const struct inode_operations erofs_symlink_iops = { .get_link = page_get_link, + .getattr = erofs_getattr, .listxattr = erofs_listxattr, -}; -#endif - -const struct inode_operations erofs_special_inode_operations = { -#ifdef CONFIG_EROFS_FS_XATTR - .listxattr = erofs_listxattr, -#endif + .get_acl = erofs_get_acl, }; -#ifdef CONFIG_EROFS_FS_XATTR -const struct inode_operations erofs_fast_symlink_xattr_iops = { +const struct inode_operations erofs_fast_symlink_iops = { .get_link = simple_get_link, + .getattr = erofs_getattr, .listxattr = erofs_listxattr, + .get_acl = erofs_get_acl, }; -#endif - diff --git a/drivers/staging/erofs/internal.h b/drivers/staging/erofs/internal.h index 8ce37091db20..5ee2368a18d5 100644 --- a/drivers/staging/erofs/internal.h +++ b/drivers/staging/erofs/internal.h @@ -1,17 +1,10 @@ -/* SPDX-License-Identifier: GPL-2.0 - * - * linux/drivers/staging/erofs/internal.h - * +/* SPDX-License-Identifier: GPL-2.0-only */ +/* * Copyright (C) 2017-2018 HUAWEI, Inc. - * http://www.huawei.com/ - * Created by Gao Xiang - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of the Linux - * distribution for more details. 
+ * https://www.huawei.com/ */ -#ifndef __INTERNAL_H -#define __INTERNAL_H +#ifndef __EROFS_INTERNAL_H +#define __EROFS_INTERNAL_H #include #include @@ -19,7 +12,7 @@ #include #include #include -#include +#include #include #include #include "erofs_fs.h" @@ -28,58 +21,50 @@ #undef pr_fmt #define pr_fmt(fmt) "erofs: " fmt -#define errln(x, ...) pr_err(x "\n", ##__VA_ARGS__) -#define infoln(x, ...) pr_info(x "\n", ##__VA_ARGS__) +__printf(3, 4) void _erofs_err(struct super_block *sb, + const char *function, const char *fmt, ...); +#define erofs_err(sb, fmt, ...) \ + _erofs_err(sb, __func__, fmt "\n", ##__VA_ARGS__) +__printf(3, 4) void _erofs_info(struct super_block *sb, + const char *function, const char *fmt, ...); +#define erofs_info(sb, fmt, ...) \ + _erofs_info(sb, __func__, fmt "\n", ##__VA_ARGS__) #ifdef CONFIG_EROFS_FS_DEBUG -#define debugln(x, ...) pr_debug(x "\n", ##__VA_ARGS__) - -#define dbg_might_sleep might_sleep +#define erofs_dbg(x, ...) pr_debug(x "\n", ##__VA_ARGS__) #define DBG_BUGON BUG_ON #else -#define debugln(x, ...) ((void)0) - -#define dbg_might_sleep() ((void)0) +#define erofs_dbg(x, ...) ((void)0) #define DBG_BUGON(x) ((void)(x)) -#endif - -#ifdef CONFIG_EROFS_FAULT_INJECTION -enum { - FAULT_KMALLOC, - FAULT_MAX, -}; - -extern char *erofs_fault_name[FAULT_MAX]; -#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type))) - -struct erofs_fault_info { - atomic_t inject_ops; - unsigned int inject_rate; - unsigned int inject_type; -}; -#endif - -#ifdef CONFIG_EROFS_FS_ZIP_CACHE_BIPOLAR -#define EROFS_FS_ZIP_CACHE_LVL (2) -#elif defined(EROFS_FS_ZIP_CACHE_UNIPOLAR) -#define EROFS_FS_ZIP_CACHE_LVL (1) -#else -#define EROFS_FS_ZIP_CACHE_LVL (0) -#endif - -#if (!defined(EROFS_FS_HAS_MANAGED_CACHE) && (EROFS_FS_ZIP_CACHE_LVL > 0)) -#define EROFS_FS_HAS_MANAGED_CACHE -#endif +#endif /* !CONFIG_EROFS_FS_DEBUG */ /* EROFS_SUPER_MAGIC_V1 to represent the whole file system */ #define EROFS_SUPER_MAGIC EROFS_SUPER_MAGIC_V1 typedef u64 erofs_nid_t; +typedef u64 erofs_off_t; +/* data type for filesystem-wide blocks number */ +typedef u32 erofs_blk_t; struct erofs_sb_info { +#ifdef CONFIG_EROFS_FS_ZIP /* list for all registered superblocks, mainly for shrinker */ struct list_head list; struct mutex umount_mutex; + /* the dedicated workstation for compression */ + struct radix_tree_root workstn_tree; + + /* threshold for decompression synchronously */ + unsigned int max_sync_decompress_pages; + + unsigned int shrinker_run_no; + + /* current strategy of how to use managed cache */ + unsigned char cache_strategy; + + /* pseudo inode to manage cached pages */ + struct inode *managed_cache; +#endif /* CONFIG_EROFS_FS_ZIP */ u32 blocks; u32 meta_blkaddr; #ifdef CONFIG_EROFS_FS_XATTR @@ -88,18 +73,6 @@ struct erofs_sb_info { /* inode slot unit size in bit shift */ unsigned char islotbits; -#ifdef CONFIG_EROFS_FS_ZIP - /* cluster size in bit shift */ - unsigned char clusterbits; - - /* the dedicated workstation for compression */ - struct radix_tree_root workstn_tree; - -#ifdef EROFS_FS_HAS_MANAGED_CACHE - struct inode *managed_cache; -#endif - -#endif u32 build_time_nsec; u64 build_time; @@ -111,69 +84,31 @@ struct erofs_sb_info { u8 uuid[16]; /* 128-bit uuid for volume */ u8 volume_name[16]; /* volume name */ - u32 requirements; - - char *dev_name; + u32 feature_compat; + u32 feature_incompat; unsigned int mount_opt; - unsigned int shrinker_run_no; - -#ifdef CONFIG_EROFS_FAULT_INJECTION - struct erofs_fault_info fault_info; /* For fault injection */ -#endif }; -#ifdef 
CONFIG_EROFS_FAULT_INJECTION -#define erofs_show_injection_info(type) \ - infoln("inject %s in %s of %pS", erofs_fault_name[type], \ - __func__, __builtin_return_address(0)) - -static inline bool time_to_inject(struct erofs_sb_info *sbi, int type) -{ - struct erofs_fault_info *ffi = &sbi->fault_info; - - if (!ffi->inject_rate) - return false; - - if (!IS_FAULT_SET(ffi, type)) - return false; - - atomic_inc(&ffi->inject_ops); - if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { - atomic_set(&ffi->inject_ops, 0); - return true; - } - return false; -} -#endif - -static inline void *erofs_kmalloc(struct erofs_sb_info *sbi, - size_t size, gfp_t flags) -{ -#ifdef CONFIG_EROFS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_KMALLOC)) { - erofs_show_injection_info(FAULT_KMALLOC); - return NULL; - } -#endif - return kmalloc(size, flags); -} - #define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info) #define EROFS_I_SB(inode) ((struct erofs_sb_info *)(inode)->i_sb->s_fs_info) /* Mount flags set via mount options or defaults */ #define EROFS_MOUNT_XATTR_USER 0x00000010 #define EROFS_MOUNT_POSIX_ACL 0x00000020 -#define EROFS_MOUNT_FAULT_INJECTION 0x00000040 #define clear_opt(sbi, option) ((sbi)->mount_opt &= ~EROFS_MOUNT_##option) #define set_opt(sbi, option) ((sbi)->mount_opt |= EROFS_MOUNT_##option) #define test_opt(sbi, option) ((sbi)->mount_opt & EROFS_MOUNT_##option) #ifdef CONFIG_EROFS_FS_ZIP -#define erofs_workstn_lock(sbi) xa_lock(&(sbi)->workstn_tree) -#define erofs_workstn_unlock(sbi) xa_unlock(&(sbi)->workstn_tree) +enum { + EROFS_ZIP_CACHE_DISABLED, + EROFS_ZIP_CACHE_READAHEAD, + EROFS_ZIP_CACHE_READAROUND +}; + +#define EROFS_LOCKED_MAGIC (INT_MIN | 0xE0F510CCL) /* basic unit of the workstation of a super_block */ struct erofs_workgroup { @@ -184,8 +119,6 @@ struct erofs_workgroup { atomic_t refcount; }; -#define EROFS_LOCKED_MAGIC (INT_MIN | 0xE0F510CCL) - #if defined(CONFIG_SMP) static inline bool erofs_workgroup_try_to_freeze(struct erofs_workgroup *grp, int val) @@ -242,54 +175,14 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) DBG_BUGON(v == EROFS_LOCKED_MAGIC); return v; } -#endif +#endif /* !CONFIG_SMP */ -static inline bool erofs_workgroup_get(struct erofs_workgroup *grp, int *ocnt) -{ - int o; - -repeat: - o = erofs_wait_on_workgroup_freezed(grp); - - if (unlikely(o <= 0)) - return -1; - - if (unlikely(atomic_cmpxchg(&grp->refcount, o, o + 1) != o)) - goto repeat; - - *ocnt = o; - return 0; -} - -#define __erofs_workgroup_get(grp) atomic_inc(&(grp)->refcount) -#define __erofs_workgroup_put(grp) atomic_dec(&(grp)->refcount) - -extern int erofs_workgroup_put(struct erofs_workgroup *grp); - -extern struct erofs_workgroup *erofs_find_workgroup( - struct super_block *sb, pgoff_t index, bool *tag); - -extern int erofs_register_workgroup(struct super_block *sb, - struct erofs_workgroup *grp, bool tag); - -extern unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi, - unsigned long nr_shrink, bool cleanup); - -static inline void erofs_workstation_cleanup_all(struct super_block *sb) -{ - erofs_shrink_workstation(EROFS_SB(sb), ~0UL, true); -} - -#ifdef EROFS_FS_HAS_MANAGED_CACHE -#define EROFS_UNALLOCATED_CACHED_PAGE ((void *)0x5F0EF00D) - -extern int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, - struct erofs_workgroup *egrp); -extern int erofs_try_to_free_cached_page(struct address_space *mapping, - struct page *page); -#endif - -#endif +/* hard limit of pages per compressed cluster */ +#define 
Z_EROFS_CLUSTER_MAX_PAGES (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT) +#define EROFS_PCPUBUF_NR_PAGES Z_EROFS_CLUSTER_MAX_PAGES +#else +#define EROFS_PCPUBUF_NR_PAGES 0 +#endif /* !CONFIG_EROFS_FS_ZIP */ /* we strictly follow PAGE_SIZE and no buffer head yet */ #define LOG_BLOCK_SIZE PAGE_SHIFT @@ -308,19 +201,6 @@ extern int erofs_try_to_free_cached_page(struct address_space *mapping, #define ROOT_NID(sb) ((sb)->root_nid) -#ifdef CONFIG_EROFS_FS_ZIP -/* hard limit of pages per compressed cluster */ -#define Z_EROFS_CLUSTER_MAX_PAGES (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT) - -/* page count of a compressed cluster */ -#define erofs_clusterpages(sbi) ((1 << (sbi)->clusterbits) / PAGE_SIZE) -#endif - -typedef u64 erofs_off_t; - -/* data type for filesystem-wide blocks number */ -typedef u32 erofs_blk_t; - #define erofs_blknr(addr) ((addr) / EROFS_BLKSIZ) #define erofs_blkoff(addr) ((addr) % EROFS_BLKSIZ) #define blknr_to_addr(nr) ((erofs_off_t)(nr) * EROFS_BLKSIZ) @@ -331,75 +211,74 @@ static inline erofs_off_t iloc(struct erofs_sb_info *sbi, erofs_nid_t nid) } /* atomic flag definitions */ -#define EROFS_V_EA_INITED_BIT 0 +#define EROFS_I_EA_INITED_BIT 0 +#define EROFS_I_Z_INITED_BIT 1 /* bitlock definitions (arranged in reverse order) */ -#define EROFS_V_BL_XATTR_BIT (BITS_PER_LONG - 1) +#define EROFS_I_BL_XATTR_BIT (BITS_PER_LONG - 1) +#define EROFS_I_BL_Z_BIT (BITS_PER_LONG - 2) -struct erofs_vnode { +struct erofs_inode { erofs_nid_t nid; /* atomic flags (including bitlocks) */ unsigned long flags; - unsigned char data_mapping_mode; - /* inline size in bytes */ + unsigned char datalayout; unsigned char inode_isize; unsigned short xattr_isize; - unsigned xattr_shared_count; - unsigned *xattr_shared_xattrs; - - erofs_blk_t raw_blkaddr; + unsigned int xattr_shared_count; + unsigned int *xattr_shared_xattrs; + union { + erofs_blk_t raw_blkaddr; +#ifdef CONFIG_EROFS_FS_ZIP + struct { + unsigned short z_advise; + unsigned char z_algorithmtype[2]; + unsigned char z_logical_clusterbits; + unsigned char z_physical_clusterbits[2]; + }; +#endif /* CONFIG_EROFS_FS_ZIP */ + }; /* the corresponding vfs inode */ struct inode vfs_inode; }; -#define EROFS_V(ptr) \ - container_of(ptr, struct erofs_vnode, vfs_inode) +#define EROFS_I(ptr) \ + container_of(ptr, struct erofs_inode, vfs_inode) -#define __inode_advise(x, bit, bits) \ - (((x) >> (bit)) & ((1 << (bits)) - 1)) - -#define __inode_version(advise) \ - __inode_advise(advise, EROFS_I_VERSION_BIT, \ - EROFS_I_VERSION_BITS) - -#define __inode_data_mapping(advise) \ - __inode_advise(advise, EROFS_I_DATA_MAPPING_BIT,\ - EROFS_I_DATA_MAPPING_BITS) - -static inline unsigned long inode_datablocks(struct inode *inode) +static inline unsigned long erofs_inode_datablocks(struct inode *inode) { /* since i_size cannot be changed */ return DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ); } -static inline bool is_inode_layout_plain(struct inode *inode) +static inline unsigned int erofs_bitrange(unsigned int value, unsigned int bit, + unsigned int bits) { - return EROFS_V(inode)->data_mapping_mode == EROFS_INODE_LAYOUT_PLAIN; + + return (value >> bit) & ((1 << bits) - 1); } -static inline bool is_inode_layout_compression(struct inode *inode) + +static inline unsigned int erofs_inode_version(unsigned int value) { - return EROFS_V(inode)->data_mapping_mode == - EROFS_INODE_LAYOUT_COMPRESSION; + return erofs_bitrange(value, EROFS_I_VERSION_BIT, + EROFS_I_VERSION_BITS); } -static inline bool is_inode_layout_inline(struct inode *inode) +static inline unsigned int 
erofs_inode_datalayout(unsigned int value) { - return EROFS_V(inode)->data_mapping_mode == EROFS_INODE_LAYOUT_INLINE; + return erofs_bitrange(value, EROFS_I_DATALAYOUT_BIT, + EROFS_I_DATALAYOUT_BITS); } extern const struct super_operations erofs_sops; -extern const struct inode_operations erofs_dir_iops; -extern const struct file_operations erofs_dir_fops; extern const struct address_space_operations erofs_raw_access_aops; -#ifdef CONFIG_EROFS_FS_ZIP -extern const struct address_space_operations z_erofs_vle_normalaccess_aops; -#endif +extern const struct address_space_operations z_erofs_aops; /* * Logical to physical block mapping, used by erofs_map_blocks() @@ -428,6 +307,7 @@ extern const struct address_space_operations z_erofs_vle_normalaccess_aops; */ enum { BH_Zipped = BH_PrivateStart, + BH_FullMapped, }; /* Has a disk mapping */ @@ -436,150 +316,117 @@ enum { #define EROFS_MAP_META (1 << BH_Meta) /* The extent has been compressed */ #define EROFS_MAP_ZIPPED (1 << BH_Zipped) +/* The length of extent is full */ +#define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped) struct erofs_map_blocks { erofs_off_t m_pa, m_la; u64 m_plen, m_llen; unsigned int m_flags; + + struct page *mpage; }; /* Flags used by erofs_map_blocks() */ #define EROFS_GET_BLOCKS_RAW 0x0001 +/* zmap.c */ +#ifdef CONFIG_EROFS_FS_ZIP +int z_erofs_fill_inode(struct inode *inode); +int z_erofs_map_blocks_iter(struct inode *inode, + struct erofs_map_blocks *map, + int flags); +#else +static inline int z_erofs_fill_inode(struct inode *inode) { return -EOPNOTSUPP; } +static inline int z_erofs_map_blocks_iter(struct inode *inode, + struct erofs_map_blocks *map, + int flags) +{ + return -EOPNOTSUPP; +} +#endif /* !CONFIG_EROFS_FS_ZIP */ + /* data.c */ -static inline struct bio *prepare_bio( - struct super_block *sb, - erofs_blk_t blkaddr, unsigned nr_pages, - bio_end_io_t endio) -{ - gfp_t gfp = GFP_NOIO; - struct bio *bio = bio_alloc(gfp, nr_pages); +struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr); - if (unlikely(bio == NULL) && - (current->flags & PF_MEMALLOC)) { - do { - nr_pages /= 2; - if (unlikely(!nr_pages)) { - bio = bio_alloc(gfp | __GFP_NOFAIL, 1); - BUG_ON(bio == NULL); - break; - } - bio = bio_alloc(gfp, nr_pages); - } while (bio == NULL); - } - - bio->bi_end_io = endio; - bio_set_dev(bio, sb->s_bdev); - bio->bi_iter.bi_sector = blkaddr << LOG_SECTORS_PER_BLOCK; - return bio; -} - -static inline void __submit_bio(struct bio *bio, unsigned op, unsigned op_flags) -{ - bio_set_op_attrs(bio, op, op_flags); - submit_bio(bio); -} - -extern struct page *erofs_get_meta_page(struct super_block *sb, - erofs_blk_t blkaddr, bool prio); -extern int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int); -extern int erofs_map_blocks_iter(struct inode *, struct erofs_map_blocks *, - struct page **, int); - -struct erofs_map_blocks_iter { - struct erofs_map_blocks map; - struct page *mpage; -}; - - -static inline struct page * -erofs_get_inline_page(struct inode *inode, - erofs_blk_t blkaddr) -{ - return erofs_get_meta_page(inode->i_sb, - blkaddr, S_ISDIR(inode->i_mode)); -} +int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int); /* inode.c */ -extern struct inode *erofs_iget(struct super_block *sb, - erofs_nid_t nid, bool dir); +static inline unsigned long erofs_inode_hash(erofs_nid_t nid) +{ +#if BITS_PER_LONG == 32 + return (nid >> 32) ^ (nid & 0xffffffff); +#else + return nid; +#endif +} + +extern const struct inode_operations erofs_generic_iops; +extern const struct 
inode_operations erofs_symlink_iops; +extern const struct inode_operations erofs_fast_symlink_iops; + +struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid, bool dir); +int erofs_getattr(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags); + +/* namei.c */ +extern const struct inode_operations erofs_dir_iops; + +int erofs_namei(struct inode *dir, struct qstr *name, + erofs_nid_t *nid, unsigned int *d_type); /* dir.c */ -int erofs_namei(struct inode *dir, struct qstr *name, - erofs_nid_t *nid, unsigned *d_type); +extern const struct file_operations erofs_dir_fops; -/* xattr.c */ -#ifdef CONFIG_EROFS_FS_XATTR -extern const struct xattr_handler *erofs_xattr_handlers[]; -#endif +/* utils.c / zdata.c */ +struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp); -/* symlink */ -#ifdef CONFIG_EROFS_FS_XATTR -extern const struct inode_operations erofs_symlink_xattr_iops; -extern const struct inode_operations erofs_fast_symlink_xattr_iops; -extern const struct inode_operations erofs_special_inode_operations; -#endif - -static inline void set_inode_fast_symlink(struct inode *inode) -{ -#ifdef CONFIG_EROFS_FS_XATTR - inode->i_op = &erofs_fast_symlink_xattr_iops; +#if (EROFS_PCPUBUF_NR_PAGES > 0) +void *erofs_get_pcpubuf(unsigned int pagenr); +#define erofs_put_pcpubuf(buf) do { \ + (void)&(buf); \ + preempt_enable(); \ +} while (0) #else - inode->i_op = &simple_symlink_inode_operations; -#endif +static inline void *erofs_get_pcpubuf(unsigned int pagenr) +{ + return ERR_PTR(-EOPNOTSUPP); } -static inline bool is_inode_fast_symlink(struct inode *inode) -{ -#ifdef CONFIG_EROFS_FS_XATTR - return inode->i_op == &erofs_fast_symlink_xattr_iops; -#else - return inode->i_op == &simple_symlink_inode_operations; +#define erofs_put_pcpubuf(buf) do {} while (0) #endif -} -static inline void *erofs_vmap(struct page **pages, unsigned int count) -{ -#ifdef CONFIG_EROFS_FS_USE_VM_MAP_RAM - int i = 0; - - while (1) { - void *addr = vm_map_ram(pages, count, -1, PAGE_KERNEL); - /* retry two more times (totally 3 times) */ - if (addr != NULL || ++i >= 3) - return addr; - vm_unmap_aliases(); - } - return NULL; +#ifdef CONFIG_EROFS_FS_ZIP +int erofs_workgroup_put(struct erofs_workgroup *grp); +struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, + pgoff_t index); +int erofs_register_workgroup(struct super_block *sb, + struct erofs_workgroup *grp); +void erofs_workgroup_free_rcu(struct erofs_workgroup *grp); +void erofs_shrinker_register(struct super_block *sb); +void erofs_shrinker_unregister(struct super_block *sb); +int __init erofs_init_shrinker(void); +void erofs_exit_shrinker(void); +int __init z_erofs_init_zip_subsystem(void); +void z_erofs_exit_zip_subsystem(void); +int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, + struct erofs_workgroup *egrp); +int erofs_try_to_free_cached_page(struct address_space *mapping, + struct page *page); #else - return vmap(pages, count, VM_MAP, PAGE_KERNEL); -#endif -} +static inline void erofs_shrinker_register(struct super_block *sb) {} +static inline void erofs_shrinker_unregister(struct super_block *sb) {} +static inline int erofs_init_shrinker(void) { return 0; } +static inline void erofs_exit_shrinker(void) {} +static inline int z_erofs_init_zip_subsystem(void) { return 0; } +static inline void z_erofs_exit_zip_subsystem(void) {} +#endif /* !CONFIG_EROFS_FS_ZIP */ -static inline void erofs_vunmap(const void *mem, unsigned int count) -{ -#ifdef CONFIG_EROFS_FS_USE_VM_MAP_RAM - 
vm_unmap_ram(mem, count); -#else - vunmap(mem); -#endif -} - -/* utils.c */ -extern struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp); - -extern void erofs_register_super(struct super_block *sb); -extern void erofs_unregister_super(struct super_block *sb); - -extern unsigned long erofs_shrink_count(struct shrinker *shrink, - struct shrink_control *sc); -extern unsigned long erofs_shrink_scan(struct shrinker *shrink, - struct shrink_control *sc); +#define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ #ifndef lru_to_page #define lru_to_page(head) (list_entry((head)->prev, struct page, lru)) #endif -#endif - +#endif /* __EROFS_INTERNAL_H */ diff --git a/drivers/staging/erofs/lz4defs.h b/drivers/staging/erofs/lz4defs.h deleted file mode 100644 index 00a0b58a0871..000000000000 --- a/drivers/staging/erofs/lz4defs.h +++ /dev/null @@ -1,227 +0,0 @@ -#ifndef __LZ4DEFS_H__ -#define __LZ4DEFS_H__ - -/* - * lz4defs.h -- common and architecture specific defines for the kernel usage - - * LZ4 - Fast LZ compression algorithm - * Copyright (C) 2011-2016, Yann Collet. - * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php) - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following disclaimer - * in the documentation and/or other materials provided with the - * distribution. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- * You can contact the author at : - * - LZ4 homepage : http://www.lz4.org - * - LZ4 source repository : https://github.com/lz4/lz4 - * - * Changed for kernel usage by: - * Sven Schmidt <4sschmid@informatik.uni-hamburg.de> - */ - -#include -#include /* memset, memcpy */ - -#define FORCE_INLINE __always_inline - -/*-************************************ - * Basic Types - **************************************/ -#include - -typedef uint8_t BYTE; -typedef uint16_t U16; -typedef uint32_t U32; -typedef int32_t S32; -typedef uint64_t U64; -typedef uintptr_t uptrval; - -/*-************************************ - * Architecture specifics - **************************************/ -#if defined(CONFIG_64BIT) -#define LZ4_ARCH64 1 -#else -#define LZ4_ARCH64 0 -#endif - -#if defined(__LITTLE_ENDIAN) -#define LZ4_LITTLE_ENDIAN 1 -#else -#define LZ4_LITTLE_ENDIAN 0 -#endif - -/*-************************************ - * Constants - **************************************/ -#define MINMATCH 4 - -#define WILDCOPYLENGTH 8 -#define LASTLITERALS 5 -#define MFLIMIT (WILDCOPYLENGTH + MINMATCH) - -/* Increase this value ==> compression run slower on incompressible data */ -#define LZ4_SKIPTRIGGER 6 - -#define HASH_UNIT sizeof(size_t) - -#define KB (1 << 10) -#define MB (1 << 20) -#define GB (1U << 30) - -#define MAXD_LOG 16 -#define MAX_DISTANCE ((1 << MAXD_LOG) - 1) -#define STEPSIZE sizeof(size_t) - -#define ML_BITS 4 -#define ML_MASK ((1U << ML_BITS) - 1) -#define RUN_BITS (8 - ML_BITS) -#define RUN_MASK ((1U << RUN_BITS) - 1) - -/*-************************************ - * Reading and writing into memory - **************************************/ -static FORCE_INLINE U16 LZ4_read16(const void *ptr) -{ - return get_unaligned((const U16 *)ptr); -} - -static FORCE_INLINE U32 LZ4_read32(const void *ptr) -{ - return get_unaligned((const U32 *)ptr); -} - -static FORCE_INLINE size_t LZ4_read_ARCH(const void *ptr) -{ - return get_unaligned((const size_t *)ptr); -} - -static FORCE_INLINE void LZ4_write16(void *memPtr, U16 value) -{ - put_unaligned(value, (U16 *)memPtr); -} - -static FORCE_INLINE void LZ4_write32(void *memPtr, U32 value) -{ - put_unaligned(value, (U32 *)memPtr); -} - -static FORCE_INLINE U16 LZ4_readLE16(const void *memPtr) -{ - return get_unaligned_le16(memPtr); -} - -static FORCE_INLINE void LZ4_writeLE16(void *memPtr, U16 value) -{ - return put_unaligned_le16(value, memPtr); -} - -static FORCE_INLINE void LZ4_copy8(void *dst, const void *src) -{ -#if LZ4_ARCH64 - U64 a = get_unaligned((const U64 *)src); - - put_unaligned(a, (U64 *)dst); -#else - U32 a = get_unaligned((const U32 *)src); - U32 b = get_unaligned((const U32 *)src + 1); - - put_unaligned(a, (U32 *)dst); - put_unaligned(b, (U32 *)dst + 1); -#endif -} - -/* - * customized variant of memcpy, - * which can overwrite up to 7 bytes beyond dstEnd - */ -static FORCE_INLINE void LZ4_wildCopy(void *dstPtr, - const void *srcPtr, void *dstEnd) -{ - BYTE *d = (BYTE *)dstPtr; - const BYTE *s = (const BYTE *)srcPtr; - BYTE *const e = (BYTE *)dstEnd; - - do { - LZ4_copy8(d, s); - d += 8; - s += 8; - } while (d < e); -} - -static FORCE_INLINE unsigned int LZ4_NbCommonBytes(register size_t val) -{ -#if LZ4_LITTLE_ENDIAN - return __ffs(val) >> 3; -#else - return (BITS_PER_LONG - 1 - __fls(val)) >> 3; -#endif -} - -static FORCE_INLINE unsigned int LZ4_count( - const BYTE *pIn, - const BYTE *pMatch, - const BYTE *pInLimit) -{ - const BYTE *const pStart = pIn; - - while (likely(pIn < pInLimit - (STEPSIZE - 1))) { - size_t const diff = LZ4_read_ARCH(pMatch) ^ 
LZ4_read_ARCH(pIn); - - if (!diff) { - pIn += STEPSIZE; - pMatch += STEPSIZE; - continue; - } - - pIn += LZ4_NbCommonBytes(diff); - - return (unsigned int)(pIn - pStart); - } - -#if LZ4_ARCH64 - if ((pIn < (pInLimit - 3)) - && (LZ4_read32(pMatch) == LZ4_read32(pIn))) { - pIn += 4; - pMatch += 4; - } -#endif - - if ((pIn < (pInLimit - 1)) - && (LZ4_read16(pMatch) == LZ4_read16(pIn))) { - pIn += 2; - pMatch += 2; - } - - if ((pIn < pInLimit) && (*pMatch == *pIn)) - pIn++; - - return (unsigned int)(pIn - pStart); -} - -typedef enum { noLimit = 0, limitedOutput = 1 } limitedOutput_directive; -typedef enum { byPtr, byU32, byU16 } tableType_t; - -typedef enum { noDict = 0, withPrefix64k, usingExtDict } dict_directive; -typedef enum { noDictIssue = 0, dictSmall } dictIssue_directive; - -typedef enum { endOnOutputSize = 0, endOnInputSize = 1 } endCondition_directive; -typedef enum { full = 0, partial = 1 } earlyEnd_directive; - -#endif diff --git a/drivers/staging/erofs/namei.c b/drivers/staging/erofs/namei.c index 023f64fa2c87..c5c0dd9d9033 100644 --- a/drivers/staging/erofs/namei.c +++ b/drivers/staging/erofs/namei.c @@ -1,16 +1,8 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: GPL-2.0-only /* - * linux/drivers/staging/erofs/namei.c - * * Copyright (C) 2017-2018 HUAWEI, Inc. - * http://www.huawei.com/ - * Created by Gao Xiang - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of the Linux - * distribution for more details. + * https://www.huawei.com/ */ -#include "internal.h" #include "xattr.h" #include @@ -21,9 +13,9 @@ struct erofs_qstr { }; /* based on the end of qn is accurate and it must have the trailing '\0' */ -static inline int dirnamecmp(const struct erofs_qstr *qn, - const struct erofs_qstr *qd, - unsigned int *matched) +static inline int erofs_dirnamecmp(const struct erofs_qstr *qn, + const struct erofs_qstr *qd, + unsigned int *matched) { unsigned int i = *matched; @@ -71,16 +63,16 @@ static struct erofs_dirent *find_target_dirent(struct erofs_qstr *name, unsigned int matched = min(startprfx, endprfx); struct erofs_qstr dname = { .name = data + nameoff, - .end = unlikely(mid >= ndirents - 1) ? + .end = mid >= ndirents - 1 ? 
data + dirblksize : data + nameoff_from_disk(de[mid + 1].nameoff, dirblksize) }; /* string comparison without already matched prefix */ - int ret = dirnamecmp(name, &dname, &matched); + int ret = erofs_dirnamecmp(name, &dname, &matched); - if (unlikely(!ret)) { + if (!ret) { return de + mid; } else if (ret > 0) { head = mid + 1; @@ -105,7 +97,7 @@ static struct page *find_target_block_classic(struct inode *dir, startprfx = endprfx = 0; head = 0; - back = inode_datablocks(dir) - 1; + back = erofs_inode_datablocks(dir) - 1; while (head <= back) { const int mid = head + (back - head) / 2; @@ -120,11 +112,14 @@ static struct page *find_target_block_classic(struct inode *dir, unsigned int matched; struct erofs_qstr dname; - if (unlikely(!ndirents)) { - DBG_BUGON(1); + if (!ndirents) { kunmap_atomic(de); put_page(page); - page = ERR_PTR(-EIO); + erofs_err(dir->i_sb, + "corrupted dir block %d @ nid %llu", + mid, EROFS_I(dir)->nid); + DBG_BUGON(1); + page = ERR_PTR(-EFSCORRUPTED); goto out; } @@ -139,17 +134,17 @@ static struct page *find_target_block_classic(struct inode *dir, EROFS_BLKSIZ); /* string comparison without already matched prefix */ - diff = dirnamecmp(name, &dname, &matched); + diff = erofs_dirnamecmp(name, &dname, &matched); kunmap_atomic(de); - if (unlikely(!diff)) { + if (!diff) { *_ndirents = 0; goto out; } else if (diff > 0) { head = mid + 1; startprfx = matched; - if (likely(!IS_ERR(candidate))) + if (!IS_ERR(candidate)) put_page(candidate); candidate = page; *_ndirents = ndirents; @@ -179,7 +174,7 @@ int erofs_namei(struct inode *dir, struct erofs_dirent *de; struct erofs_qstr qn; - if (unlikely(!dir->i_size)) + if (!dir->i_size) return -ENOENT; qn.name = name->name; @@ -188,7 +183,7 @@ int erofs_namei(struct inode *dir, ndirents = 0; page = find_target_block_classic(dir, &qn, &ndirents); - if (unlikely(IS_ERR(page))) + if (IS_ERR(page)) return PTR_ERR(page); data = kmap_atomic(page); @@ -198,7 +193,7 @@ int erofs_namei(struct inode *dir, else de = (struct erofs_dirent *)data; - if (likely(!IS_ERR(de))) { + if (!IS_ERR(de)) { *nid = le64_to_cpu(de->nid); *d_type = de->file_type; } @@ -211,11 +206,12 @@ int erofs_namei(struct inode *dir, /* NOTE: i_mutex is already held by vfs */ static struct dentry *erofs_lookup(struct inode *dir, - struct dentry *dentry, unsigned int flags) + struct dentry *dentry, + unsigned int flags) { int err; erofs_nid_t nid; - unsigned d_type; + unsigned int d_type; struct inode *inode; DBG_BUGON(!d_really_is_negative(dentry)); @@ -225,7 +221,7 @@ static struct dentry *erofs_lookup(struct inode *dir, trace_erofs_lookup(dir, dentry, flags); /* file name exceeds fs limit */ - if (unlikely(dentry->d_name.len > EROFS_NAME_LEN)) + if (dentry->d_name.len > EROFS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); /* false uninitialized warnings on gcc 4.8.x */ @@ -234,29 +230,19 @@ static struct dentry *erofs_lookup(struct inode *dir, if (err == -ENOENT) { /* negative dentry */ inode = NULL; - goto negative_out; - } else if (unlikely(err)) - return ERR_PTR(err); - - debugln("%s, %s (nid %llu) found, d_type %u", __func__, - dentry->d_name.name, nid, d_type); - - inode = erofs_iget(dir->i_sb, nid, d_type == EROFS_FT_DIR); - if (IS_ERR(inode)) - return ERR_CAST(inode); - -negative_out: + } else if (err) { + inode = ERR_PTR(err); + } else { + erofs_dbg("%s, %s (nid %llu) found, d_type %u", __func__, + dentry->d_name.name, nid, d_type); + inode = erofs_iget(dir->i_sb, nid, d_type == EROFS_FT_DIR); + } return d_splice_alias(inode, dentry); } const struct inode_operations 
erofs_dir_iops = { .lookup = erofs_lookup, -}; - -const struct inode_operations erofs_dir_xattr_iops = { - .lookup = erofs_lookup, -#ifdef CONFIG_EROFS_FS_XATTR + .getattr = erofs_getattr, .listxattr = erofs_listxattr, -#endif + .get_acl = erofs_get_acl, }; - diff --git a/drivers/staging/erofs/super.c b/drivers/staging/erofs/super.c index b49ebdf6ebda..13977976c719 100644 --- a/drivers/staging/erofs/super.c +++ b/drivers/staging/erofs/super.c @@ -1,188 +1,248 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: GPL-2.0-only /* - * linux/drivers/staging/erofs/super.c - * * Copyright (C) 2017-2018 HUAWEI, Inc. - * http://www.huawei.com/ - * Created by Gao Xiang - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of the Linux - * distribution for more details. + * https://www.huawei.com/ */ #include #include #include #include #include -#include "internal.h" +#include +#include "xattr.h" #define CREATE_TRACE_POINTS #include static struct kmem_cache *erofs_inode_cachep __read_mostly; -static void init_once(void *ptr) +void _erofs_err(struct super_block *sb, const char *function, + const char *fmt, ...) { - struct erofs_vnode *vi = ptr; + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_err("(device %s): %s: %pV", sb->s_id, function, &vaf); + va_end(args); +} + +void _erofs_info(struct super_block *sb, const char *function, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + pr_info("(device %s): %pV", sb->s_id, &vaf); + va_end(args); +} + +static int erofs_superblock_csum_verify(struct super_block *sb, void *sbdata) +{ + struct erofs_super_block *dsb; + u32 expected_crc, crc; + + dsb = kmemdup(sbdata + EROFS_SUPER_OFFSET, + EROFS_BLKSIZ - EROFS_SUPER_OFFSET, GFP_KERNEL); + if (!dsb) + return -ENOMEM; + + expected_crc = le32_to_cpu(dsb->checksum); + dsb->checksum = 0; + /* to allow for x86 boot sectors and other oddities. */ + crc = crc32c(~0, dsb, EROFS_BLKSIZ - EROFS_SUPER_OFFSET); + kfree(dsb); + + if (crc != expected_crc) { + erofs_err(sb, "invalid checksum 0x%08x, 0x%08x expected", + crc, expected_crc); + return -EBADMSG; + } + return 0; +} + +static void erofs_inode_init_once(void *ptr) +{ + struct erofs_inode *vi = ptr; inode_init_once(&vi->vfs_inode); } -static int erofs_init_inode_cache(void) +static struct inode *erofs_alloc_inode(struct super_block *sb) { - erofs_inode_cachep = kmem_cache_create("erofs_inode", - sizeof(struct erofs_vnode), 0, - SLAB_RECLAIM_ACCOUNT, init_once); - - return erofs_inode_cachep != NULL ? 0 : -ENOMEM; -} - -static void erofs_exit_inode_cache(void) -{ - kmem_cache_destroy(erofs_inode_cachep); -} - -static struct inode *alloc_inode(struct super_block *sb) -{ - struct erofs_vnode *vi = + struct erofs_inode *vi = kmem_cache_alloc(erofs_inode_cachep, GFP_KERNEL); - if (vi == NULL) + if (!vi) return NULL; /* zero out everything except vfs_inode */ - memset(vi, 0, offsetof(struct erofs_vnode, vfs_inode)); + memset(vi, 0, offsetof(struct erofs_inode, vfs_inode)); return &vi->vfs_inode; } static void i_callback(struct rcu_head *head) { struct inode *inode = container_of(head, struct inode, i_rcu); - struct erofs_vnode *vi = EROFS_V(inode); + struct erofs_inode *vi = EROFS_I(inode); - /* be careful RCU symlink path (see ext4_inode_info->i_data)! 
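 * (An aside condensing the idiom this hunk moves to; all names below
 * are the ones the patch itself uses. A fast symlink keeps its target
 * string in a separately allocated buffer that ->i_link points at, so
 * the RCU callback must free that buffer before the inode slab object:
 *
 *	static void i_callback(struct rcu_head *head)
 *	{
 *		struct inode *inode =
 *			container_of(head, struct inode, i_rcu);
 *
 *		if (inode->i_op == &erofs_fast_symlink_iops)
 *			kfree(inode->i_link);
 *		kmem_cache_free(erofs_inode_cachep, EROFS_I(inode));
 *	}
 *
 * Deferring the free to an RCU grace period guarantees that no
 * lockless path walk still dereferences ->i_link at this point.)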
*/ - if (is_inode_fast_symlink(inode)) + /* be careful of RCU symlink path */ + if (inode->i_op == &erofs_fast_symlink_iops) kfree(inode->i_link); - kfree(vi->xattr_shared_xattrs); kmem_cache_free(erofs_inode_cachep, vi); } -static void destroy_inode(struct inode *inode) +static void erofs_destroy_inode(struct inode *inode) { call_rcu(&inode->i_rcu, i_callback); } static bool check_layout_compatibility(struct super_block *sb, - struct erofs_super_block *layout) + struct erofs_super_block *dsb) { - const unsigned int requirements = le32_to_cpu(layout->requirements); + const unsigned int feature = le32_to_cpu(dsb->feature_incompat); - EROFS_SB(sb)->requirements = requirements; + EROFS_SB(sb)->feature_incompat = feature; /* check if current kernel meets all mandatory requirements */ - if (requirements & (~EROFS_ALL_REQUIREMENTS)) { - errln("unidentified requirements %x, please upgrade kernel version", - requirements & ~EROFS_ALL_REQUIREMENTS); + if (feature & (~EROFS_ALL_FEATURE_INCOMPAT)) { + erofs_err(sb, + "unidentified incompatible feature %x, please upgrade kernel version", + feature & ~EROFS_ALL_FEATURE_INCOMPAT); return false; } return true; } -static int superblock_read(struct super_block *sb) +static int erofs_read_superblock(struct super_block *sb) { struct erofs_sb_info *sbi; - struct buffer_head *bh; - struct erofs_super_block *layout; - unsigned blkszbits; + struct page *page; + struct erofs_super_block *dsb; + unsigned int blkszbits; + void *data; int ret; - bh = sb_bread(sb, 0); - - if (bh == NULL) { - errln("cannot read erofs superblock"); - return -EIO; + page = read_mapping_page(sb->s_bdev->bd_inode->i_mapping, 0, NULL); + if (IS_ERR(page)) { + erofs_err(sb, "cannot read erofs superblock"); + return PTR_ERR(page); } sbi = EROFS_SB(sb); - layout = (struct erofs_super_block *)((u8 *)bh->b_data - + EROFS_SUPER_OFFSET); + + data = kmap(page); + dsb = (struct erofs_super_block *)(data + EROFS_SUPER_OFFSET); ret = -EINVAL; - if (le32_to_cpu(layout->magic) != EROFS_SUPER_MAGIC_V1) { - errln("cannot find valid erofs superblock"); + if (le32_to_cpu(dsb->magic) != EROFS_SUPER_MAGIC_V1) { + erofs_err(sb, "cannot find valid erofs superblock"); goto out; } - blkszbits = layout->blkszbits; + sbi->feature_compat = le32_to_cpu(dsb->feature_compat); + if (sbi->feature_compat & EROFS_FEATURE_COMPAT_SB_CHKSUM) { + ret = erofs_superblock_csum_verify(sb, data); + if (ret) + goto out; + } + + ret = -EINVAL; + blkszbits = dsb->blkszbits; /* 9(512 bytes) + LOG_SECTORS_PER_BLOCK == LOG_BLOCK_SIZE */ - if (unlikely(blkszbits != LOG_BLOCK_SIZE)) { - errln("blksize %u isn't supported on this platform", - 1 << blkszbits); + if (blkszbits != LOG_BLOCK_SIZE) { + erofs_err(sb, "blkszbits %u isn't supported on this platform", + blkszbits); goto out; } - if (!check_layout_compatibility(sb, layout)) + if (!check_layout_compatibility(sb, dsb)) goto out; - sbi->blocks = le32_to_cpu(layout->blocks); - sbi->meta_blkaddr = le32_to_cpu(layout->meta_blkaddr); + sbi->blocks = le32_to_cpu(dsb->blocks); + sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr); #ifdef CONFIG_EROFS_FS_XATTR - sbi->xattr_blkaddr = le32_to_cpu(layout->xattr_blkaddr); + sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); #endif - sbi->islotbits = ffs(sizeof(struct erofs_inode_v1)) - 1; -#ifdef CONFIG_EROFS_FS_ZIP - sbi->clusterbits = 12; + sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact)); + sbi->root_nid = le16_to_cpu(dsb->root_nid); + sbi->inos = le64_to_cpu(dsb->inos); - if (1 << (sbi->clusterbits - 12) > 
Z_EROFS_CLUSTER_MAX_PAGES) - errln("clusterbits %u is not supported on this kernel", - sbi->clusterbits); -#endif + sbi->build_time = le64_to_cpu(dsb->build_time); + sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec); - sbi->root_nid = le16_to_cpu(layout->root_nid); - sbi->inos = le64_to_cpu(layout->inos); - - sbi->build_time = le64_to_cpu(layout->build_time); - sbi->build_time_nsec = le32_to_cpu(layout->build_time_nsec); - - memcpy(&sb->s_uuid, layout->uuid, sizeof(layout->uuid)); - memcpy(sbi->volume_name, layout->volume_name, - sizeof(layout->volume_name)); + memcpy(&sb->s_uuid, dsb->uuid, sizeof(dsb->uuid)); + ret = strscpy(sbi->volume_name, dsb->volume_name, + sizeof(dsb->volume_name)); + if (ret < 0) { /* -E2BIG */ + erofs_err(sb, "bad volume name without NIL terminator"); + ret = -EFSCORRUPTED; + goto out; + } ret = 0; out: - brelse(bh); + kunmap(page); + put_page(page); return ret; } -#ifdef CONFIG_EROFS_FAULT_INJECTION -char *erofs_fault_name[FAULT_MAX] = { - [FAULT_KMALLOC] = "kmalloc", -}; - -static void erofs_build_fault_attr(struct erofs_sb_info *sbi, - unsigned int rate) +#ifdef CONFIG_EROFS_FS_ZIP +static int erofs_build_cache_strategy(struct super_block *sb, + substring_t *args) { - struct erofs_fault_info *ffi = &sbi->fault_info; + struct erofs_sb_info *sbi = EROFS_SB(sb); + const char *cs = match_strdup(args); + int err = 0; - if (rate) { - atomic_set(&ffi->inject_ops, 0); - ffi->inject_rate = rate; - ffi->inject_type = (1 << FAULT_MAX) - 1; - } else { - memset(ffi, 0, sizeof(struct erofs_fault_info)); + if (!cs) { + erofs_err(sb, "Not enough memory to store cache strategy"); + return -ENOMEM; } + + if (!strcmp(cs, "disabled")) { + sbi->cache_strategy = EROFS_ZIP_CACHE_DISABLED; + } else if (!strcmp(cs, "readahead")) { + sbi->cache_strategy = EROFS_ZIP_CACHE_READAHEAD; + } else if (!strcmp(cs, "readaround")) { + sbi->cache_strategy = EROFS_ZIP_CACHE_READAROUND; + } else { + erofs_err(sb, "Unrecognized cache strategy \"%s\"", cs); + err = -EINVAL; + } + kfree(cs); + return err; +} +#else +static int erofs_build_cache_strategy(struct super_block *sb, + substring_t *args) +{ + erofs_info(sb, "EROFS compression is disabled, so cache strategy is ignored"); + return 0; } #endif -static void default_options(struct erofs_sb_info *sbi) +/* set up default EROFS parameters */ +static void erofs_default_options(struct erofs_sb_info *sbi) { +#ifdef CONFIG_EROFS_FS_ZIP + sbi->cache_strategy = EROFS_ZIP_CACHE_READAROUND; + sbi->max_sync_decompress_pages = 3; +#endif #ifdef CONFIG_EROFS_FS_XATTR set_opt(sbi, XATTR_USER); #endif - #ifdef CONFIG_EROFS_FS_POSIX_ACL set_opt(sbi, POSIX_ACL); #endif @@ -193,7 +253,7 @@ enum { Opt_nouser_xattr, Opt_acl, Opt_noacl, - Opt_fault_injection, + Opt_cache_strategy, Opt_err }; @@ -202,20 +262,20 @@ static match_table_t erofs_tokens = { {Opt_nouser_xattr, "nouser_xattr"}, {Opt_acl, "acl"}, {Opt_noacl, "noacl"}, - {Opt_fault_injection, "fault_injection=%u"}, + {Opt_cache_strategy, "cache_strategy=%s"}, {Opt_err, NULL} }; -static int parse_options(struct super_block *sb, char *options) +static int erofs_parse_options(struct super_block *sb, char *options) { substring_t args[MAX_OPT_ARGS]; char *p; - int arg = 0; + int err; if (!options) return 0; - while ((p = strsep(&options, ",")) != NULL) { + while ((p = strsep(&options, ","))) { int token; if (!*p) @@ -234,10 +294,10 @@ static int parse_options(struct super_block *sb, char *options) break; #else case Opt_user_xattr: - infoln("user_xattr options not supported"); + erofs_info(sb, "user_xattr 
options not supported"); break; case Opt_nouser_xattr: - infoln("nouser_xattr options not supported"); + erofs_info(sb, "nouser_xattr options not supported"); break; #endif #ifdef CONFIG_EROFS_FS_POSIX_ACL @@ -249,36 +309,29 @@ static int parse_options(struct super_block *sb, char *options) break; #else case Opt_acl: - infoln("acl options not supported"); + erofs_info(sb, "acl options not supported"); break; case Opt_noacl: - infoln("noacl options not supported"); + erofs_info(sb, "noacl options not supported"); break; #endif - case Opt_fault_injection: - if (args->from && match_int(args, &arg)) - return -EINVAL; -#ifdef CONFIG_EROFS_FAULT_INJECTION - erofs_build_fault_attr(EROFS_SB(sb), arg); - set_opt(EROFS_SB(sb), FAULT_INJECTION); -#else - infoln("FAULT_INJECTION was not selected"); -#endif + case Opt_cache_strategy: + err = erofs_build_cache_strategy(sb, args); + if (err) + return err; break; default: - errln("Unrecognized mount option \"%s\" " - "or missing value", p); + erofs_err(sb, "Unrecognized mount option \"%s\" or missing value", p); return -EINVAL; } } return 0; } -#ifdef EROFS_FS_HAS_MANAGED_CACHE - +#ifdef CONFIG_EROFS_FS_ZIP static const struct address_space_operations managed_cache_aops; -static int managed_cache_releasepage(struct page *page, gfp_t gfp_mask) +static int erofs_managed_cache_releasepage(struct page *page, gfp_t gfp_mask) { int ret = 1; /* 0 - busy */ struct address_space *const mapping = page->mapping; @@ -292,8 +345,9 @@ static int managed_cache_releasepage(struct page *page, gfp_t gfp_mask) return ret; } -static void managed_cache_invalidatepage(struct page *page, - unsigned int offset, unsigned int length) +static void erofs_managed_cache_invalidatepage(struct page *page, + unsigned int offset, + unsigned int length) { const unsigned int stop = length + offset; @@ -303,227 +357,147 @@ static void managed_cache_invalidatepage(struct page *page, DBG_BUGON(stop > PAGE_SIZE || stop < length); if (offset == 0 && stop == PAGE_SIZE) - while (!managed_cache_releasepage(page, GFP_NOFS)) + while (!erofs_managed_cache_releasepage(page, GFP_NOFS)) cond_resched(); } static const struct address_space_operations managed_cache_aops = { - .releasepage = managed_cache_releasepage, - .invalidatepage = managed_cache_invalidatepage, + .releasepage = erofs_managed_cache_releasepage, + .invalidatepage = erofs_managed_cache_invalidatepage, }; -static struct inode *erofs_init_managed_cache(struct super_block *sb) +static int erofs_init_managed_cache(struct super_block *sb) { - struct inode *inode = new_inode(sb); + struct erofs_sb_info *const sbi = EROFS_SB(sb); + struct inode *const inode = new_inode(sb); - if (unlikely(inode == NULL)) - return ERR_PTR(-ENOMEM); + if (!inode) + return -ENOMEM; set_nlink(inode, 1); inode->i_size = OFFSET_MAX; inode->i_mapping->a_ops = &managed_cache_aops; mapping_set_gfp_mask(inode->i_mapping, - GFP_NOFS | __GFP_HIGHMEM | - __GFP_MOVABLE | __GFP_NOFAIL); - return inode; + GFP_NOFS | __GFP_HIGHMEM | __GFP_MOVABLE); + sbi->managed_cache = inode; + return 0; } - +#else +static int erofs_init_managed_cache(struct super_block *sb) { return 0; } #endif -static int erofs_read_super(struct super_block *sb, - const char *dev_name, void *data, int silent) +static int erofs_fill_super(struct super_block *sb, void *data, int silent) { struct inode *inode; struct erofs_sb_info *sbi; - int err = -EINVAL; - - infoln("read_super, device -> %s", dev_name); - infoln("options -> %s", (char *)data); - - if (unlikely(!sb_set_blocksize(sb, EROFS_BLKSIZ))) { - 
errln("failed to set erofs blksize"); - goto err; - } - - sbi = kzalloc(sizeof(struct erofs_sb_info), GFP_KERNEL); - if (unlikely(sbi == NULL)) { - err = -ENOMEM; - goto err; - } - sb->s_fs_info = sbi; - - err = superblock_read(sb); - if (err) - goto err_sbread; + int err; sb->s_magic = EROFS_SUPER_MAGIC; + + if (!sb_set_blocksize(sb, EROFS_BLKSIZ)) { + erofs_err(sb, "failed to set erofs blksize"); + return -EINVAL; + } + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + + sb->s_fs_info = sbi; + err = erofs_read_superblock(sb); + if (err) + return err; + sb->s_flags |= SB_RDONLY | SB_NOATIME; sb->s_maxbytes = MAX_LFS_FILESIZE; sb->s_time_gran = 1; sb->s_op = &erofs_sops; - -#ifdef CONFIG_EROFS_FS_XATTR sb->s_xattr = erofs_xattr_handlers; -#endif /* set erofs default mount options */ - default_options(sbi); + erofs_default_options(sbi); - err = parse_options(sb, data); + err = erofs_parse_options(sb, data); if (err) - goto err_parseopt; + return err; - if (!silent) - infoln("root inode @ nid %llu", ROOT_NID(sbi)); + if (test_opt(sbi, POSIX_ACL)) + sb->s_flags |= SB_POSIXACL; + else + sb->s_flags &= ~SB_POSIXACL; #ifdef CONFIG_EROFS_FS_ZIP INIT_RADIX_TREE(&sbi->workstn_tree, GFP_ATOMIC); #endif -#ifdef EROFS_FS_HAS_MANAGED_CACHE - sbi->managed_cache = erofs_init_managed_cache(sb); - if (IS_ERR(sbi->managed_cache)) { - err = PTR_ERR(sbi->managed_cache); - goto err_init_managed_cache; - } -#endif - /* get the root inode */ inode = erofs_iget(sb, ROOT_NID(sbi), true); - if (IS_ERR(inode)) { - err = PTR_ERR(inode); - goto err_iget; - } + if (IS_ERR(inode)) + return PTR_ERR(inode); if (!S_ISDIR(inode->i_mode)) { - errln("rootino(nid %llu) is not a directory(i_mode %o)", - ROOT_NID(sbi), inode->i_mode); - err = -EINVAL; - goto err_isdir; + erofs_err(sb, "rootino(nid %llu) is not a directory(i_mode %o)", + ROOT_NID(sbi), inode->i_mode); + iput(inode); + return -EINVAL; } sb->s_root = d_make_root(inode); - if (sb->s_root == NULL) { - err = -ENOMEM; - goto err_makeroot; - } + if (!sb->s_root) + return -ENOMEM; - /* save the device name to sbi */ - sbi->dev_name = __getname(); - if (sbi->dev_name == NULL) { - err = -ENOMEM; - goto err_devname; - } + erofs_shrinker_register(sb); + /* sb->s_umount is already locked, SB_ACTIVE and SB_BORN are not set */ + err = erofs_init_managed_cache(sb); + if (err) + return err; - snprintf(sbi->dev_name, PATH_MAX, "%s", dev_name); - sbi->dev_name[PATH_MAX - 1] = '\0'; - - erofs_register_super(sb); - - if (!silent) - infoln("mounted on %s with opts: %s.", dev_name, - (char *)data); + erofs_info(sb, "mounted with opts: %s, root inode @ nid %llu.", + (char *)data, ROOT_NID(sbi)); return 0; - /* - * please add a label for each exit point and use - * the following name convention, thus new features - * can be integrated easily without renaming labels. - */ -err_devname: - dput(sb->s_root); -err_makeroot: -err_isdir: - if (sb->s_root == NULL) - iput(inode); -err_iget: -#ifdef EROFS_FS_HAS_MANAGED_CACHE - iput(sbi->managed_cache); -err_init_managed_cache: -#endif -err_parseopt: -err_sbread: - sb->s_fs_info = NULL; - kfree(sbi); -err: - return err; +} + +static struct dentry *erofs_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) +{ + return mount_bdev(fs_type, flags, dev_name, data, erofs_fill_super); } /* * could be triggered after deactivate_locked_super() * is called, thus including umount and failed to initialize. 
*/ -static void erofs_put_super(struct super_block *sb) +static void erofs_kill_sb(struct super_block *sb) { - struct erofs_sb_info *sbi = EROFS_SB(sb); - - /* for cases which are failed in "read_super" */ - if (sbi == NULL) - return; + struct erofs_sb_info *sbi; WARN_ON(sb->s_magic != EROFS_SUPER_MAGIC); - infoln("unmounted for %s", sbi->dev_name); - __putname(sbi->dev_name); - -#ifdef EROFS_FS_HAS_MANAGED_CACHE - iput(sbi->managed_cache); -#endif - - mutex_lock(&sbi->umount_mutex); - -#ifdef CONFIG_EROFS_FS_ZIP - erofs_workstation_cleanup_all(sb); -#endif - - erofs_unregister_super(sb); - mutex_unlock(&sbi->umount_mutex); + kill_block_super(sb); + sbi = EROFS_SB(sb); + if (!sbi) + return; kfree(sbi); sb->s_fs_info = NULL; } - -struct erofs_mount_private { - const char *dev_name; - char *options; -}; - -/* support mount_bdev() with options */ -static int erofs_fill_super(struct super_block *sb, - void *_priv, int silent) +/* called when ->s_root is non-NULL */ +static void erofs_put_super(struct super_block *sb) { - struct erofs_mount_private *priv = _priv; + struct erofs_sb_info *const sbi = EROFS_SB(sb); - return erofs_read_super(sb, priv->dev_name, - priv->options, silent); + DBG_BUGON(!sbi); + + erofs_shrinker_unregister(sb); +#ifdef CONFIG_EROFS_FS_ZIP + iput(sbi->managed_cache); + sbi->managed_cache = NULL; +#endif } -static struct dentry *erofs_mount( - struct file_system_type *fs_type, int flags, - const char *dev_name, void *data) -{ - struct erofs_mount_private priv = { - .dev_name = dev_name, - .options = data - }; - - return mount_bdev(fs_type, flags, dev_name, - &priv, erofs_fill_super); -} - -static void erofs_kill_sb(struct super_block *sb) -{ - kill_block_super(sb); -} - -static struct shrinker erofs_shrinker_info = { - .scan_objects = erofs_shrink_scan, - .count_objects = erofs_shrink_count, - .seeks = DEFAULT_SEEKS, -}; - static struct file_system_type erofs_fs_type = { .owner = THIS_MODULE, .name = "erofs", @@ -533,47 +507,41 @@ static struct file_system_type erofs_fs_type = { }; MODULE_ALIAS_FS("erofs"); -#ifdef CONFIG_EROFS_FS_ZIP -extern int z_erofs_init_zip_subsystem(void); -extern void z_erofs_exit_zip_subsystem(void); -#endif - static int __init erofs_module_init(void) { int err; erofs_check_ondisk_layout_definitions(); - infoln("initializing erofs " EROFS_VERSION); - err = erofs_init_inode_cache(); - if (err) + erofs_inode_cachep = kmem_cache_create("erofs_inode", + sizeof(struct erofs_inode), 0, + SLAB_RECLAIM_ACCOUNT, + erofs_inode_init_once); + if (!erofs_inode_cachep) { + err = -ENOMEM; goto icache_err; + } - err = register_shrinker(&erofs_shrinker_info); + err = erofs_init_shrinker(); if (err) goto shrinker_err; -#ifdef CONFIG_EROFS_FS_ZIP err = z_erofs_init_zip_subsystem(); if (err) goto zip_err; -#endif err = register_filesystem(&erofs_fs_type); if (err) goto fs_err; - infoln("successfully to initialize erofs"); return 0; fs_err: -#ifdef CONFIG_EROFS_FS_ZIP z_erofs_exit_zip_subsystem(); zip_err: -#endif - unregister_shrinker(&erofs_shrinker_info); + erofs_exit_shrinker(); shrinker_err: - erofs_exit_inode_cache(); + kmem_cache_destroy(erofs_inode_cachep); icache_err: return err; } @@ -581,12 +549,12 @@ static int __init erofs_module_init(void) static void __exit erofs_module_exit(void) { unregister_filesystem(&erofs_fs_type); -#ifdef CONFIG_EROFS_FS_ZIP z_erofs_exit_zip_subsystem(); -#endif - unregister_shrinker(&erofs_shrinker_info); - erofs_exit_inode_cache(); - infoln("successfully finalize erofs"); + erofs_exit_shrinker(); + + /* Ensure all RCU free 
inodes are safe before cache is destroyed. */ + rcu_barrier(); + kmem_cache_destroy(erofs_inode_cachep); } /* get filesystem statistics */ @@ -627,26 +595,45 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) else seq_puts(seq, ",noacl"); #endif -#ifdef CONFIG_EROFS_FAULT_INJECTION - if (test_opt(sbi, FAULT_INJECTION)) - seq_printf(seq, ",fault_injection=%u", - sbi->fault_info.inject_rate); +#ifdef CONFIG_EROFS_FS_ZIP + if (sbi->cache_strategy == EROFS_ZIP_CACHE_DISABLED) { + seq_puts(seq, ",cache_strategy=disabled"); + } else if (sbi->cache_strategy == EROFS_ZIP_CACHE_READAHEAD) { + seq_puts(seq, ",cache_strategy=readahead"); + } else if (sbi->cache_strategy == EROFS_ZIP_CACHE_READAROUND) { + seq_puts(seq, ",cache_strategy=readaround"); + } #endif return 0; } static int erofs_remount(struct super_block *sb, int *flags, char *data) { + struct erofs_sb_info *sbi = EROFS_SB(sb); + unsigned int org_mnt_opt = sbi->mount_opt; + int err; + DBG_BUGON(!sb_rdonly(sb)); + err = erofs_parse_options(sb, data); + if (err) + goto out; + + if (test_opt(sbi, POSIX_ACL)) + sb->s_flags |= SB_POSIXACL; + else + sb->s_flags &= ~SB_POSIXACL; *flags |= SB_RDONLY; return 0; +out: + sbi->mount_opt = org_mnt_opt; + return err; } const struct super_operations erofs_sops = { .put_super = erofs_put_super, - .alloc_inode = alloc_inode, - .destroy_inode = destroy_inode, + .alloc_inode = erofs_alloc_inode, + .destroy_inode = erofs_destroy_inode, .statfs = erofs_statfs, .show_options = erofs_show_options, .remount_fs = erofs_remount, @@ -656,6 +643,5 @@ module_init(erofs_module_init); module_exit(erofs_module_exit); MODULE_DESCRIPTION("Enhanced ROM File System"); -MODULE_AUTHOR("Gao Xiang, Yu Chao, Miao Xie, CONSUMER BG, HUAWEI Inc."); +MODULE_AUTHOR("Gao Xiang, Chao Yu, Miao Xie, CONSUMER BG, HUAWEI Inc."); MODULE_LICENSE("GPL"); - diff --git a/drivers/staging/erofs/include/linux/tagptr.h b/drivers/staging/erofs/tagptr.h similarity index 93% rename from drivers/staging/erofs/include/linux/tagptr.h rename to drivers/staging/erofs/tagptr.h index ccd106dbd48e..64ceb7270b5c 100644 --- a/drivers/staging/erofs/include/linux/tagptr.h +++ b/drivers/staging/erofs/tagptr.h @@ -1,11 +1,9 @@ -/* SPDX-License-Identifier: GPL-2.0 - * - * Tagged pointer implementation - * - * Copyright (C) 2018 Gao Xiang +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * A tagged pointer implementation */ -#ifndef _LINUX_TAGPTR_H -#define _LINUX_TAGPTR_H +#ifndef __EROFS_FS_TAGPTR_H +#define __EROFS_FS_TAGPTR_H #include #include @@ -106,5 +104,4 @@ tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); }) ptptr->v &= ~tags; \ *ptptr; }) -#endif - +#endif /* __EROFS_FS_TAGPTR_H */ diff --git a/drivers/staging/erofs/unzip_lz4.c b/drivers/staging/erofs/unzip_lz4.c deleted file mode 100644 index b1ea23f66c4e..000000000000 --- a/drivers/staging/erofs/unzip_lz4.c +++ /dev/null @@ -1,251 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause -/* - * linux/drivers/staging/erofs/unzip_lz4.c - * - * Copyright (C) 2018 HUAWEI, Inc. - * http://www.huawei.com/ - * Created by Gao Xiang - * - * Original code taken from 'linux/lib/lz4/lz4_decompress.c' - */ - -/* - * LZ4 - Fast LZ compression algorithm - * Copyright (C) 2011 - 2016, Yann Collet. 
- * BSD 2 - Clause License (http://www.opensource.org/licenses/bsd - license.php) - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: - * * Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * * Redistributions in binary form must reproduce the above - * copyright notice, this list of conditions and the following disclaimer - * in the documentation and/or other materials provided with the - * distribution. - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * You can contact the author at : - * - LZ4 homepage : http://www.lz4.org - * - LZ4 source repository : https://github.com/lz4/lz4 - * - * Changed for kernel usage by: - * Sven Schmidt <4sschmid@informatik.uni-hamburg.de> - */ -#include "internal.h" -#include -#include "lz4defs.h" - -/* - * no public solution to solve our requirement yet. - * see: - * https://groups.google.com/forum/#!topic/lz4c/_3kkz5N6n00 - */ -static FORCE_INLINE int customized_lz4_decompress_safe_partial( - const void * const source, - void * const dest, - int inputSize, - int outputSize) -{ - /* Local Variables */ - const BYTE *ip = (const BYTE *) source; - const BYTE * const iend = ip + inputSize; - - BYTE *op = (BYTE *) dest; - BYTE * const oend = op + outputSize; - BYTE *cpy; - - static const unsigned int dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; - static const int dec64table[] = { 0, 0, 0, -1, 0, 1, 2, 3 }; - - /* Empty output buffer */ - if (unlikely(outputSize == 0)) - return ((inputSize == 1) && (*ip == 0)) ? 
0 : -1; - - /* Main Loop : decode sequences */ - while (1) { - size_t length; - const BYTE *match; - size_t offset; - - /* get literal length */ - unsigned int const token = *ip++; - - length = token>>ML_BITS; - - if (length == RUN_MASK) { - unsigned int s; - - do { - s = *ip++; - length += s; - } while ((ip < iend - RUN_MASK) & (s == 255)); - - if (unlikely((size_t)(op + length) < (size_t)(op))) { - /* overflow detection */ - goto _output_error; - } - if (unlikely((size_t)(ip + length) < (size_t)(ip))) { - /* overflow detection */ - goto _output_error; - } - } - - /* copy literals */ - cpy = op + length; - if ((cpy > oend - WILDCOPYLENGTH) || - (ip + length > iend - (2 + 1 + LASTLITERALS))) { - if (cpy > oend) { - memcpy(op, ip, length = oend - op); - op += length; - break; - } - - if (unlikely(ip + length > iend)) { - /* - * Error : - * read attempt beyond - * end of input buffer - */ - goto _output_error; - } - - memcpy(op, ip, length); - ip += length; - op += length; - - if (ip > iend - 2) - break; - /* Necessarily EOF, due to parsing restrictions */ - /* break; */ - } else { - LZ4_wildCopy(op, ip, cpy); - ip += length; - op = cpy; - } - - /* get offset */ - offset = LZ4_readLE16(ip); - ip += 2; - match = op - offset; - - if (unlikely(match < (const BYTE *)dest)) { - /* Error : offset outside buffers */ - goto _output_error; - } - - /* get matchlength */ - length = token & ML_MASK; - if (length == ML_MASK) { - unsigned int s; - - do { - s = *ip++; - - if (ip > iend - LASTLITERALS) - goto _output_error; - - length += s; - } while (s == 255); - - if (unlikely((size_t)(op + length) < (size_t)op)) { - /* overflow detection */ - goto _output_error; - } - } - - length += MINMATCH; - - /* copy match within block */ - cpy = op + length; - - if (unlikely(cpy >= oend - WILDCOPYLENGTH)) { - if (cpy >= oend) { - while (op < oend) - *op++ = *match++; - break; - } - goto __match; - } - - /* costs ~1%; silence an msan warning when offset == 0 */ - LZ4_write32(op, (U32)offset); - - if (unlikely(offset < 8)) { - const int dec64 = dec64table[offset]; - - op[0] = match[0]; - op[1] = match[1]; - op[2] = match[2]; - op[3] = match[3]; - match += dec32table[offset]; - memcpy(op + 4, match, 4); - match -= dec64; - } else { - LZ4_copy8(op, match); - match += 8; - } - - op += 8; - - if (unlikely(cpy > oend - 12)) { - BYTE * const oCopyLimit = oend - (WILDCOPYLENGTH - 1); - - if (op < oCopyLimit) { - LZ4_wildCopy(op, match, oCopyLimit); - match += oCopyLimit - op; - op = oCopyLimit; - } -__match: - while (op < cpy) - *op++ = *match++; - } else { - LZ4_copy8(op, match); - - if (length > 16) - LZ4_wildCopy(op + 8, match + 8, cpy); - } - - op = cpy; /* correction */ - } - DBG_BUGON((void *)ip - source > inputSize); - DBG_BUGON((void *)op - dest > outputSize); - - /* Nb of output bytes decoded */ - return (int) ((void *)op - dest); - - /* Overflow error detected */ -_output_error: - return -ERANGE; -} - -int z_erofs_unzip_lz4(void *in, void *out, size_t inlen, size_t outlen) -{ - int ret = customized_lz4_decompress_safe_partial(in, - out, inlen, outlen); - - if (ret >= 0) - return ret; - - /* - * LZ4_decompress_safe will return an error code - * (< 0) if decompression failed - */ - errln("%s, failed to decompress, in[%p, %zu] outlen[%p, %zu]", - __func__, in, inlen, out, outlen); - WARN_ON(1); - print_hex_dump(KERN_DEBUG, "raw data [in]: ", DUMP_PREFIX_OFFSET, - 16, 1, in, inlen, true); - print_hex_dump(KERN_DEBUG, "raw data [out]: ", DUMP_PREFIX_OFFSET, - 16, 1, out, outlen, true); - return -EIO; -} - diff --git 
a/drivers/staging/erofs/unzip_vle.c b/drivers/staging/erofs/unzip_vle.c deleted file mode 100644 index 0f1558c6747e..000000000000 --- a/drivers/staging/erofs/unzip_vle.c +++ /dev/null @@ -1,1695 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/drivers/staging/erofs/unzip_vle.c - * - * Copyright (C) 2018 HUAWEI, Inc. - * http://www.huawei.com/ - * Created by Gao Xiang - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of the Linux - * distribution for more details. - */ -#include "unzip_vle.h" -#include - -static struct workqueue_struct *z_erofs_workqueue __read_mostly; -static struct kmem_cache *z_erofs_workgroup_cachep __read_mostly; - -void z_erofs_exit_zip_subsystem(void) -{ - destroy_workqueue(z_erofs_workqueue); - kmem_cache_destroy(z_erofs_workgroup_cachep); -} - -static inline int init_unzip_workqueue(void) -{ - const unsigned onlinecpus = num_possible_cpus(); - - /* - * we don't need too many threads, limiting threads - * could improve scheduling performance. - */ - z_erofs_workqueue = alloc_workqueue("erofs_unzipd", - WQ_UNBOUND | WQ_HIGHPRI | WQ_CPU_INTENSIVE, - onlinecpus + onlinecpus / 4); - - return z_erofs_workqueue != NULL ? 0 : -ENOMEM; -} - -int z_erofs_init_zip_subsystem(void) -{ - z_erofs_workgroup_cachep = - kmem_cache_create("erofs_compress", - Z_EROFS_WORKGROUP_SIZE, 0, - SLAB_RECLAIM_ACCOUNT, NULL); - - if (z_erofs_workgroup_cachep != NULL) { - if (!init_unzip_workqueue()) - return 0; - - kmem_cache_destroy(z_erofs_workgroup_cachep); - } - return -ENOMEM; -} - -enum z_erofs_vle_work_role { - Z_EROFS_VLE_WORK_SECONDARY, - Z_EROFS_VLE_WORK_PRIMARY, - /* - * The current work was the tail of an exist chain, and the previous - * processed chained works are all decided to be hooked up to it. - * A new chain should be created for the remaining unprocessed works, - * therefore different from Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED, - * the next work cannot reuse the whole page in the following scenario: - * ________________________________________________________________ - * | tail (partial) page | head (partial) page | - * | (belongs to the next work) | (belongs to the current work) | - * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________| - */ - Z_EROFS_VLE_WORK_PRIMARY_HOOKED, - /* - * The current work has been linked with the processed chained works, - * and could be also linked with the potential remaining works, which - * means if the processing page is the tail partial page of the work, - * the current work can safely use the whole page (since the next work - * is under control) for in-place decompression, as illustrated below: - * ________________________________________________________________ - * | tail (partial) page | head (partial) page | - * | (of the current work) | (of the previous work) | - * | PRIMARY_FOLLOWED or | | - * |_____PRIMARY_HOOKED____|____________PRIMARY_FOLLOWED____________| - * - * [ (*) the above page can be used for the current work itself. ] - */ - Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED, - Z_EROFS_VLE_WORK_MAX -}; - -struct z_erofs_vle_work_builder { - enum z_erofs_vle_work_role role; - /* - * 'hosted = false' means that the current workgroup doesn't belong to - * the owned chained workgroups. In the other words, it is none of our - * business to submit this workgroup. 
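 * An illustrative picture of that ownership rule (a restatement for
 * clarity, not new behaviour): each chain of chained workgroups has
 * exactly one owner, and only the owner walks the chain and submits
 * its compressed I/O:
 *
 *	our owned_head --> grp A --> grp B --> TAIL
 *	    (hosted == true for A and B: we must submit them)
 *
 *	another thread's chain --> grp C
 *	    (hosted == false for C: we merely attached pages and drop
 *	     our reference afterwards; its owner submits it)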
- */ - bool hosted; - - struct z_erofs_vle_workgroup *grp; - struct z_erofs_vle_work *work; - struct z_erofs_pagevec_ctor vector; - - /* pages used for reading the compressed data */ - struct page **compressed_pages; - unsigned compressed_deficit; -}; - -#define VLE_WORK_BUILDER_INIT() \ - { .work = NULL, .role = Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED } - -#ifdef EROFS_FS_HAS_MANAGED_CACHE - -static bool grab_managed_cache_pages(struct address_space *mapping, - erofs_blk_t start, - struct page **compressed_pages, - int clusterblks, - bool reserve_allocation) -{ - bool noio = true; - unsigned int i; - - /* TODO: optimize by introducing find_get_pages_range */ - for (i = 0; i < clusterblks; ++i) { - struct page *page, *found; - - if (READ_ONCE(compressed_pages[i]) != NULL) - continue; - - page = found = find_get_page(mapping, start + i); - if (found == NULL) { - noio = false; - if (!reserve_allocation) - continue; - page = EROFS_UNALLOCATED_CACHED_PAGE; - } - - if (NULL == cmpxchg(compressed_pages + i, NULL, page)) - continue; - - if (found != NULL) - put_page(found); - } - return noio; -} - -/* called by erofs_shrinker to get rid of all compressed_pages */ -int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, - struct erofs_workgroup *egrp) -{ - struct z_erofs_vle_workgroup *const grp = - container_of(egrp, struct z_erofs_vle_workgroup, obj); - struct address_space *const mapping = sbi->managed_cache->i_mapping; - const int clusterpages = erofs_clusterpages(sbi); - int i; - - /* - * refcount of workgroup is now freezed as 1, - * therefore no need to worry about available decompression users. - */ - for (i = 0; i < clusterpages; ++i) { - struct page *page = grp->compressed_pages[i]; - - if (page == NULL || page->mapping != mapping) - continue; - - /* block other users from reclaiming or migrating the page */ - if (!trylock_page(page)) - return -EBUSY; - - /* barrier is implied in the following 'unlock_page' */ - WRITE_ONCE(grp->compressed_pages[i], NULL); - - set_page_private(page, 0); - ClearPagePrivate(page); - - unlock_page(page); - put_page(page); - } - return 0; -} - -int erofs_try_to_free_cached_page(struct address_space *mapping, - struct page *page) -{ - struct erofs_sb_info *const sbi = EROFS_SB(mapping->host->i_sb); - const unsigned int clusterpages = erofs_clusterpages(sbi); - - struct z_erofs_vle_workgroup *grp; - int ret = 0; /* 0 - busy */ - - /* prevent the workgroup from being freed */ - rcu_read_lock(); - grp = (void *)page_private(page); - - if (erofs_workgroup_try_to_freeze(&grp->obj, 1)) { - unsigned int i; - - for (i = 0; i < clusterpages; ++i) { - if (grp->compressed_pages[i] == page) { - WRITE_ONCE(grp->compressed_pages[i], NULL); - ret = 1; - break; - } - } - erofs_workgroup_unfreeze(&grp->obj, 1); - } - rcu_read_unlock(); - - if (ret) { - ClearPagePrivate(page); - put_page(page); - } - return ret; -} -#endif - -/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */ -static inline bool try_to_reuse_as_compressed_page( - struct z_erofs_vle_work_builder *b, - struct page *page) -{ - while (b->compressed_deficit) { - --b->compressed_deficit; - if (NULL == cmpxchg(b->compressed_pages++, NULL, page)) - return true; - } - - return false; -} - -/* callers must be with work->lock held */ -static int z_erofs_vle_work_add_page( - struct z_erofs_vle_work_builder *builder, - struct page *page, - enum z_erofs_page_type type) -{ - int ret; - bool occupied; - - /* give priority for the compressed data storage */ - if (builder->role >= Z_EROFS_VLE_WORK_PRIMARY && - type == 
Z_EROFS_PAGE_TYPE_EXCLUSIVE && - try_to_reuse_as_compressed_page(builder, page)) - return 0; - - ret = z_erofs_pagevec_ctor_enqueue(&builder->vector, - page, type, &occupied); - builder->work->vcnt += (unsigned)ret; - - return ret ? 0 : -EAGAIN; -} - -static enum z_erofs_vle_work_role -try_to_claim_workgroup(struct z_erofs_vle_workgroup *grp, - z_erofs_vle_owned_workgrp_t *owned_head, - bool *hosted) -{ - DBG_BUGON(*hosted == true); - - /* let's claim these following types of workgroup */ -retry: - if (grp->next == Z_EROFS_VLE_WORKGRP_NIL) { - /* type 1, nil workgroup */ - if (Z_EROFS_VLE_WORKGRP_NIL != cmpxchg(&grp->next, - Z_EROFS_VLE_WORKGRP_NIL, *owned_head)) - goto retry; - - *owned_head = grp; - *hosted = true; - /* lucky, I am the followee :) */ - return Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED; - - } else if (grp->next == Z_EROFS_VLE_WORKGRP_TAIL) { - /* - * type 2, link to the end of a existing open chain, - * be careful that its submission itself is governed - * by the original owned chain. - */ - if (Z_EROFS_VLE_WORKGRP_TAIL != cmpxchg(&grp->next, - Z_EROFS_VLE_WORKGRP_TAIL, *owned_head)) - goto retry; - *owned_head = Z_EROFS_VLE_WORKGRP_TAIL; - return Z_EROFS_VLE_WORK_PRIMARY_HOOKED; - } - - return Z_EROFS_VLE_WORK_PRIMARY; /* :( better luck next time */ -} - -static struct z_erofs_vle_work * -z_erofs_vle_work_lookup(struct super_block *sb, - pgoff_t idx, unsigned pageofs, - struct z_erofs_vle_workgroup **grp_ret, - enum z_erofs_vle_work_role *role, - z_erofs_vle_owned_workgrp_t *owned_head, - bool *hosted) -{ - bool tag, primary; - struct erofs_workgroup *egrp; - struct z_erofs_vle_workgroup *grp; - struct z_erofs_vle_work *work; - - egrp = erofs_find_workgroup(sb, idx, &tag); - if (egrp == NULL) { - *grp_ret = NULL; - return NULL; - } - - *grp_ret = grp = container_of(egrp, - struct z_erofs_vle_workgroup, obj); - - work = z_erofs_vle_grab_work(grp, pageofs); - /* if multiref is disabled, `primary' is always true */ - primary = true; - - if (work->pageofs != pageofs) { - DBG_BUGON(1); - erofs_workgroup_put(egrp); - return ERR_PTR(-EIO); - } - - /* - * lock must be taken first to avoid grp->next == NIL between - * claiming workgroup and adding pages: - * grp->next != NIL - * grp->next = NIL - * mutex_unlock_all - * mutex_lock(&work->lock) - * add all pages to pagevec - * - * [correct locking case 1]: - * mutex_lock(grp->work[a]) - * ... - * mutex_lock(grp->work[b]) mutex_lock(grp->work[c]) - * ... *role = SECONDARY - * add all pages to pagevec - * ... - * mutex_unlock(grp->work[c]) - * mutex_lock(grp->work[c]) - * ... - * grp->next = NIL - * mutex_unlock_all - * - * [correct locking case 2]: - * mutex_lock(grp->work[b]) - * ... - * mutex_lock(grp->work[a]) - * ... - * mutex_lock(grp->work[c]) - * ... - * grp->next = NIL - * mutex_unlock_all - * mutex_lock(grp->work[a]) - * *role = PRIMARY_OWNER - * add all pages to pagevec - * ... 
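 * A condensed, illustrative restatement of the two correct cases
 * (see also the "all work locks MUST be taken" note before
 * WRITE_ONCE(grp->next, Z_EROFS_VLE_WORKGRP_NIL) in
 * z_erofs_vle_unzip): the closer publishes NIL only with every work
 * lock held, so a claimant that wins the cmpxchg on grp->next and
 * then takes work->lock always observes a fully queued pagevec:
 *
 *	closer				claimant
 *	mutex_lock(all grp->work[i])
 *	grp->next = NIL
 *	mutex_unlock(all grp->work[i])
 *					cmpxchg(&grp->next, NIL, head)
 *					mutex_lock(&work->lock)
 *					add pages to pagevec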
- */ - mutex_lock(&work->lock); - - *hosted = false; - if (!primary) - *role = Z_EROFS_VLE_WORK_SECONDARY; - else /* claim the workgroup if possible */ - *role = try_to_claim_workgroup(grp, owned_head, hosted); - return work; -} - -static struct z_erofs_vle_work * -z_erofs_vle_work_register(struct super_block *sb, - struct z_erofs_vle_workgroup **grp_ret, - struct erofs_map_blocks *map, - pgoff_t index, unsigned pageofs, - enum z_erofs_vle_work_role *role, - z_erofs_vle_owned_workgrp_t *owned_head, - bool *hosted) -{ - bool newgrp = false; - struct z_erofs_vle_workgroup *grp = *grp_ret; - struct z_erofs_vle_work *work; - - /* if multiref is disabled, grp should never be nullptr */ - if (unlikely(grp)) { - DBG_BUGON(1); - return ERR_PTR(-EINVAL); - } - - /* no available workgroup, let's allocate one */ - grp = kmem_cache_zalloc(z_erofs_workgroup_cachep, GFP_NOFS); - if (unlikely(grp == NULL)) - return ERR_PTR(-ENOMEM); - - grp->obj.index = index; - grp->llen = map->m_llen; - - z_erofs_vle_set_workgrp_fmt(grp, - (map->m_flags & EROFS_MAP_ZIPPED) ? - Z_EROFS_VLE_WORKGRP_FMT_LZ4 : - Z_EROFS_VLE_WORKGRP_FMT_PLAIN); - atomic_set(&grp->obj.refcount, 1); - - /* new workgrps have been claimed as type 1 */ - WRITE_ONCE(grp->next, *owned_head); - /* primary and followed work for all new workgrps */ - *role = Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED; - /* it should be submitted by ourselves */ - *hosted = true; - - newgrp = true; - work = z_erofs_vle_grab_primary_work(grp); - work->pageofs = pageofs; - - mutex_init(&work->lock); - - if (newgrp) { - int err = erofs_register_workgroup(sb, &grp->obj, 0); - - if (err) { - kmem_cache_free(z_erofs_workgroup_cachep, grp); - return ERR_PTR(-EAGAIN); - } - } - - *owned_head = *grp_ret = grp; - - mutex_lock(&work->lock); - return work; -} - -static inline void __update_workgrp_llen(struct z_erofs_vle_workgroup *grp, - unsigned int llen) -{ - while (1) { - unsigned int orig_llen = grp->llen; - - if (orig_llen >= llen || orig_llen == - cmpxchg(&grp->llen, orig_llen, llen)) - break; - } -} - -#define builder_is_hooked(builder) \ - ((builder)->role >= Z_EROFS_VLE_WORK_PRIMARY_HOOKED) - -#define builder_is_followed(builder) \ - ((builder)->role >= Z_EROFS_VLE_WORK_PRIMARY_FOLLOWED) - -static int z_erofs_vle_work_iter_begin(struct z_erofs_vle_work_builder *builder, - struct super_block *sb, - struct erofs_map_blocks *map, - z_erofs_vle_owned_workgrp_t *owned_head) -{ - const unsigned clusterpages = erofs_clusterpages(EROFS_SB(sb)); - const erofs_blk_t index = erofs_blknr(map->m_pa); - const unsigned pageofs = map->m_la & ~PAGE_MASK; - struct z_erofs_vle_workgroup *grp; - struct z_erofs_vle_work *work; - - DBG_BUGON(builder->work != NULL); - - /* must be Z_EROFS_WORK_TAIL or the next chained work */ - DBG_BUGON(*owned_head == Z_EROFS_VLE_WORKGRP_NIL); - DBG_BUGON(*owned_head == Z_EROFS_VLE_WORKGRP_TAIL_CLOSED); - - DBG_BUGON(erofs_blkoff(map->m_pa)); - -repeat: - work = z_erofs_vle_work_lookup(sb, index, - pageofs, &grp, &builder->role, owned_head, &builder->hosted); - if (work != NULL) { - __update_workgrp_llen(grp, map->m_llen); - goto got_it; - } - - work = z_erofs_vle_work_register(sb, &grp, map, index, pageofs, - &builder->role, owned_head, &builder->hosted); - - if (unlikely(work == ERR_PTR(-EAGAIN))) - goto repeat; - - if (unlikely(IS_ERR(work))) - return PTR_ERR(work); -got_it: - z_erofs_pagevec_ctor_init(&builder->vector, - Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, work->vcnt); - - if (builder->role >= Z_EROFS_VLE_WORK_PRIMARY) { - /* enable possibly in-place 
decompression */ - builder->compressed_pages = grp->compressed_pages; - builder->compressed_deficit = clusterpages; - } else { - builder->compressed_pages = NULL; - builder->compressed_deficit = 0; - } - - builder->grp = grp; - builder->work = work; - return 0; -} - -/* - * keep in mind that no referenced workgroups will be freed - * only after a RCU grace period, so rcu_read_lock() could - * prevent a workgroup from being freed. - */ -static void z_erofs_rcu_callback(struct rcu_head *head) -{ - struct z_erofs_vle_work *work = container_of(head, - struct z_erofs_vle_work, rcu); - struct z_erofs_vle_workgroup *grp = - z_erofs_vle_work_workgroup(work, true); - - kmem_cache_free(z_erofs_workgroup_cachep, grp); -} - -void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) -{ - struct z_erofs_vle_workgroup *const vgrp = container_of(grp, - struct z_erofs_vle_workgroup, obj); - struct z_erofs_vle_work *const work = &vgrp->work; - - call_rcu(&work->rcu, z_erofs_rcu_callback); -} - -static void __z_erofs_vle_work_release(struct z_erofs_vle_workgroup *grp, - struct z_erofs_vle_work *work __maybe_unused) -{ - erofs_workgroup_put(&grp->obj); -} - -void z_erofs_vle_work_release(struct z_erofs_vle_work *work) -{ - struct z_erofs_vle_workgroup *grp = - z_erofs_vle_work_workgroup(work, true); - - __z_erofs_vle_work_release(grp, work); -} - -static inline bool -z_erofs_vle_work_iter_end(struct z_erofs_vle_work_builder *builder) -{ - struct z_erofs_vle_work *work = builder->work; - - if (work == NULL) - return false; - - z_erofs_pagevec_ctor_exit(&builder->vector, false); - mutex_unlock(&work->lock); - - /* - * if all pending pages are added, don't hold work reference - * any longer if the current work isn't hosted by ourselves. - */ - if (!builder->hosted) - __z_erofs_vle_work_release(builder->grp, work); - - builder->work = NULL; - builder->grp = NULL; - return true; -} - -static inline struct page *__stagingpage_alloc(struct list_head *pagepool, - gfp_t gfp) -{ - struct page *page = erofs_allocpage(pagepool, gfp); - - if (unlikely(page == NULL)) - return NULL; - - page->mapping = Z_EROFS_MAPPING_STAGING; - return page; -} - -struct z_erofs_vle_frontend { - struct inode *const inode; - - struct z_erofs_vle_work_builder builder; - struct erofs_map_blocks_iter m_iter; - - z_erofs_vle_owned_workgrp_t owned_head; - - bool initial; -#if (EROFS_FS_ZIP_CACHE_LVL >= 2) - erofs_off_t cachedzone_la; -#endif -}; - -#define VLE_FRONTEND_INIT(__i) { \ - .inode = __i, \ - .m_iter = { \ - { .m_llen = 0, .m_plen = 0 }, \ - .mpage = NULL \ - }, \ - .builder = VLE_WORK_BUILDER_INIT(), \ - .owned_head = Z_EROFS_VLE_WORKGRP_TAIL, \ - .initial = true, } - -static int z_erofs_do_read_page(struct z_erofs_vle_frontend *fe, - struct page *page, - struct list_head *page_pool) -{ - struct super_block *const sb = fe->inode->i_sb; - struct erofs_sb_info *const sbi __maybe_unused = EROFS_SB(sb); - struct erofs_map_blocks_iter *const m = &fe->m_iter; - struct erofs_map_blocks *const map = &m->map; - struct z_erofs_vle_work_builder *const builder = &fe->builder; - const loff_t offset = page_offset(page); - - bool tight = builder_is_hooked(builder); - struct z_erofs_vle_work *work = builder->work; - -#ifdef EROFS_FS_HAS_MANAGED_CACHE - struct address_space *const mngda = sbi->managed_cache->i_mapping; - struct z_erofs_vle_workgroup *grp; - bool noio_outoforder; -#endif - - enum z_erofs_page_type page_type; - unsigned cur, end, spiltted, index; - int err = 0; - - /* register locked file pages as online pages in pack */ - 
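/*
 * (An illustrative summary, inferred from the calls that follow rather
 * than stated by the original code: one locked file page may span more
 * than one compressed extent, so it is handled in "split" parts; each
 * part is accounted at fixup time and the page is only unlocked once
 * every part has been decompressed:
 *
 *	z_erofs_onlinepage_init(page);                 start
 *	z_erofs_onlinepage_fixup(page, index, true);   one per part
 *	z_erofs_onlinepage_endio(page);                last part unlocks
 *
 * the `spiltted' counter below records how many parts the page was
 * divided into.)
 */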
z_erofs_onlinepage_init(page); - - spiltted = 0; - end = PAGE_SIZE; -repeat: - cur = end - 1; - - /* lucky, within the range of the current map_blocks */ - if (offset + cur >= map->m_la && - offset + cur < map->m_la + map->m_llen) { - /* didn't get a valid unzip work previously (very rare) */ - if (!builder->work) - goto restart_now; - goto hitted; - } - - /* go ahead the next map_blocks */ - debugln("%s: [out-of-range] pos %llu", __func__, offset + cur); - - if (z_erofs_vle_work_iter_end(builder)) - fe->initial = false; - - map->m_la = offset + cur; - map->m_llen = 0; - err = erofs_map_blocks_iter(fe->inode, map, &m->mpage, 0); - if (unlikely(err)) - goto err_out; - -restart_now: - if (unlikely(!(map->m_flags & EROFS_MAP_MAPPED))) - goto hitted; - - DBG_BUGON(map->m_plen != 1 << sbi->clusterbits); - DBG_BUGON(erofs_blkoff(map->m_pa)); - - err = z_erofs_vle_work_iter_begin(builder, sb, map, &fe->owned_head); - if (unlikely(err)) - goto err_out; - -#ifdef EROFS_FS_HAS_MANAGED_CACHE - grp = fe->builder.grp; - - /* let's do out-of-order decompression for noio */ - noio_outoforder = grab_managed_cache_pages(mngda, - erofs_blknr(map->m_pa), - grp->compressed_pages, erofs_blknr(map->m_plen), - /* compressed page caching selection strategy */ - fe->initial | (EROFS_FS_ZIP_CACHE_LVL >= 2 ? - map->m_la < fe->cachedzone_la : 0)); - - if (noio_outoforder && builder_is_followed(builder)) - builder->role = Z_EROFS_VLE_WORK_PRIMARY; -#endif - - tight &= builder_is_hooked(builder); - work = builder->work; -hitted: - cur = end - min_t(unsigned, offset + end - map->m_la, end); - if (unlikely(!(map->m_flags & EROFS_MAP_MAPPED))) { - zero_user_segment(page, cur, end); - goto next_part; - } - - /* let's derive page type */ - page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD : - (!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE : - (tight ? Z_EROFS_PAGE_TYPE_EXCLUSIVE : - Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED)); - - if (cur) - tight &= builder_is_followed(builder); - -retry: - err = z_erofs_vle_work_add_page(builder, page, page_type); - /* should allocate an additional staging page for pagevec */ - if (err == -EAGAIN) { - struct page *const newpage = - __stagingpage_alloc(page_pool, GFP_NOFS); - - err = z_erofs_vle_work_add_page(builder, - newpage, Z_EROFS_PAGE_TYPE_EXCLUSIVE); - if (likely(!err)) - goto retry; - } - - if (unlikely(err)) - goto err_out; - - index = page->index - map->m_la / PAGE_SIZE; - - /* FIXME! avoid the last relundant fixup & endio */ - z_erofs_onlinepage_fixup(page, index, true); - - /* bump up the number of spiltted parts of a page */ - ++spiltted; - /* also update nr_pages */ - work->nr_pages = max_t(pgoff_t, work->nr_pages, index + 1); -next_part: - /* can be used for verification */ - map->m_llen = offset + cur - map->m_la; - - end = cur; - if (end > 0) - goto repeat; - -out: - /* FIXME! 
avoid the last relundant fixup & endio */ - z_erofs_onlinepage_endio(page); - - debugln("%s, finish page: %pK spiltted: %u map->m_llen %llu", - __func__, page, spiltted, map->m_llen); - return err; - - /* if some error occurred while processing this page */ -err_out: - SetPageError(page); - goto out; -} - -static void z_erofs_vle_unzip_kickoff(void *ptr, int bios) -{ - tagptr1_t t = tagptr_init(tagptr1_t, ptr); - struct z_erofs_vle_unzip_io *io = tagptr_unfold_ptr(t); - bool background = tagptr_unfold_tags(t); - - if (!background) { - unsigned long flags; - - spin_lock_irqsave(&io->u.wait.lock, flags); - if (!atomic_add_return(bios, &io->pending_bios)) - wake_up_locked(&io->u.wait); - spin_unlock_irqrestore(&io->u.wait.lock, flags); - return; - } - - if (!atomic_add_return(bios, &io->pending_bios)) - queue_work(z_erofs_workqueue, &io->u.work); -} - -static inline void z_erofs_vle_read_endio(struct bio *bio) -{ - const blk_status_t err = bio->bi_status; - unsigned i; - struct bio_vec *bvec; -#ifdef EROFS_FS_HAS_MANAGED_CACHE - struct address_space *mngda = NULL; -#endif - - bio_for_each_segment_all(bvec, bio, i) { - struct page *page = bvec->bv_page; - bool cachemngd = false; - - DBG_BUGON(PageUptodate(page)); - DBG_BUGON(!page->mapping); - -#ifdef EROFS_FS_HAS_MANAGED_CACHE - if (unlikely(mngda == NULL && !z_erofs_is_stagingpage(page))) { - struct inode *const inode = page->mapping->host; - struct super_block *const sb = inode->i_sb; - - mngda = EROFS_SB(sb)->managed_cache->i_mapping; - } - - /* - * If mngda has not gotten, it equals NULL, - * however, page->mapping never be NULL if working properly. - */ - cachemngd = (page->mapping == mngda); -#endif - - if (unlikely(err)) - SetPageError(page); - else if (cachemngd) - SetPageUptodate(page); - - if (cachemngd) - unlock_page(page); - } - - z_erofs_vle_unzip_kickoff(bio->bi_private, -1); - bio_put(bio); -} - -static struct page *z_pagemap_global[Z_EROFS_VLE_VMAP_GLOBAL_PAGES]; -static DEFINE_MUTEX(z_pagemap_global_lock); - -static int z_erofs_vle_unzip(struct super_block *sb, - struct z_erofs_vle_workgroup *grp, - struct list_head *page_pool) -{ - struct erofs_sb_info *const sbi = EROFS_SB(sb); -#ifdef EROFS_FS_HAS_MANAGED_CACHE - struct address_space *const mngda = sbi->managed_cache->i_mapping; -#endif - const unsigned clusterpages = erofs_clusterpages(sbi); - - struct z_erofs_pagevec_ctor ctor; - unsigned int nr_pages; - unsigned int sparsemem_pages = 0; - struct page *pages_onstack[Z_EROFS_VLE_VMAP_ONSTACK_PAGES]; - struct page **pages, **compressed_pages, *page; - unsigned i, llen; - - enum z_erofs_page_type page_type; - bool overlapped; - struct z_erofs_vle_work *work; - void *vout; - int err; - - might_sleep(); - work = z_erofs_vle_grab_primary_work(grp); - DBG_BUGON(!READ_ONCE(work->nr_pages)); - - mutex_lock(&work->lock); - nr_pages = work->nr_pages; - - if (likely(nr_pages <= Z_EROFS_VLE_VMAP_ONSTACK_PAGES)) - pages = pages_onstack; - else if (nr_pages <= Z_EROFS_VLE_VMAP_GLOBAL_PAGES && - mutex_trylock(&z_pagemap_global_lock)) - pages = z_pagemap_global; - else { -repeat: - pages = kvmalloc_array(nr_pages, - sizeof(struct page *), GFP_KERNEL); - - /* fallback to global pagemap for the lowmem scenario */ - if (unlikely(pages == NULL)) { - if (nr_pages > Z_EROFS_VLE_VMAP_GLOBAL_PAGES) - goto repeat; - else { - mutex_lock(&z_pagemap_global_lock); - pages = z_pagemap_global; - } - } - } - - for (i = 0; i < nr_pages; ++i) - pages[i] = NULL; - - err = 0; - z_erofs_pagevec_ctor_init(&ctor, - Z_EROFS_VLE_INLINE_PAGEVECS, work->pagevec, 
0); - - for (i = 0; i < work->vcnt; ++i) { - unsigned pagenr; - - page = z_erofs_pagevec_ctor_dequeue(&ctor, &page_type); - - /* all pages in pagevec ought to be valid */ - DBG_BUGON(page == NULL); - DBG_BUGON(page->mapping == NULL); - - if (z_erofs_gather_if_stagingpage(page_pool, page)) - continue; - - if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD) - pagenr = 0; - else - pagenr = z_erofs_onlinepage_index(page); - - DBG_BUGON(pagenr >= nr_pages); - - /* - * currently EROFS doesn't support multiref (dedup), - * so error out here if a multiref page is found. - */ - if (pages[pagenr]) { - DBG_BUGON(1); - SetPageError(pages[pagenr]); - z_erofs_onlinepage_endio(pages[pagenr]); - err = -EIO; - } - pages[pagenr] = page; - } - sparsemem_pages = i; - - z_erofs_pagevec_ctor_exit(&ctor, true); - - overlapped = false; - compressed_pages = grp->compressed_pages; - - for (i = 0; i < clusterpages; ++i) { - unsigned pagenr; - - page = compressed_pages[i]; - - /* all compressed pages ought to be valid */ - DBG_BUGON(page == NULL); - DBG_BUGON(page->mapping == NULL); - - if (!z_erofs_is_stagingpage(page)) { -#ifdef EROFS_FS_HAS_MANAGED_CACHE - if (page->mapping == mngda) { - if (unlikely(!PageUptodate(page))) - err = -EIO; - continue; - } -#endif - - /* - * only non-head pages can be selected - * for in-place decompression - */ - pagenr = z_erofs_onlinepage_index(page); - - DBG_BUGON(pagenr >= nr_pages); - if (pages[pagenr]) { - DBG_BUGON(1); - SetPageError(pages[pagenr]); - z_erofs_onlinepage_endio(pages[pagenr]); - err = -EIO; - } - ++sparsemem_pages; - pages[pagenr] = page; - - overlapped = true; - } - - /* PG_error needs checking for inplaced and staging pages */ - if (unlikely(PageError(page))) { - DBG_BUGON(PageUptodate(page)); - err = -EIO; - } - } - - if (unlikely(err)) - goto out; - - llen = (nr_pages << PAGE_SHIFT) - work->pageofs; - - if (z_erofs_vle_workgrp_fmt(grp) == Z_EROFS_VLE_WORKGRP_FMT_PLAIN) { - err = z_erofs_vle_plain_copy(compressed_pages, clusterpages, - pages, nr_pages, work->pageofs); - goto out; - } - - if (llen > grp->llen) - llen = grp->llen; - - err = z_erofs_vle_unzip_fast_percpu(compressed_pages, clusterpages, - pages, llen, work->pageofs); - if (err != -ENOTSUPP) - goto out; - - if (sparsemem_pages >= nr_pages) - goto skip_allocpage; - - for (i = 0; i < nr_pages; ++i) { - if (pages[i] != NULL) - continue; - - pages[i] = __stagingpage_alloc(page_pool, GFP_NOFS); - } - -skip_allocpage: - vout = erofs_vmap(pages, nr_pages); - if (!vout) { - err = -ENOMEM; - goto out; - } - - err = z_erofs_vle_unzip_vmap(compressed_pages, - clusterpages, vout, llen, work->pageofs, overlapped); - - erofs_vunmap(vout, nr_pages); - -out: - /* must handle all compressed pages before ending pages */ - for (i = 0; i < clusterpages; ++i) { - page = compressed_pages[i]; - -#ifdef EROFS_FS_HAS_MANAGED_CACHE - if (page->mapping == mngda) - continue; -#endif - /* recycle all individual staging pages */ - (void)z_erofs_gather_if_stagingpage(page_pool, page); - - WRITE_ONCE(compressed_pages[i], NULL); - } - - for (i = 0; i < nr_pages; ++i) { - page = pages[i]; - if (!page) - continue; - - DBG_BUGON(page->mapping == NULL); - - /* recycle all individual staging pages */ - if (z_erofs_gather_if_stagingpage(page_pool, page)) - continue; - - if (unlikely(err < 0)) - SetPageError(page); - - z_erofs_onlinepage_endio(page); - } - - if (pages == z_pagemap_global) - mutex_unlock(&z_pagemap_global_lock); - else if (unlikely(pages != pages_onstack)) - kvfree(pages); - - work->nr_pages = 0; - work->vcnt = 0; - - /* all work 
locks MUST be taken before the following line */ - - WRITE_ONCE(grp->next, Z_EROFS_VLE_WORKGRP_NIL); - - /* all work locks SHOULD be released right now */ - mutex_unlock(&work->lock); - - z_erofs_vle_work_release(work); - return err; -} - -static void z_erofs_vle_unzip_all(struct super_block *sb, - struct z_erofs_vle_unzip_io *io, - struct list_head *page_pool) -{ - z_erofs_vle_owned_workgrp_t owned = io->head; - - while (owned != Z_EROFS_VLE_WORKGRP_TAIL_CLOSED) { - struct z_erofs_vle_workgroup *grp; - - /* it is impossible that 'owned' equals Z_EROFS_WORK_TPTR_TAIL */ - DBG_BUGON(owned == Z_EROFS_VLE_WORKGRP_TAIL); - - /* it is impossible that 'owned' equals NULL */ - DBG_BUGON(owned == Z_EROFS_VLE_WORKGRP_NIL); - - grp = owned; - owned = READ_ONCE(grp->next); - - z_erofs_vle_unzip(sb, grp, page_pool); - } -} - -static void z_erofs_vle_unzip_wq(struct work_struct *work) -{ - struct z_erofs_vle_unzip_io_sb *iosb = container_of(work, - struct z_erofs_vle_unzip_io_sb, io.u.work); - LIST_HEAD(page_pool); - - DBG_BUGON(iosb->io.head == Z_EROFS_VLE_WORKGRP_TAIL_CLOSED); - z_erofs_vle_unzip_all(iosb->sb, &iosb->io, &page_pool); - - put_pages_list(&page_pool); - kvfree(iosb); -} - -static inline struct z_erofs_vle_unzip_io * -prepare_io_handler(struct super_block *sb, - struct z_erofs_vle_unzip_io *io, - bool background) -{ - struct z_erofs_vle_unzip_io_sb *iosb; - - if (!background) { - /* waitqueue available for foreground io */ - BUG_ON(io == NULL); - - init_waitqueue_head(&io->u.wait); - atomic_set(&io->pending_bios, 0); - goto out; - } - - if (io != NULL) - BUG(); - else { - /* allocate extra io descriptor for background io */ - iosb = kvzalloc(sizeof(struct z_erofs_vle_unzip_io_sb), - GFP_KERNEL | __GFP_NOFAIL); - BUG_ON(iosb == NULL); - - io = &iosb->io; - } - - iosb->sb = sb; - INIT_WORK(&io->u.work, z_erofs_vle_unzip_wq); -out: - io->head = Z_EROFS_VLE_WORKGRP_TAIL_CLOSED; - return io; -} - -#ifdef EROFS_FS_HAS_MANAGED_CACHE -/* true - unlocked (noio), false - locked (need submit io) */ -static inline bool recover_managed_page(struct z_erofs_vle_workgroup *grp, - struct page *page) -{ - wait_on_page_locked(page); - if (PagePrivate(page) && PageUptodate(page)) - return true; - - lock_page(page); - ClearPageError(page); - - if (unlikely(!PagePrivate(page))) { - set_page_private(page, (unsigned long)grp); - SetPagePrivate(page); - } - if (unlikely(PageUptodate(page))) { - unlock_page(page); - return true; - } - return false; -} - -#define __FSIO_1 1 -#else -#define __FSIO_1 0 -#endif - -static bool z_erofs_vle_submit_all(struct super_block *sb, - z_erofs_vle_owned_workgrp_t owned_head, - struct list_head *pagepool, - struct z_erofs_vle_unzip_io *fg_io, - bool force_fg) -{ - struct erofs_sb_info *const sbi = EROFS_SB(sb); - const unsigned clusterpages = erofs_clusterpages(sbi); - const gfp_t gfp = GFP_NOFS; -#ifdef EROFS_FS_HAS_MANAGED_CACHE - struct address_space *const mngda = sbi->managed_cache->i_mapping; - struct z_erofs_vle_workgroup *lstgrp_noio = NULL, *lstgrp_io = NULL; -#endif - struct z_erofs_vle_unzip_io *ios[1 + __FSIO_1]; - struct bio *bio; - tagptr1_t bi_private; - /* since bio will be NULL, no need to initialize last_index */ - pgoff_t uninitialized_var(last_index); - bool force_submit = false; - unsigned nr_bios; - - if (unlikely(owned_head == Z_EROFS_VLE_WORKGRP_TAIL)) - return false; - - /* - * force_fg == 1, (io, fg_io[0]) no io, (io, fg_io[1]) need submit io - * force_fg == 0, (io, fg_io[0]) no io; (io[1], bg_io) need submit io - */
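- /* - * That is, ios[0] collects the workgroups whose compressed pages - * all turn out to be up-to-date in the managed cache (no bio is - * needed at all), while ios[__FSIO_1] collects those which still - * require read bios; the latter are drained either by the - * foreground waiter (force_fg) or by the background workqueue. - */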
-#ifdef EROFS_FS_HAS_MANAGED_CACHE - ios[0] = prepare_io_handler(sb, fg_io + 0, false); -#endif - - if (force_fg) { - ios[__FSIO_1] = prepare_io_handler(sb, fg_io + __FSIO_1, false); - bi_private = tagptr_fold(tagptr1_t, ios[__FSIO_1], 0); - } else { - ios[__FSIO_1] = prepare_io_handler(sb, NULL, true); - bi_private = tagptr_fold(tagptr1_t, ios[__FSIO_1], 1); - } - - nr_bios = 0; - force_submit = false; - bio = NULL; - - /* by default, all need io submission */ - ios[__FSIO_1]->head = owned_head; - - do { - struct z_erofs_vle_workgroup *grp; - struct page **compressed_pages, *oldpage, *page; - pgoff_t first_index; - unsigned i = 0; -#ifdef EROFS_FS_HAS_MANAGED_CACHE - unsigned int noio = 0; - bool cachemngd; -#endif - int err; - - /* 'owned_head' can never equal either of the following */ - DBG_BUGON(owned_head == Z_EROFS_VLE_WORKGRP_TAIL_CLOSED); - DBG_BUGON(owned_head == Z_EROFS_VLE_WORKGRP_NIL); - - grp = owned_head; - - /* close the main owned chain at first */ - owned_head = cmpxchg(&grp->next, Z_EROFS_VLE_WORKGRP_TAIL, - Z_EROFS_VLE_WORKGRP_TAIL_CLOSED); - - first_index = grp->obj.index; - compressed_pages = grp->compressed_pages; - - force_submit |= (first_index != last_index + 1); -repeat: - /* fill in all compressed pages */ - oldpage = page = READ_ONCE(compressed_pages[i]); - -#ifdef EROFS_FS_HAS_MANAGED_CACHE - cachemngd = false; - - if (page == EROFS_UNALLOCATED_CACHED_PAGE) { - cachemngd = true; - goto do_allocpage; - } else if (page != NULL) { - if (page->mapping != mngda) - BUG_ON(PageUptodate(page)); - else if (recover_managed_page(grp, page)) { - /* page is uptodate, skip io submission */ - force_submit = true; - ++noio; - goto skippage; - } - } else { -do_allocpage: -#else - if (page != NULL) - BUG_ON(PageUptodate(page)); - else { -#endif - page = __stagingpage_alloc(pagepool, gfp); - - if (oldpage != cmpxchg(compressed_pages + i, - oldpage, page)) { - list_add(&page->lru, pagepool); - goto repeat; -#ifdef EROFS_FS_HAS_MANAGED_CACHE - } else if (cachemngd && !add_to_page_cache_lru(page, - mngda, first_index + i, gfp)) { - set_page_private(page, (unsigned long)grp); - SetPagePrivate(page); -#endif - } - } - - if (bio != NULL && force_submit) { -submit_bio_retry: - __submit_bio(bio, REQ_OP_READ, 0); - bio = NULL; - } - - if (bio == NULL) { - bio = prepare_bio(sb, first_index + i, - BIO_MAX_PAGES, z_erofs_vle_read_endio); - bio->bi_private = tagptr_cast_ptr(bi_private); - - ++nr_bios; - } - - err = bio_add_page(bio, page, PAGE_SIZE, 0); - if (err < PAGE_SIZE) - goto submit_bio_retry; - - force_submit = false; - last_index = first_index + i; -#ifdef EROFS_FS_HAS_MANAGED_CACHE -skippage: -#endif - if (++i < clusterpages) - goto repeat; -
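- /* - * Below, a workgroup is moved onto the ios[0] (noio) chain only if - * every one of its compressed pages was served from the managed - * cache (noio == clusterpages); otherwise it stays on the ios[1] - * chain and waits for its bios to complete. - */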
-#ifdef EROFS_FS_HAS_MANAGED_CACHE - if (noio < clusterpages) { - lstgrp_io = grp; - } else { - z_erofs_vle_owned_workgrp_t iogrp_next = - owned_head == Z_EROFS_VLE_WORKGRP_TAIL ? - Z_EROFS_VLE_WORKGRP_TAIL_CLOSED : - owned_head; - - if (lstgrp_io == NULL) - ios[1]->head = iogrp_next; - else - WRITE_ONCE(lstgrp_io->next, iogrp_next); - - if (lstgrp_noio == NULL) - ios[0]->head = grp; - else - WRITE_ONCE(lstgrp_noio->next, grp); - - lstgrp_noio = grp; - } -#endif - } while (owned_head != Z_EROFS_VLE_WORKGRP_TAIL); - - if (bio != NULL) - __submit_bio(bio, REQ_OP_READ, 0); - -#ifndef EROFS_FS_HAS_MANAGED_CACHE - BUG_ON(!nr_bios); -#else - if (lstgrp_noio != NULL) - WRITE_ONCE(lstgrp_noio->next, Z_EROFS_VLE_WORKGRP_TAIL_CLOSED); - - if (!force_fg && !nr_bios) { - kvfree(container_of(ios[1], - struct z_erofs_vle_unzip_io_sb, io)); - return true; - } -#endif - - z_erofs_vle_unzip_kickoff(tagptr_cast_ptr(bi_private), nr_bios); - return true; -} - -static void z_erofs_submit_and_unzip(struct z_erofs_vle_frontend *f, - struct list_head *pagepool, - bool force_fg) -{ - struct super_block *sb = f->inode->i_sb; - struct z_erofs_vle_unzip_io io[1 + __FSIO_1]; - - if (!z_erofs_vle_submit_all(sb, f->owned_head, pagepool, io, force_fg)) - return; - -#ifdef EROFS_FS_HAS_MANAGED_CACHE - z_erofs_vle_unzip_all(sb, &io[0], pagepool); -#endif - if (!force_fg) - return; - - /* wait until all bios are completed */ - wait_event(io[__FSIO_1].u.wait, - !atomic_read(&io[__FSIO_1].pending_bios)); - - /* let's do synchronous decompression */ - z_erofs_vle_unzip_all(sb, &io[__FSIO_1], pagepool); -} - -static int z_erofs_vle_normalaccess_readpage(struct file *file, - struct page *page) -{ - struct inode *const inode = page->mapping->host; - struct z_erofs_vle_frontend f = VLE_FRONTEND_INIT(inode); - int err; - LIST_HEAD(pagepool); - -#if (EROFS_FS_ZIP_CACHE_LVL >= 2) - f.cachedzone_la = page->index << PAGE_SHIFT; -#endif - err = z_erofs_do_read_page(&f, page, &pagepool); - (void)z_erofs_vle_work_iter_end(&f.builder); - - /* if some compressed clusters are ready, submit them anyway */ - z_erofs_submit_and_unzip(&f, &pagepool, true); - - if (err) - errln("%s, failed to read, err [%d]", __func__, err); - - if (f.m_iter.mpage != NULL) - put_page(f.m_iter.mpage); - - /* clean up the remaining free pages */ - put_pages_list(&pagepool); - return err; -} - -static inline int __z_erofs_vle_normalaccess_readpages( - struct file *filp, - struct address_space *mapping, - struct list_head *pages, unsigned nr_pages, bool sync) -{ - struct inode *const inode = mapping->host; - - struct z_erofs_vle_frontend f = VLE_FRONTEND_INIT(inode); - gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); - struct page *head = NULL; - LIST_HEAD(pagepool); - -#if (EROFS_FS_ZIP_CACHE_LVL >= 2) - f.cachedzone_la = lru_to_page(pages)->index << PAGE_SHIFT; -#endif - for (; nr_pages; --nr_pages) { - struct page *page = lru_to_page(pages); - - prefetchw(&page->flags); - list_del(&page->lru); - - if (add_to_page_cache_lru(page, mapping, page->index, gfp)) { - list_add(&page->lru, &pagepool); - continue; - } - - set_page_private(page, (unsigned long)head); - head = page; - } - - while (head != NULL) { - struct page *page = head; - int err; - - /* traversal in reverse order */ - head = (void *)page_private(page); - - err = z_erofs_do_read_page(&f, page, &pagepool); - if (err) { - struct erofs_vnode *vi = EROFS_V(inode); - - errln("%s, readahead error at page %lu of nid %llu", - __func__, page->index, vi->nid); - } - - put_page(page); - } - - (void)z_erofs_vle_work_iter_end(&f.builder); - - z_erofs_submit_and_unzip(&f, &pagepool, sync); - - if (f.m_iter.mpage != NULL) - put_page(f.m_iter.mpage); - - /* clean up the remaining free pages 
*/ - put_pages_list(&pagepool); - return 0; -} - -static int z_erofs_vle_normalaccess_readpages( - struct file *filp, - struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) -{ - return __z_erofs_vle_normalaccess_readpages(filp, - mapping, pages, nr_pages, - nr_pages < 4 /* sync */); -} - -const struct address_space_operations z_erofs_vle_normalaccess_aops = { - .readpage = z_erofs_vle_normalaccess_readpage, - .readpages = z_erofs_vle_normalaccess_readpages, -}; - -#define __vle_cluster_advise(x, bit, bits) \ - ((le16_to_cpu(x) >> (bit)) & ((1 << (bits)) - 1)) - -#define __vle_cluster_type(advise) __vle_cluster_advise(advise, \ - Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT, Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) - -enum { - Z_EROFS_VLE_CLUSTER_TYPE_PLAIN, - Z_EROFS_VLE_CLUSTER_TYPE_HEAD, - Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD, - Z_EROFS_VLE_CLUSTER_TYPE_RESERVED, - Z_EROFS_VLE_CLUSTER_TYPE_MAX -}; - -#define vle_cluster_type(di) \ - __vle_cluster_type((di)->di_advise) - -static inline unsigned -vle_compressed_index_clusterofs(unsigned clustersize, - struct z_erofs_vle_decompressed_index *di) -{ - debugln("%s, vle=%pK, advise=%x (type %u), clusterofs=%x blkaddr=%x", - __func__, di, di->di_advise, vle_cluster_type(di), - di->di_clusterofs, di->di_u.blkaddr); - - switch (vle_cluster_type(di)) { - case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: - break; - case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: - return di->di_clusterofs; - default: - BUG_ON(1); - } - return clustersize; -} - -static inline erofs_blk_t -vle_extent_blkaddr(struct inode *inode, pgoff_t index) -{ - struct erofs_sb_info *sbi = EROFS_I_SB(inode); - struct erofs_vnode *vi = EROFS_V(inode); - - unsigned ofs = Z_EROFS_VLE_EXTENT_ALIGN(vi->inode_isize + - vi->xattr_isize) + sizeof(struct erofs_extent_header) + - index * sizeof(struct z_erofs_vle_decompressed_index); - - return erofs_blknr(iloc(sbi, vi->nid) + ofs); -} - -static inline unsigned int -vle_extent_blkoff(struct inode *inode, pgoff_t index) -{ - struct erofs_sb_info *sbi = EROFS_I_SB(inode); - struct erofs_vnode *vi = EROFS_V(inode); - - unsigned ofs = Z_EROFS_VLE_EXTENT_ALIGN(vi->inode_isize + - vi->xattr_isize) + sizeof(struct erofs_extent_header) + - index * sizeof(struct z_erofs_vle_decompressed_index); - - return erofs_blkoff(iloc(sbi, vi->nid) + ofs); -} - -/* - * Variable-sized Logical Extent (Fixed Physical Cluster) Compression Mode - * --- - * VLE compression mode attempts to compress a variable amount of logical - * data into a fixed-size physical cluster. - * VLE compression mode uses "struct z_erofs_vle_decompressed_index". - */
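- /* - * For example, with clusterbits = 12 (4KiB clusters), a logical - * offset of 0x2a10 yields lcn = 0x2a10 >> 12 = 2 and - * ofs_rem = 0x2a10 & 0xfff = 0xa10, so the lookup below starts from - * the decompressed index entry of logical cluster 2. - */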
-static erofs_off_t vle_get_logical_extent_head( - struct inode *inode, - struct page **page_iter, - void **kaddr_iter, - unsigned lcn, /* logical cluster number */ - erofs_blk_t *pcn, - unsigned *flags) -{ - /* for extent meta */ - struct page *page = *page_iter; - erofs_blk_t blkaddr = vle_extent_blkaddr(inode, lcn); - struct z_erofs_vle_decompressed_index *di; - unsigned long long ofs; - const unsigned int clusterbits = EROFS_SB(inode->i_sb)->clusterbits; - const unsigned int clustersize = 1 << clusterbits; - unsigned int delta0; - - if (page->index != blkaddr) { - kunmap_atomic(*kaddr_iter); - unlock_page(page); - put_page(page); - - *page_iter = page = erofs_get_meta_page(inode->i_sb, - blkaddr, false); - *kaddr_iter = kmap_atomic(page); - } - - di = *kaddr_iter + vle_extent_blkoff(inode, lcn); - switch (vle_cluster_type(di)) { - case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: - delta0 = le16_to_cpu(di->di_u.delta[0]); - DBG_BUGON(!delta0); - DBG_BUGON(lcn < delta0); - - ofs = vle_get_logical_extent_head(inode, - page_iter, kaddr_iter, - lcn - delta0, pcn, flags); - break; - case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - *flags ^= EROFS_MAP_ZIPPED; - /* fallthrough */ - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: - /* clustersize should be a power of two */ - ofs = ((unsigned long long)lcn << clusterbits) + - (le16_to_cpu(di->di_clusterofs) & (clustersize - 1)); - *pcn = le32_to_cpu(di->di_u.blkaddr); - break; - default: - BUG_ON(1); - } - return ofs; -} - -int z_erofs_map_blocks_iter(struct inode *inode, - struct erofs_map_blocks *map, - struct page **mpage_ret, int flags) -{ - /* logical extent (start, end) offset */ - unsigned long long ofs, end; - struct z_erofs_vle_decompressed_index *di; - erofs_blk_t e_blkaddr, pcn; - unsigned lcn, logical_cluster_ofs, cluster_type; - u32 ofs_rem; - struct page *mpage = *mpage_ret; - void *kaddr; - bool initial; - const unsigned int clusterbits = EROFS_SB(inode->i_sb)->clusterbits; - const unsigned int clustersize = 1 << clusterbits; - int err = 0; - - /* if both m_(l,p)len are 0, regularize l_lblk, l_lofs, etc... 
*/ - initial = !map->m_llen; - - /* when trying to read beyond EOF, leave it unmapped */ - if (unlikely(map->m_la >= inode->i_size)) { - BUG_ON(!initial); - map->m_llen = map->m_la + 1 - inode->i_size; - map->m_la = inode->i_size - 1; - map->m_flags = 0; - goto out; - } - - debugln("%s, m_la %llu m_llen %llu --- start", __func__, - map->m_la, map->m_llen); - - ofs = map->m_la + map->m_llen; - - /* clustersize should be power of two */ - lcn = ofs >> clusterbits; - ofs_rem = ofs & (clustersize - 1); - - e_blkaddr = vle_extent_blkaddr(inode, lcn); - - if (mpage == NULL || mpage->index != e_blkaddr) { - if (mpage != NULL) - put_page(mpage); - - mpage = erofs_get_meta_page(inode->i_sb, e_blkaddr, false); - *mpage_ret = mpage; - } else { - lock_page(mpage); - DBG_BUGON(!PageUptodate(mpage)); - } - - kaddr = kmap_atomic(mpage); - di = kaddr + vle_extent_blkoff(inode, lcn); - - debugln("%s, lcn %u e_blkaddr %u e_blkoff %u", __func__, lcn, - e_blkaddr, vle_extent_blkoff(inode, lcn)); - - logical_cluster_ofs = vle_compressed_index_clusterofs(clustersize, di); - if (!initial) { - /* [walking mode] 'map' has been already initialized */ - map->m_llen += logical_cluster_ofs; - goto unmap_out; - } - - /* by default, compressed */ - map->m_flags |= EROFS_MAP_ZIPPED; - - end = (u64)(lcn + 1) * clustersize; - - cluster_type = vle_cluster_type(di); - - switch (cluster_type) { - case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: - if (ofs_rem >= logical_cluster_ofs) - map->m_flags ^= EROFS_MAP_ZIPPED; - /* fallthrough */ - case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: - if (ofs_rem == logical_cluster_ofs) { - pcn = le32_to_cpu(di->di_u.blkaddr); - goto exact_hitted; - } - - if (ofs_rem > logical_cluster_ofs) { - ofs = lcn * clustersize | logical_cluster_ofs; - pcn = le32_to_cpu(di->di_u.blkaddr); - break; - } - - /* logical cluster number should be >= 1 */ - if (unlikely(!lcn)) { - errln("invalid logical cluster 0 at nid %llu", - EROFS_V(inode)->nid); - err = -EIO; - goto unmap_out; - } - end = (lcn-- * clustersize) | logical_cluster_ofs; - /* fallthrough */ - case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: - /* get the corresponding first chunk */ - ofs = vle_get_logical_extent_head(inode, mpage_ret, - &kaddr, lcn, &pcn, &map->m_flags); - mpage = *mpage_ret; - break; - default: - errln("unknown cluster type %u at offset %llu of nid %llu", - cluster_type, ofs, EROFS_V(inode)->nid); - err = -EIO; - goto unmap_out; - } - - map->m_la = ofs; -exact_hitted: - map->m_llen = end - ofs; - map->m_plen = clustersize; - map->m_pa = blknr_to_addr(pcn); - map->m_flags |= EROFS_MAP_MAPPED; -unmap_out: - kunmap_atomic(kaddr); - unlock_page(mpage); -out: - debugln("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o", - __func__, map->m_la, map->m_pa, - map->m_llen, map->m_plen, map->m_flags); - - /* aggressively BUG_ON iff CONFIG_EROFS_FS_DEBUG is on */ - DBG_BUGON(err < 0); - return err; -} - diff --git a/drivers/staging/erofs/unzip_vle_lz4.c b/drivers/staging/erofs/unzip_vle_lz4.c deleted file mode 100644 index 3a7428317f0a..000000000000 --- a/drivers/staging/erofs/unzip_vle_lz4.c +++ /dev/null @@ -1,210 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/drivers/staging/erofs/unzip_vle_lz4.c - * - * Copyright (C) 2018 HUAWEI, Inc. - * http://www.huawei.com/ - * Created by Gao Xiang - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of the Linux - * distribution for more details. 
- */ -#include "unzip_vle.h" - -#if Z_EROFS_CLUSTER_MAX_PAGES > Z_EROFS_VLE_INLINE_PAGEVECS -#define EROFS_PERCPU_NR_PAGES Z_EROFS_CLUSTER_MAX_PAGES -#else -#define EROFS_PERCPU_NR_PAGES Z_EROFS_VLE_INLINE_PAGEVECS -#endif - -static struct { - char data[PAGE_SIZE * EROFS_PERCPU_NR_PAGES]; -} erofs_pcpubuf[NR_CPUS]; - -int z_erofs_vle_plain_copy(struct page **compressed_pages, - unsigned clusterpages, - struct page **pages, - unsigned nr_pages, - unsigned short pageofs) -{ - unsigned i, j; - void *src = NULL; - const unsigned righthalf = PAGE_SIZE - pageofs; - char *percpu_data; - bool mirrored[Z_EROFS_CLUSTER_MAX_PAGES] = { 0 }; - - preempt_disable(); - percpu_data = erofs_pcpubuf[smp_processor_id()].data; - - j = 0; - for (i = 0; i < nr_pages; j = i++) { - struct page *page = pages[i]; - void *dst; - - if (page == NULL) { - if (src != NULL) { - if (!mirrored[j]) - kunmap_atomic(src); - src = NULL; - } - continue; - } - - dst = kmap_atomic(page); - - for (; j < clusterpages; ++j) { - if (compressed_pages[j] != page) - continue; - - DBG_BUGON(mirrored[j]); - memcpy(percpu_data + j * PAGE_SIZE, dst, PAGE_SIZE); - mirrored[j] = true; - break; - } - - if (i) { - if (src == NULL) - src = mirrored[i-1] ? - percpu_data + (i-1) * PAGE_SIZE : - kmap_atomic(compressed_pages[i-1]); - - memcpy(dst, src + righthalf, pageofs); - - if (!mirrored[i-1]) - kunmap_atomic(src); - - if (unlikely(i >= clusterpages)) { - kunmap_atomic(dst); - break; - } - } - - if (!righthalf) - src = NULL; - else { - src = mirrored[i] ? percpu_data + i * PAGE_SIZE : - kmap_atomic(compressed_pages[i]); - - memcpy(dst + pageofs, src, righthalf); - } - - kunmap_atomic(dst); - } - - if (src != NULL && !mirrored[j]) - kunmap_atomic(src); - - preempt_enable(); - return 0; -} - -extern int z_erofs_unzip_lz4(void *in, void *out, size_t inlen, size_t outlen); - -int z_erofs_vle_unzip_fast_percpu(struct page **compressed_pages, - unsigned clusterpages, - struct page **pages, - unsigned outlen, - unsigned short pageofs) -{ - void *vin, *vout; - unsigned nr_pages, i, j; - int ret; - - if (outlen + pageofs > EROFS_PERCPU_NR_PAGES * PAGE_SIZE) - return -ENOTSUPP; - - nr_pages = DIV_ROUND_UP(outlen + pageofs, PAGE_SIZE); - - if (clusterpages == 1) { - vin = kmap_atomic(compressed_pages[0]); - } else { - vin = erofs_vmap(compressed_pages, clusterpages); - if (!vin) - return -ENOMEM; - } - - preempt_disable(); - vout = erofs_pcpubuf[smp_processor_id()].data; - - ret = z_erofs_unzip_lz4(vin, vout + pageofs, - clusterpages * PAGE_SIZE, outlen); - - if (ret < 0) - goto out; - ret = 0; - - for (i = 0; i < nr_pages; ++i) { - j = min((unsigned)PAGE_SIZE - pageofs, outlen); - - if (pages[i] != NULL) { - if (clusterpages == 1 && - pages[i] == compressed_pages[0]) { - memcpy(vin + pageofs, vout + pageofs, j); - } else { - void *dst = kmap_atomic(pages[i]); - - memcpy(dst + pageofs, vout + pageofs, j); - kunmap_atomic(dst); - } - } - vout += PAGE_SIZE; - outlen -= j; - pageofs = 0; - } - -out: - preempt_enable(); - - if (clusterpages == 1) - kunmap_atomic(vin); - else - erofs_vunmap(vin, clusterpages); - - return ret; -} - -int z_erofs_vle_unzip_vmap(struct page **compressed_pages, - unsigned clusterpages, - void *vout, - unsigned llen, - unsigned short pageofs, - bool overlapped) -{ - void *vin; - unsigned i; - int ret; - - if (overlapped) { - preempt_disable(); - vin = erofs_pcpubuf[smp_processor_id()].data; - - for (i = 0; i < clusterpages; ++i) { - void *t = kmap_atomic(compressed_pages[i]); - - memcpy(vin + PAGE_SIZE *i, t, PAGE_SIZE); - 
kunmap_atomic(t); - } - } else if (clusterpages == 1) - vin = kmap_atomic(compressed_pages[0]); - else { - vin = erofs_vmap(compressed_pages, clusterpages); - } - - ret = z_erofs_unzip_lz4(vin, vout + pageofs, - clusterpages * PAGE_SIZE, llen); - if (ret > 0) - ret = 0; - - if (!overlapped) { - if (clusterpages == 1) - kunmap_atomic(vin); - else { - erofs_vunmap(vin, clusterpages); - } - } else - preempt_enable(); - - return ret; -} - diff --git a/drivers/staging/erofs/utils.c b/drivers/staging/erofs/utils.c index 4de9c39535eb..aab4a607f028 100644 --- a/drivers/staging/erofs/utils.c +++ b/drivers/staging/erofs/utils.c @@ -1,16 +1,8 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: GPL-2.0-only /* - * linux/drivers/staging/erofs/utils.c - * * Copyright (C) 2018 HUAWEI, Inc. - * http://www.huawei.com/ - * Created by Gao Xiang - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of the Linux - * distribution for more details. + * https://www.huawei.com/ */ - #include "internal.h" #include <linux/pagevec.h> @@ -20,43 +12,67 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp) if (!list_empty(pool)) { page = lru_to_page(pool); + DBG_BUGON(page_ref_count(page) != 1); list_del(&page->lru); } else { - page = alloc_pages(gfp | __GFP_NOFAIL, 0); + page = alloc_page(gfp); } return page; } +#if (EROFS_PCPUBUF_NR_PAGES > 0) +static struct { + u8 data[PAGE_SIZE * EROFS_PCPUBUF_NR_PAGES]; +} ____cacheline_aligned_in_smp erofs_pcpubuf[NR_CPUS]; + +void *erofs_get_pcpubuf(unsigned int pagenr) +{ + preempt_disable(); + return &erofs_pcpubuf[smp_processor_id()].data[pagenr * PAGE_SIZE]; +} +#endif + +#ifdef CONFIG_EROFS_FS_ZIP /* global shrink count (for all mounted EROFS instances) */ static atomic_long_t erofs_global_shrink_cnt; -#ifdef CONFIG_EROFS_FS_ZIP +#define __erofs_workgroup_get(grp) atomic_inc(&(grp)->refcount) +#define __erofs_workgroup_put(grp) atomic_dec(&(grp)->refcount) -/* radix_tree and the future XArray both don't use tagptr_t yet */ -struct erofs_workgroup *erofs_find_workgroup( - struct super_block *sb, pgoff_t index, bool *tag) +static int erofs_workgroup_get(struct erofs_workgroup *grp) +{ + int o; + +repeat: + o = erofs_wait_on_workgroup_freezed(grp); + if (o <= 0) + return -1; + + if (atomic_cmpxchg(&grp->refcount, o, o + 1) != o) + goto repeat; + + /* decrease the refcount paired with erofs_workgroup_put */ + if (o == 1) + atomic_long_dec(&erofs_global_shrink_cnt); + return 0; +} + +struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, + pgoff_t index) { struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_workgroup *grp; - int oldcount; repeat: rcu_read_lock(); grp = radix_tree_lookup(&sbi->workstn_tree, index); - if (grp != NULL) { - *tag = radix_tree_exceptional_entry(grp); - grp = (void *)((unsigned long)grp & - ~RADIX_TREE_EXCEPTIONAL_ENTRY); - - if (erofs_workgroup_get(grp, &oldcount)) { + if (grp) { + if (erofs_workgroup_get(grp)) { /* prefer to relax rcu read side */ rcu_read_unlock(); goto repeat; } - /* decrease refcount added by erofs_workgroup_put */ - if (unlikely(oldcount == 1)) - atomic_long_dec(&erofs_global_shrink_cnt); DBG_BUGON(index != grp->index); } rcu_read_unlock(); @@ -64,14 +80,13 @@ struct erofs_workgroup *erofs_find_workgroup( } int erofs_register_workgroup(struct super_block *sb, - struct erofs_workgroup *grp, - bool tag) + struct erofs_workgroup *grp) { struct erofs_sb_info *sbi; int err; /* grp shouldn't be broken or used before */ - if 
(unlikely(atomic_read(&grp->refcount) != 1)) { + if (atomic_read(&grp->refcount) != 1) { DBG_BUGON(1); return -EINVAL; } @@ -81,35 +96,28 @@ int erofs_register_workgroup(struct super_block *sb, return err; sbi = EROFS_SB(sb); - erofs_workstn_lock(sbi); - - if (tag) - grp = (void *)((unsigned long)grp | - 1UL << RADIX_TREE_EXCEPTIONAL_SHIFT); + xa_lock(&sbi->workstn_tree); /* * Bump up reference count before making this workgroup * visible to other users in order to avoid potential UAF - * without serialized by erofs_workstn_lock. + * without being serialized by workstn_lock. */ __erofs_workgroup_get(grp); - err = radix_tree_insert(&sbi->workstn_tree, - grp->index, grp); - if (unlikely(err)) + err = radix_tree_insert(&sbi->workstn_tree, grp->index, grp); + if (err) /* * it's safe to decrease since the workgroup isn't visible * and refcount >= 2 (cannot be frozen). */ __erofs_workgroup_put(grp); - erofs_workstn_unlock(sbi); + xa_unlock(&sbi->workstn_tree); radix_tree_preload_end(); return err; } -extern void erofs_workgroup_free_rcu(struct erofs_workgroup *grp); - static void __erofs_workgroup_free(struct erofs_workgroup *grp) { atomic_long_dec(&erofs_global_shrink_cnt); @@ -127,33 +135,22 @@ int erofs_workgroup_put(struct erofs_workgroup *grp) return count; } -#ifdef EROFS_FS_HAS_MANAGED_CACHE -/* for cache-managed case, customized reclaim paths exist */ -static void erofs_workgroup_unfreeze_final(struct erofs_workgroup *grp) +static bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, + struct erofs_workgroup *grp) { - erofs_workgroup_unfreeze(grp, 0); - __erofs_workgroup_free(grp); -} - -bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, - struct erofs_workgroup *grp, - bool cleanup) -{ - void *entry; - /* - * for managed cache enabled, the refcount of workgroups - * themselves could be < 0 (freezed). So there is no guarantee - * that all refcount > 0 if managed cache is enabled. + * If managed cache is on, refcount of workgroups + * themselves could be < 0 (frozen). In other words, + * there is no guarantee that all refcounts > 0. */ if (!erofs_workgroup_try_to_freeze(grp, 1)) return false; /* - * note that all cached pages should be unlinked - * before delete it from the radix tree. - * Otherwise some cached pages of an orphan old workgroup - * could be still linked after the new one is available. + * Note that all cached pages should be unattached + * before deleted from the radix tree. Otherwise some + * cached pages could be still attached to the orphan + * old workgroup when the new one is available in the tree. */ if (erofs_try_to_free_all_cached_pages(sbi, grp)) { erofs_workgroup_unfreeze(grp, 1); @@ -161,87 +158,52 @@ bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, } /* - * it is impossible to fail after the workgroup is freezed, + * It's impossible to fail after the workgroup is frozen, * however in order to avoid some race conditions, add a * DBG_BUGON to observe this in advance. */ - entry = radix_tree_delete(&sbi->workstn_tree, grp->index); - DBG_BUGON((void *)((unsigned long)entry & - ~RADIX_TREE_EXCEPTIONAL_ENTRY) != grp); + DBG_BUGON(radix_tree_delete(&sbi->workstn_tree, grp->index) != grp); - /* - * if managed cache is enable, the last refcount - * should indicate the related workstation. - */ - erofs_workgroup_unfreeze_final(grp); + /* last refcount should be connected with its managed pslot. 
*/ + erofs_workgroup_unfreeze(grp, 0); + __erofs_workgroup_free(grp); return true; } -#else -/* for nocache case, no customized reclaim path at all */ -bool erofs_try_to_release_workgroup(struct erofs_sb_info *sbi, - struct erofs_workgroup *grp, - bool cleanup) -{ - int cnt = atomic_read(&grp->refcount); - void *entry; - - DBG_BUGON(cnt <= 0); - DBG_BUGON(cleanup && cnt != 1); - - if (cnt > 1) - return false; - - entry = radix_tree_delete(&sbi->workstn_tree, grp->index); - DBG_BUGON((void *)((unsigned long)entry & - ~RADIX_TREE_EXCEPTIONAL_ENTRY) != grp); - - /* (rarely) could be grabbed again when freeing */ - erofs_workgroup_put(grp); - return true; -} - -#endif - -unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi, - unsigned long nr_shrink, - bool cleanup) +static unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi, + unsigned long nr_shrink) { pgoff_t first_index = 0; void *batch[PAGEVEC_SIZE]; - unsigned freed = 0; + unsigned int freed = 0; int i, found; repeat: - erofs_workstn_lock(sbi); + xa_lock(&sbi->workstn_tree); found = radix_tree_gang_lookup(&sbi->workstn_tree, - batch, first_index, PAGEVEC_SIZE); + batch, first_index, PAGEVEC_SIZE); for (i = 0; i < found; ++i) { - struct erofs_workgroup *grp = (void *) - ((unsigned long)batch[i] & - ~RADIX_TREE_EXCEPTIONAL_ENTRY); + struct erofs_workgroup *grp = batch[i]; first_index = grp->index + 1; /* try to shrink each valid workgroup */ - if (!erofs_try_to_release_workgroup(sbi, grp, cleanup)) + if (!erofs_try_to_release_workgroup(sbi, grp)) continue; ++freed; - if (unlikely(!--nr_shrink)) + if (!--nr_shrink) break; } - erofs_workstn_unlock(sbi); + xa_unlock(&sbi->workstn_tree); if (i && nr_shrink) goto repeat; return freed; } -#endif - /* protected by 'erofs_sb_list_lock' */ static unsigned int shrinker_run_no; @@ -249,7 +211,7 @@ static unsigned int shrinker_run_no; static DEFINE_SPINLOCK(erofs_sb_list_lock); static LIST_HEAD(erofs_sb_list); -void erofs_register_super(struct super_block *sb) +void erofs_shrinker_register(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); @@ -260,21 +222,28 @@ void erofs_register_super(struct super_block *sb) spin_unlock(&erofs_sb_list_lock); } -void erofs_unregister_super(struct super_block *sb) +void erofs_shrinker_unregister(struct super_block *sb) { + struct erofs_sb_info *const sbi = EROFS_SB(sb); + + mutex_lock(&sbi->umount_mutex); + /* clean up all remaining workgroups in memory */ + erofs_shrink_workstation(sbi, ~0UL); + spin_lock(&erofs_sb_list_lock); - list_del(&EROFS_SB(sb)->list); + list_del(&sbi->list); spin_unlock(&erofs_sb_list_lock); + mutex_unlock(&sbi->umount_mutex); } -unsigned long erofs_shrink_count(struct shrinker *shrink, - struct shrink_control *sc) +static unsigned long erofs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc) { return atomic_long_read(&erofs_global_shrink_cnt); } -unsigned long erofs_shrink_scan(struct shrinker *shrink, - struct shrink_control *sc) +static unsigned long erofs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc) { struct erofs_sb_info *sbi; struct list_head *p; @@ -284,9 +253,9 @@ unsigned long erofs_shrink_scan(struct shrinker *shrink, unsigned long freed = 0; spin_lock(&erofs_sb_list_lock); - do + do { run_no = ++shrinker_run_no; - while (run_no == 0); + } while (run_no == 0); /* Iterate over all mounted superblocks and try to shrink them */ p = erofs_sb_list.next; @@ -308,9 +277,7 @@ unsigned long erofs_shrink_scan(struct shrinker *shrink, 
spin_unlock(&erofs_sb_list_lock); sbi->shrinker_run_no = run_no; -#ifdef CONFIG_EROFS_FS_ZIP - freed += erofs_shrink_workstation(sbi, nr - freed, false); -#endif + freed += erofs_shrink_workstation(sbi, nr - freed); spin_lock(&erofs_sb_list_lock); /* Get the next list element before we move this one */ @@ -330,3 +297,19 @@ unsigned long erofs_shrink_scan(struct shrinker *shrink, return freed; } +static struct shrinker erofs_shrinker_info = { + .scan_objects = erofs_shrink_scan, + .count_objects = erofs_shrink_count, + .seeks = DEFAULT_SEEKS, +}; + +int __init erofs_init_shrinker(void) +{ + return register_shrinker(&erofs_shrinker_info); +} + +void erofs_exit_shrinker(void) +{ + unregister_shrinker(&erofs_shrinker_info); +} +#endif /* !CONFIG_EROFS_FS_ZIP */ diff --git a/drivers/staging/erofs/xattr.c b/drivers/staging/erofs/xattr.c index d48687ca2199..2b9bb3a7a5f2 100644 --- a/drivers/staging/erofs/xattr.c +++ b/drivers/staging/erofs/xattr.c @@ -1,14 +1,7 @@ -// SPDX-License-Identifier: GPL-2.0 +// SPDX-License-Identifier: GPL-2.0-only /* - * linux/drivers/staging/erofs/xattr.c - * * Copyright (C) 2017-2018 HUAWEI, Inc. - * http://www.huawei.com/ - * Created by Gao Xiang - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of the Linux - * distribution for more details. + * https://www.huawei.com/ */ #include <linux/security.h> #include "xattr.h" @@ -19,13 +12,13 @@ struct xattr_iter { void *kaddr; erofs_blk_t blkaddr; - unsigned ofs; + unsigned int ofs; }; static inline void xattr_iter_end(struct xattr_iter *it, bool atomic) { /* the only user of kunmap() is 'init_inode_xattrs' */ - if (unlikely(!atomic)) + if (!atomic) kunmap(it->page); else kunmap_atomic(it->kaddr); @@ -44,23 +37,30 @@ static inline void xattr_iter_end_final(struct xattr_iter *it) static int init_inode_xattrs(struct inode *inode) { - struct erofs_vnode *const vi = EROFS_V(inode); + struct erofs_inode *const vi = EROFS_I(inode); struct xattr_iter it; - unsigned i; + unsigned int i; struct erofs_xattr_ibody_header *ih; + struct super_block *sb; struct erofs_sb_info *sbi; bool atomic_map; int ret = 0; /* the most common case is that xattrs of this inode are already initialized. */ - if (test_bit(EROFS_V_EA_INITED_BIT, &vi->flags)) + if (test_bit(EROFS_I_EA_INITED_BIT, &vi->flags)) { + /* + * paired with smp_mb() at the end of the function to ensure + * fields will only be observed after the bit is set. + */ + smp_mb(); return 0; + } - if (wait_on_bit_lock(&vi->flags, EROFS_V_BL_XATTR_BIT, TASK_KILLABLE)) + if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_XATTR_BIT, TASK_KILLABLE)) return -ERESTARTSYS; /* someone has initialized xattrs for us? */ - if (test_bit(EROFS_V_EA_INITED_BIT, &vi->flags)) + if (test_bit(EROFS_I_EA_INITED_BIT, &vi->flags)) goto out_unlock; /* @@ -72,25 +72,29 @@ static int init_inode_xattrs(struct inode *inode) * undefined right now (maybe use later with some new sb feature). 
*/ if (vi->xattr_isize == sizeof(struct erofs_xattr_ibody_header)) { - errln("xattr_isize %d of nid %llu is not supported yet", - vi->xattr_isize, vi->nid); - ret = -ENOTSUPP; + erofs_err(inode->i_sb, + "xattr_isize %d of nid %llu is not supported yet", + vi->xattr_isize, vi->nid); + ret = -EOPNOTSUPP; goto out_unlock; } else if (vi->xattr_isize < sizeof(struct erofs_xattr_ibody_header)) { - if (unlikely(vi->xattr_isize)) { + if (vi->xattr_isize) { + erofs_err(inode->i_sb, + "bogus xattr ibody @ nid %llu", vi->nid); DBG_BUGON(1); - ret = -EIO; + ret = -EFSCORRUPTED; goto out_unlock; /* xattr ondisk layout error */ } ret = -ENOATTR; goto out_unlock; } - sbi = EROFS_I_SB(inode); + sb = inode->i_sb; + sbi = EROFS_SB(sb); it.blkaddr = erofs_blknr(iloc(sbi, vi->nid) + vi->inode_isize); it.ofs = erofs_blkoff(iloc(sbi, vi->nid) + vi->inode_isize); - it.page = erofs_get_inline_page(inode, it.blkaddr); + it.page = erofs_get_meta_page(sb, it.blkaddr); if (IS_ERR(it.page)) { ret = PTR_ERR(it.page); goto out_unlock; @@ -115,13 +119,12 @@ static int init_inode_xattrs(struct inode *inode) it.ofs += sizeof(struct erofs_xattr_ibody_header); for (i = 0; i < vi->xattr_shared_count; ++i) { - if (unlikely(it.ofs >= EROFS_BLKSIZ)) { + if (it.ofs >= EROFS_BLKSIZ) { /* cannot be unaligned */ - BUG_ON(it.ofs != EROFS_BLKSIZ); + DBG_BUGON(it.ofs != EROFS_BLKSIZ); xattr_iter_end(&it, atomic_map); - it.page = erofs_get_meta_page(inode->i_sb, - ++it.blkaddr, S_ISDIR(inode->i_mode)); + it.page = erofs_get_meta_page(sb, ++it.blkaddr); if (IS_ERR(it.page)) { kfree(vi->xattr_shared_xattrs); vi->xattr_shared_xattrs = NULL; @@ -139,18 +142,29 @@ static int init_inode_xattrs(struct inode *inode) } xattr_iter_end(&it, atomic_map); - set_bit(EROFS_V_EA_INITED_BIT, &vi->flags); + /* paired with smp_mb() at the beginning of the function. 
*/ + smp_mb(); + set_bit(EROFS_I_EA_INITED_BIT, &vi->flags); out_unlock: - clear_and_wake_up_bit(EROFS_V_BL_XATTR_BIT, &vi->flags); + clear_and_wake_up_bit(EROFS_I_BL_XATTR_BIT, &vi->flags); return ret; } +/* + * the general idea for these return values is + * if 0 is returned, go on processing the current xattr; + * 1 (> 0) is returned, skip this round to process the next xattr; + * -err (< 0) is returned, an error (e.g. ENOATTR) occurred + * and needs to be handled + */ struct xattr_iter_handlers { - int (*entry)(struct xattr_iter *, struct erofs_xattr_entry *); - int (*name)(struct xattr_iter *, unsigned, char *, unsigned); - int (*alloc_buffer)(struct xattr_iter *, unsigned); - void (*value)(struct xattr_iter *, unsigned, char *, unsigned); + int (*entry)(struct xattr_iter *_it, struct erofs_xattr_entry *entry); + int (*name)(struct xattr_iter *_it, unsigned int processed, char *buf, + unsigned int len); + int (*alloc_buffer)(struct xattr_iter *_it, unsigned int value_sz); + void (*value)(struct xattr_iter *_it, unsigned int processed, char *buf, + unsigned int len); }; static inline int xattr_iter_fixup(struct xattr_iter *it) @@ -161,7 +175,8 @@ static inline int xattr_iter_fixup(struct xattr_iter *it) xattr_iter_end(it, true); it->blkaddr += erofs_blknr(it->ofs); - it->page = erofs_get_meta_page(it->sb, it->blkaddr, false); + + it->page = erofs_get_meta_page(it->sb, it->blkaddr); if (IS_ERR(it->page)) { int err = PTR_ERR(it->page); @@ -175,15 +190,15 @@ static inline int xattr_iter_fixup(struct xattr_iter *it) } static int inline_xattr_iter_begin(struct xattr_iter *it, - struct inode *inode) + struct inode *inode) { - struct erofs_vnode *const vi = EROFS_V(inode); + struct erofs_inode *const vi = EROFS_I(inode); struct erofs_sb_info *const sbi = EROFS_SB(inode->i_sb); - unsigned xattr_header_sz, inline_xattr_ofs; + unsigned int xattr_header_sz, inline_xattr_ofs; xattr_header_sz = inlinexattr_header_size(inode); - if (unlikely(xattr_header_sz >= vi->xattr_isize)) { - BUG_ON(xattr_header_sz > vi->xattr_isize); + if (xattr_header_sz >= vi->xattr_isize) { + DBG_BUGON(xattr_header_sz > vi->xattr_isize); return -ENOATTR; } @@ -192,7 +207,7 @@ static int inline_xattr_iter_begin(struct xattr_iter *it, it->blkaddr = erofs_blknr(iloc(sbi, vi->nid) + inline_xattr_ofs); it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs); - it->page = erofs_get_inline_page(inode, it->blkaddr); + it->page = erofs_get_meta_page(inode->i_sb, it->blkaddr); if (IS_ERR(it->page)) return PTR_ERR(it->page); @@ -200,11 +215,16 @@ static int inline_xattr_iter_begin(struct xattr_iter *it, return vi->xattr_isize - xattr_header_sz; } +/* + * Regardless of success or failure, `xattr_foreach' will end up with + * `ofs' pointing to the next xattr item rather than an arbitrary position. + */ static int xattr_foreach(struct xattr_iter *it, - const struct xattr_iter_handlers *op, unsigned int *tlimit) + const struct xattr_iter_handlers *op, + unsigned int *tlimit) { struct erofs_xattr_entry entry; - unsigned value_sz, processed, slice; + unsigned int value_sz, processed, slice; int err; /* 0. 
fixup blkaddr, ofs, ipage */ @@ -218,10 +238,14 @@ static int xattr_foreach(struct xattr_iter *it, * therefore entry should be in the page */ entry = *(struct erofs_xattr_entry *)(it->kaddr + it->ofs); - if (tlimit != NULL) { - unsigned entry_sz = EROFS_XATTR_ENTRY_SIZE(&entry); + if (tlimit) { + unsigned int entry_sz = erofs_xattr_entry_size(&entry); - BUG_ON(*tlimit < entry_sz); + /* xattr on-disk corruption: xattr entry beyond xattr_isize */ + if (*tlimit < entry_sz) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } *tlimit -= entry_sz; } @@ -240,7 +264,7 @@ static int xattr_foreach(struct xattr_iter *it, while (processed < entry.e_name_len) { if (it->ofs >= EROFS_BLKSIZ) { - BUG_ON(it->ofs > EROFS_BLKSIZ); + DBG_BUGON(it->ofs > EROFS_BLKSIZ); err = xattr_iter_fixup(it); if (err) @@ -248,8 +272,8 @@ static int xattr_foreach(struct xattr_iter *it, it->ofs = 0; } - slice = min_t(unsigned, PAGE_SIZE - it->ofs, - entry.e_name_len - processed); + slice = min_t(unsigned int, PAGE_SIZE - it->ofs, + entry.e_name_len - processed); /* handle name */ err = op->name(it, processed, it->kaddr + it->ofs, slice); @@ -265,7 +289,7 @@ static int xattr_foreach(struct xattr_iter *it, /* 3. handle xattr value */ processed = 0; - if (op->alloc_buffer != NULL) { + if (op->alloc_buffer) { err = op->alloc_buffer(it, value_sz); if (err) { it->ofs += value_sz; @@ -275,7 +299,7 @@ static int xattr_foreach(struct xattr_iter *it, while (processed < value_sz) { if (it->ofs >= EROFS_BLKSIZ) { - BUG_ON(it->ofs > EROFS_BLKSIZ); + DBG_BUGON(it->ofs > EROFS_BLKSIZ); err = xattr_iter_fixup(it); if (err) @@ -283,17 +307,17 @@ static int xattr_foreach(struct xattr_iter *it, it->ofs = 0; } - slice = min_t(unsigned, PAGE_SIZE - it->ofs, - value_sz - processed); + slice = min_t(unsigned int, PAGE_SIZE - it->ofs, + value_sz - processed); op->value(it, processed, it->kaddr + it->ofs, slice); it->ofs += slice; processed += slice; } out: - /* we assume that ofs is aligned with 4 bytes */ + /* xattrs should be 4-byte aligned (on-disk constraint) */ it->ofs = EROFS_XATTR_ALIGN(it->ofs); - return err; + return err < 0 ? err : 0; } struct getxattr_iter { @@ -305,7 +329,7 @@ struct getxattr_iter { }; static int xattr_entrymatch(struct xattr_iter *_it, - struct erofs_xattr_entry *entry) + struct erofs_xattr_entry *entry) { struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); @@ -314,7 +338,7 @@ static int xattr_entrymatch(struct xattr_iter *_it, } static int xattr_namematch(struct xattr_iter *_it, - unsigned processed, char *buf, unsigned len) + unsigned int processed, char *buf, unsigned int len) { struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); @@ -322,17 +346,18 @@ static int xattr_namematch(struct xattr_iter *_it, } static int xattr_checkbuffer(struct xattr_iter *_it, - unsigned value_sz) + unsigned int value_sz) { struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); int err = it->buffer_size < value_sz ? -ERANGE : 0; it->buffer_size = value_sz; - return it->buffer == NULL ? 1 : err; + return !it->buffer ? 
1 : err; } static void xattr_copyvalue(struct xattr_iter *_it, - unsigned processed, char *buf, unsigned len) + unsigned int processed, + char *buf, unsigned int len) { struct getxattr_iter *it = container_of(_it, struct getxattr_iter, it); @@ -349,7 +374,7 @@ static const struct xattr_iter_handlers find_xattr_handlers = { static int inline_getxattr(struct inode *inode, struct getxattr_iter *it) { int ret; - unsigned remaining; + unsigned int remaining; ret = inline_xattr_iter_begin(&it->it, inode); if (ret < 0) @@ -358,22 +383,20 @@ static int inline_getxattr(struct inode *inode, struct getxattr_iter *it) remaining = ret; while (remaining) { ret = xattr_foreach(&it->it, &find_xattr_handlers, &remaining); - if (ret >= 0) - break; - - if (ret != -ENOATTR) /* -ENOMEM, -EIO, etc. */ + if (ret != -ENOATTR) break; } xattr_iter_end_final(&it->it); - return ret < 0 ? ret : it->buffer_size; + return ret ? ret : it->buffer_size; } static int shared_getxattr(struct inode *inode, struct getxattr_iter *it) { - struct erofs_vnode *const vi = EROFS_V(inode); - struct erofs_sb_info *const sbi = EROFS_SB(inode->i_sb); - unsigned i; + struct erofs_inode *const vi = EROFS_I(inode); + struct super_block *const sb = inode->i_sb; + struct erofs_sb_info *const sbi = EROFS_SB(sb); + unsigned int i; int ret = -ENOATTR; for (i = 0; i < vi->xattr_shared_count; ++i) { @@ -386,8 +409,7 @@ static int shared_getxattr(struct inode *inode, struct getxattr_iter *it) if (i) xattr_iter_end(&it->it, true); - it->it.page = erofs_get_meta_page(inode->i_sb, - blkaddr, false); + it->it.page = erofs_get_meta_page(sb, blkaddr); if (IS_ERR(it->it.page)) return PTR_ERR(it->it.page); @@ -396,16 +418,13 @@ static int shared_getxattr(struct inode *inode, struct getxattr_iter *it) } ret = xattr_foreach(&it->it, &find_xattr_handlers, NULL); - if (ret >= 0) - break; - - if (ret != -ENOATTR) /* -ENOMEM, -EIO, etc. */ + if (ret != -ENOATTR) break; } if (vi->xattr_shared_count) xattr_iter_end_final(&it->it); - return ret < 0 ? ret : it->buffer_size; + return ret ? 
ret : it->buffer_size; } static bool erofs_xattr_user_list(struct dentry *dentry) @@ -419,13 +438,13 @@ static bool erofs_xattr_trusted_list(struct dentry *dentry) } int erofs_getxattr(struct inode *inode, int index, - const char *name, - void *buffer, size_t buffer_size) + const char *name, + void *buffer, size_t buffer_size) { int ret; struct getxattr_iter it; - if (unlikely(name == NULL)) + if (!name) return -EINVAL; ret = init_inode_xattrs(inode); @@ -450,8 +469,8 @@ int erofs_getxattr(struct inode *inode, int index, } static int erofs_xattr_generic_get(const struct xattr_handler *handler, - struct dentry *unused, struct inode *inode, - const char *name, void *buffer, size_t size) + struct dentry *unused, struct inode *inode, + const char *name, void *buffer, size_t size) { struct erofs_sb_info *const sbi = EROFS_I_SB(inode); @@ -461,8 +480,6 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler, return -EOPNOTSUPP; break; case EROFS_XATTR_INDEX_TRUSTED: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; break; case EROFS_XATTR_INDEX_SECURITY: break; @@ -517,24 +534,23 @@ struct listxattr_iter { }; static int xattr_entrylist(struct xattr_iter *_it, - struct erofs_xattr_entry *entry) + struct erofs_xattr_entry *entry) { struct listxattr_iter *it = container_of(_it, struct listxattr_iter, it); - unsigned prefix_len; + unsigned int prefix_len; const char *prefix; const struct xattr_handler *h = erofs_xattr_handler(entry->e_name_index); - if (h == NULL || (h->list != NULL && !h->list(it->dentry))) + if (!h || (h->list && !h->list(it->dentry))) return 1; - /* Note that at least one of 'prefix' and 'name' should be non-NULL */ - prefix = h->prefix != NULL ? h->prefix : h->name; + prefix = xattr_prefix(h); prefix_len = strlen(prefix); - if (it->buffer == NULL) { + if (!it->buffer) { it->buffer_ofs += prefix_len + entry->e_name_len + 1; return 1; } @@ -549,7 +565,7 @@ static int xattr_entrylist(struct xattr_iter *_it, } static int xattr_namelist(struct xattr_iter *_it, - unsigned processed, char *buf, unsigned len) + unsigned int processed, char *buf, unsigned int len) { struct listxattr_iter *it = container_of(_it, struct listxattr_iter, it); @@ -560,7 +576,7 @@ static int xattr_namelist(struct xattr_iter *_it, } static int xattr_skipvalue(struct xattr_iter *_it, - unsigned value_sz) + unsigned int value_sz) { struct listxattr_iter *it = container_of(_it, struct listxattr_iter, it); @@ -579,7 +595,7 @@ static const struct xattr_iter_handlers list_xattr_handlers = { static int inline_listxattr(struct listxattr_iter *it) { int ret; - unsigned remaining; + unsigned int remaining; ret = inline_xattr_iter_begin(&it->it, d_inode(it->dentry)); if (ret < 0) @@ -588,19 +604,20 @@ static int inline_listxattr(struct listxattr_iter *it) remaining = ret; while (remaining) { ret = xattr_foreach(&it->it, &list_xattr_handlers, &remaining); - if (ret < 0) + if (ret) break; } xattr_iter_end_final(&it->it); - return ret < 0 ? ret : it->buffer_ofs; + return ret ? 
ret : it->buffer_ofs; } static int shared_listxattr(struct listxattr_iter *it) { struct inode *const inode = d_inode(it->dentry); - struct erofs_vnode *const vi = EROFS_V(inode); - struct erofs_sb_info *const sbi = EROFS_I_SB(inode); - unsigned i; + struct erofs_inode *const vi = EROFS_I(inode); + struct super_block *const sb = inode->i_sb; + struct erofs_sb_info *const sbi = EROFS_SB(sb); + unsigned int i; int ret = 0; for (i = 0; i < vi->xattr_shared_count; ++i) { @@ -612,8 +629,7 @@ static int shared_listxattr(struct listxattr_iter *it) if (i) xattr_iter_end(&it->it, true); - it->it.page = erofs_get_meta_page(inode->i_sb, - blkaddr, false); + it->it.page = erofs_get_meta_page(sb, blkaddr); if (IS_ERR(it->it.page)) return PTR_ERR(it->it.page); @@ -622,17 +638,17 @@ static int shared_listxattr(struct listxattr_iter *it) } ret = xattr_foreach(&it->it, &list_xattr_handlers, NULL); - if (ret < 0) + if (ret) break; } if (vi->xattr_shared_count) xattr_iter_end_final(&it->it); - return ret < 0 ? ret : it->buffer_ofs; + return ret ? ret : it->buffer_ofs; } ssize_t erofs_listxattr(struct dentry *dentry, - char *buffer, size_t buffer_size) + char *buffer, size_t buffer_size) { int ret; struct listxattr_iter it; @@ -656,3 +672,39 @@ ssize_t erofs_listxattr(struct dentry *dentry, return shared_listxattr(&it); } +#ifdef CONFIG_EROFS_FS_POSIX_ACL +struct posix_acl *erofs_get_acl(struct inode *inode, int type) +{ + struct posix_acl *acl; + int prefix, rc; + char *value = NULL; + + switch (type) { + case ACL_TYPE_ACCESS: + prefix = EROFS_XATTR_INDEX_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + prefix = EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT; + break; + default: + return ERR_PTR(-EINVAL); + } + + rc = erofs_getxattr(inode, prefix, "", NULL, 0); + if (rc > 0) { + value = kmalloc(rc, GFP_KERNEL); + if (!value) + return ERR_PTR(-ENOMEM); + rc = erofs_getxattr(inode, prefix, "", value, rc); + } + + if (rc == -ENOATTR) + acl = NULL; + else if (rc < 0) + acl = ERR_PTR(rc); + else + acl = posix_acl_from_xattr(&init_user_ns, value, rc); + kfree(value); + return acl; +} +#endif diff --git a/drivers/staging/erofs/xattr.h b/drivers/staging/erofs/xattr.h index 0c7379282fc5..366dcb400525 100644 --- a/drivers/staging/erofs/xattr.h +++ b/drivers/staging/erofs/xattr.h @@ -1,14 +1,7 @@ -/* SPDX-License-Identifier: GPL-2.0 - * - * linux/drivers/staging/erofs/xattr.h - * +/* SPDX-License-Identifier: GPL-2.0-only */ +/* * Copyright (C) 2017-2018 HUAWEI, Inc. - * http://www.huawei.com/ - * Created by Gao Xiang - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of the Linux - * distribution for more details. 
+ * https://www.huawei.com/ */ #ifndef __EROFS_XATTR_H #define __EROFS_XATTR_H @@ -20,14 +13,14 @@ /* Attribute not found */ #define ENOATTR ENODATA -static inline unsigned inlinexattr_header_size(struct inode *inode) +static inline unsigned int inlinexattr_header_size(struct inode *inode) { - return sizeof(struct erofs_xattr_ibody_header) - + sizeof(u32) * EROFS_V(inode)->xattr_shared_count; + return sizeof(struct erofs_xattr_ibody_header) + + sizeof(u32) * EROFS_I(inode)->xattr_shared_count; } -static inline erofs_blk_t -xattrblock_addr(struct erofs_sb_info *sbi, unsigned xattr_id) +static inline erofs_blk_t xattrblock_addr(struct erofs_sb_info *sbi, + unsigned int xattr_id) { #ifdef CONFIG_EROFS_FS_XATTR return sbi->xattr_blkaddr + @@ -37,56 +30,59 @@ xattrblock_addr(struct erofs_sb_info *sbi, unsigned xattr_id) #endif } -static inline unsigned -xattrblock_offset(struct erofs_sb_info *sbi, unsigned xattr_id) +static inline unsigned int xattrblock_offset(struct erofs_sb_info *sbi, + unsigned int xattr_id) { return (xattr_id * sizeof(__u32)) % EROFS_BLKSIZ; } +#ifdef CONFIG_EROFS_FS_XATTR extern const struct xattr_handler erofs_xattr_user_handler; extern const struct xattr_handler erofs_xattr_trusted_handler; #ifdef CONFIG_EROFS_FS_SECURITY extern const struct xattr_handler erofs_xattr_security_handler; #endif -static inline const struct xattr_handler *erofs_xattr_handler(unsigned index) +static inline const struct xattr_handler *erofs_xattr_handler(unsigned int idx) { -static const struct xattr_handler *xattr_handler_map[] = { - [EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler, + static const struct xattr_handler *xattr_handler_map[] = { + [EROFS_XATTR_INDEX_USER] = &erofs_xattr_user_handler, #ifdef CONFIG_EROFS_FS_POSIX_ACL - [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = &posix_acl_access_xattr_handler, - [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] = - &posix_acl_default_xattr_handler, + [EROFS_XATTR_INDEX_POSIX_ACL_ACCESS] = + &posix_acl_access_xattr_handler, + [EROFS_XATTR_INDEX_POSIX_ACL_DEFAULT] = + &posix_acl_default_xattr_handler, #endif - [EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler, + [EROFS_XATTR_INDEX_TRUSTED] = &erofs_xattr_trusted_handler, #ifdef CONFIG_EROFS_FS_SECURITY - [EROFS_XATTR_INDEX_SECURITY] = &erofs_xattr_security_handler, + [EROFS_XATTR_INDEX_SECURITY] = &erofs_xattr_security_handler, #endif -}; - return index && index < ARRAY_SIZE(xattr_handler_map) ? - xattr_handler_map[index] : NULL; + }; + + return idx && idx < ARRAY_SIZE(xattr_handler_map) ? 
+ xattr_handler_map[idx] : NULL; } -#ifdef CONFIG_EROFS_FS_XATTR - -extern const struct inode_operations erofs_generic_xattr_iops; -extern const struct inode_operations erofs_dir_xattr_iops; +extern const struct xattr_handler *erofs_xattr_handlers[]; int erofs_getxattr(struct inode *, int, const char *, void *, size_t); ssize_t erofs_listxattr(struct dentry *, char *, size_t); #else -static int __maybe_unused erofs_getxattr(struct inode *inode, int index, - const char *name, - void *buffer, size_t buffer_size) +static inline int erofs_getxattr(struct inode *inode, int index, + const char *name, void *buffer, + size_t buffer_size) { - return -ENOTSUPP; + return -EOPNOTSUPP; } -static ssize_t __maybe_unused erofs_listxattr(struct dentry *dentry, - char *buffer, size_t buffer_size) -{ - return -ENOTSUPP; -} +#define erofs_listxattr (NULL) +#define erofs_xattr_handlers (NULL) +#endif /* !CONFIG_EROFS_FS_XATTR */ + +#ifdef CONFIG_EROFS_FS_POSIX_ACL +struct posix_acl *erofs_get_acl(struct inode *inode, int type); +#else +#define erofs_get_acl (NULL) #endif #endif diff --git a/drivers/staging/erofs/zdata.c b/drivers/staging/erofs/zdata.c new file mode 100644 index 000000000000..37c3e28b1e91 --- /dev/null +++ b/drivers/staging/erofs/zdata.c @@ -0,0 +1,1370 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2018 HUAWEI, Inc. + * https://www.huawei.com/ + */ +#include "zdata.h" +#include "compress.h" +#include <linux/prefetch.h> + +#include <trace/events/erofs.h> + +/* + * a compressed_pages[] placeholder in order to avoid + * being filled with file pages for in-place decompression. + */ +#define PAGE_UNALLOCATED ((void *)0x5F0E4B1D) + +/* how to allocate cached pages for a pcluster */ +enum z_erofs_cache_alloctype { + DONTALLOC, /* don't allocate any cached pages */ + DELAYEDALLOC, /* delayed allocation (at the time of submitting io) */ +}; + +/* + * tagged pointer with 1-bit tag for all compressed pages + * tag 0 - the page is just found with an extra page reference + */ +typedef tagptr1_t compressed_page_t; + +#define tag_compressed_page_justfound(page) \ + tagptr_fold(compressed_page_t, page, 1) + +static struct workqueue_struct *z_erofs_workqueue __read_mostly; +static struct kmem_cache *pcluster_cachep __read_mostly; + +void z_erofs_exit_zip_subsystem(void) +{ + destroy_workqueue(z_erofs_workqueue); + kmem_cache_destroy(pcluster_cachep); +} + +static inline int z_erofs_init_workqueue(void) +{ + const unsigned int onlinecpus = num_possible_cpus(); + + /* + * no need to spawn too many threads, limiting threads can minimize + * scheduling overhead, perhaps per-CPU threads should be better? + */ + z_erofs_workqueue = alloc_workqueue("erofs_unzipd", + WQ_UNBOUND | WQ_HIGHPRI, + onlinecpus + onlinecpus / 4); + return z_erofs_workqueue ? 0 : -ENOMEM; +}
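As an aside, the compressed_page_t typedef above relies on erofs' tagptr helpers. The following is a minimal user-space sketch of the same idea, not the kernel tagptr.h API itself (the helper names here are illustrative): a 1-bit tag rides in the low bit of a sufficiently aligned pointer, which is how a compressed page is marked as "just found" with an extra reference.

#include <stdio.h>
#include <stdint.h>

/* fold a 1-bit tag into the low bit of an aligned pointer */
static uintptr_t tag_fold(void *ptr, unsigned int tag)
{
	return (uintptr_t)ptr | (tag & 1);
}

static void *tag_ptr(uintptr_t t) { return (void *)(t & ~(uintptr_t)1); }
static unsigned int tag_bits(uintptr_t t) { return (unsigned int)(t & 1); }

int main(void)
{
	int page;				/* stand-in for an aligned struct page */
	uintptr_t t = tag_fold(&page, 1);	/* tag 1: "just found" */

	printf("ptr=%p tag=%u\n", tag_ptr(t), tag_bits(t));
	return 0;
}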
+ +static void z_erofs_pcluster_init_once(void *ptr) +{ + struct z_erofs_pcluster *pcl = ptr; + struct z_erofs_collection *cl = z_erofs_primarycollection(pcl); + unsigned int i; + + mutex_init(&cl->lock); + cl->nr_pages = 0; + cl->vcnt = 0; + for (i = 0; i < Z_EROFS_CLUSTER_MAX_PAGES; ++i) + pcl->compressed_pages[i] = NULL; +} + +static void z_erofs_pcluster_init_always(struct z_erofs_pcluster *pcl) +{ + struct z_erofs_collection *cl = z_erofs_primarycollection(pcl); + + atomic_set(&pcl->obj.refcount, 1); + + DBG_BUGON(cl->nr_pages); + DBG_BUGON(cl->vcnt); +} + +int __init z_erofs_init_zip_subsystem(void) +{ + pcluster_cachep = kmem_cache_create("erofs_compress", + Z_EROFS_WORKGROUP_SIZE, 0, + SLAB_RECLAIM_ACCOUNT, + z_erofs_pcluster_init_once); + if (pcluster_cachep) { + if (!z_erofs_init_workqueue()) + return 0; + + kmem_cache_destroy(pcluster_cachep); + } + return -ENOMEM; +} + +enum z_erofs_collectmode { + COLLECT_SECONDARY, + COLLECT_PRIMARY, + /* + * The current collection was the tail of an existing chain; in + * addition, the previously processed chained collections are all + * decided to be hooked up to it. + * A new chain will be created for the remaining collections which are + * not processed yet, therefore different from COLLECT_PRIMARY_FOLLOWED, + * the next collection cannot reuse the whole page safely in + * the following scenario: + * ________________________________________________________________ + * | tail (partial) page | head (partial) page | + * | (belongs to the next cl) | (belongs to the current cl) | + * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________| + */ + COLLECT_PRIMARY_HOOKED, + COLLECT_PRIMARY_FOLLOWED_NOINPLACE, + /* + * The current collection has been linked with the owned chain, and + * could also be linked with the remaining collections, which means + * that if the processing page is the tail page of the collection, + * the current collection can safely use the whole page (since + * the previous collection is under control) for in-place I/O, as + * illustrated below: + * ________________________________________________________________ + * | tail (partial) page | head (partial) page | + * | (of the current cl) | (of the previous collection) | + * | PRIMARY_FOLLOWED or | | + * |_____PRIMARY_HOOKED___|____________PRIMARY_FOLLOWED____________| + * + * [ (*) the above page can be used as inplace I/O.
] + */ + COLLECT_PRIMARY_FOLLOWED, +}; + +struct z_erofs_collector { + struct z_erofs_pagevec_ctor vector; + + struct z_erofs_pcluster *pcl, *tailpcl; + struct z_erofs_collection *cl; + struct page **compressedpages; + z_erofs_next_pcluster_t owned_head; + + enum z_erofs_collectmode mode; +}; + +struct z_erofs_decompress_frontend { + struct inode *const inode; + + struct z_erofs_collector clt; + struct erofs_map_blocks map; + + bool readahead; + /* used for applying cache strategy on the fly */ + bool backmost; + erofs_off_t headoffset; +}; + +#define COLLECTOR_INIT() { \ + .owned_head = Z_EROFS_PCLUSTER_TAIL, \ + .mode = COLLECT_PRIMARY_FOLLOWED } + +#define DECOMPRESS_FRONTEND_INIT(__i) { \ + .inode = __i, .clt = COLLECTOR_INIT(), \ + .backmost = true, } + +static struct page *z_pagemap_global[Z_EROFS_VMAP_GLOBAL_PAGES]; +static DEFINE_MUTEX(z_pagemap_global_lock); + +static void preload_compressed_pages(struct z_erofs_collector *clt, + struct address_space *mc, + enum z_erofs_cache_alloctype type) +{ + const struct z_erofs_pcluster *pcl = clt->pcl; + const unsigned int clusterpages = BIT(pcl->clusterbits); + struct page **pages = clt->compressedpages; + pgoff_t index = pcl->obj.index + (pages - pcl->compressed_pages); + bool standalone = true; + + if (clt->mode < COLLECT_PRIMARY_FOLLOWED) + return; + + for (; pages < pcl->compressed_pages + clusterpages; ++pages) { + struct page *page; + compressed_page_t t; + + /* the compressed page was loaded before */ + if (READ_ONCE(*pages)) + continue; + + page = find_get_page(mc, index); + + if (page) { + t = tag_compressed_page_justfound(page); + } else if (type == DELAYEDALLOC) { + t = tagptr_init(compressed_page_t, PAGE_UNALLOCATED); + } else { /* DONTALLOC */ + if (standalone) + clt->compressedpages = pages; + standalone = false; + continue; + } + + if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t))) + continue; + + if (page) + put_page(page); + } + + if (standalone) /* downgrade to PRIMARY_FOLLOWED_NOINPLACE */ + clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; +} + +/* called by erofs_shrinker to get rid of all compressed_pages */ +int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, + struct erofs_workgroup *grp) +{ + struct z_erofs_pcluster *const pcl = + container_of(grp, struct z_erofs_pcluster, obj); + struct address_space *const mapping = MNGD_MAPPING(sbi); + const unsigned int clusterpages = BIT(pcl->clusterbits); + int i; + + /* + * refcount of workgroup is now freezed as 1, + * therefore no need to worry about available decompression users. 
+ */ + for (i = 0; i < clusterpages; ++i) { + struct page *page = pcl->compressed_pages[i]; + + if (!page) + continue; + + /* block other users from reclaiming or migrating the page */ + if (!trylock_page(page)) + return -EBUSY; + + if (page->mapping != mapping) + continue; + + /* barrier is implied in the following 'unlock_page' */ + WRITE_ONCE(pcl->compressed_pages[i], NULL); + set_page_private(page, 0); + ClearPagePrivate(page); + + unlock_page(page); + put_page(page); + } + return 0; +} + +int erofs_try_to_free_cached_page(struct address_space *mapping, + struct page *page) +{ + struct z_erofs_pcluster *const pcl = (void *)page_private(page); + const unsigned int clusterpages = BIT(pcl->clusterbits); + int ret = 0; /* 0 - busy */ + + if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) { + unsigned int i; + + for (i = 0; i < clusterpages; ++i) { + if (pcl->compressed_pages[i] == page) { + WRITE_ONCE(pcl->compressed_pages[i], NULL); + ret = 1; + break; + } + } + erofs_workgroup_unfreeze(&pcl->obj, 1); + + if (ret) { + ClearPagePrivate(page); + put_page(page); + } + } + return ret; +} + +/* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */ +static inline bool z_erofs_try_inplace_io(struct z_erofs_collector *clt, + struct page *page) +{ + struct z_erofs_pcluster *const pcl = clt->pcl; + const unsigned int clusterpages = BIT(pcl->clusterbits); + + while (clt->compressedpages < pcl->compressed_pages + clusterpages) { + if (!cmpxchg(clt->compressedpages++, NULL, page)) + return true; + } + return false; +} + +/* callers must be with collection lock held */ +static int z_erofs_attach_page(struct z_erofs_collector *clt, + struct page *page, + enum z_erofs_page_type type) +{ + int ret; + bool occupied; + + /* give priority for inplaceio */ + if (clt->mode >= COLLECT_PRIMARY && + type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && + z_erofs_try_inplace_io(clt, page)) + return 0; + + ret = z_erofs_pagevec_enqueue(&clt->vector, + page, type, &occupied); + clt->cl->vcnt += (unsigned int)ret; + + return ret ? 0 : -EAGAIN; +} + +static enum z_erofs_collectmode +try_to_claim_pcluster(struct z_erofs_pcluster *pcl, + z_erofs_next_pcluster_t *owned_head) +{ + /* let's claim these following types of pclusters */ +retry: + if (pcl->next == Z_EROFS_PCLUSTER_NIL) { + /* type 1, nil pcluster */ + if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL, + *owned_head) != Z_EROFS_PCLUSTER_NIL) + goto retry; + + *owned_head = &pcl->next; + /* lucky, I am the followee :) */ + return COLLECT_PRIMARY_FOLLOWED; + } else if (pcl->next == Z_EROFS_PCLUSTER_TAIL) { + /* + * type 2, link to the end of a existing open chain, + * be careful that its submission itself is governed + * by the original owned chain. 
+ */ + if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, + *owned_head) != Z_EROFS_PCLUSTER_TAIL) + goto retry; + *owned_head = Z_EROFS_PCLUSTER_TAIL; + return COLLECT_PRIMARY_HOOKED; + } + return COLLECT_PRIMARY; /* :( better luck next time */ +} + +static int z_erofs_lookup_collection(struct z_erofs_collector *clt, + struct inode *inode, + struct erofs_map_blocks *map) +{ + struct erofs_workgroup *grp; + struct z_erofs_pcluster *pcl; + struct z_erofs_collection *cl; + unsigned int length; + + grp = erofs_find_workgroup(inode->i_sb, map->m_pa >> PAGE_SHIFT); + if (!grp) + return -ENOENT; + + pcl = container_of(grp, struct z_erofs_pcluster, obj); + if (clt->owned_head == &pcl->next || pcl == clt->tailpcl) { + DBG_BUGON(1); + erofs_workgroup_put(grp); + return -EFSCORRUPTED; + } + + cl = z_erofs_primarycollection(pcl); + if (cl->pageofs != (map->m_la & ~PAGE_MASK)) { + DBG_BUGON(1); + erofs_workgroup_put(grp); + return -EFSCORRUPTED; + } + + length = READ_ONCE(pcl->length); + if (length & Z_EROFS_PCLUSTER_FULL_LENGTH) { + if ((map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) > length) { + DBG_BUGON(1); + erofs_workgroup_put(grp); + return -EFSCORRUPTED; + } + } else { + unsigned int llen = map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT; + + if (map->m_flags & EROFS_MAP_FULL_MAPPED) + llen |= Z_EROFS_PCLUSTER_FULL_LENGTH; + + while (llen > length && + length != cmpxchg_relaxed(&pcl->length, length, llen)) { + cpu_relax(); + length = READ_ONCE(pcl->length); + } + } + mutex_lock(&cl->lock); + /* used to check tail merging loop due to corrupted images */ + if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) + clt->tailpcl = pcl; + clt->mode = try_to_claim_pcluster(pcl, &clt->owned_head); + /* clean tailpcl if the current owned_head is Z_EROFS_PCLUSTER_TAIL */ + if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) + clt->tailpcl = NULL; + clt->pcl = pcl; + clt->cl = cl; + return 0; +} + +static int z_erofs_register_collection(struct z_erofs_collector *clt, + struct inode *inode, + struct erofs_map_blocks *map) +{ + struct z_erofs_pcluster *pcl; + struct z_erofs_collection *cl; + int err; + + /* no available workgroup, let's allocate one */ + pcl = kmem_cache_alloc(pcluster_cachep, GFP_NOFS); + if (!pcl) + return -ENOMEM; + + z_erofs_pcluster_init_always(pcl); + pcl->obj.index = map->m_pa >> PAGE_SHIFT; + + pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) | + (map->m_flags & EROFS_MAP_FULL_MAPPED ? + Z_EROFS_PCLUSTER_FULL_LENGTH : 0); + + if (map->m_flags & EROFS_MAP_ZIPPED) + pcl->algorithmformat = Z_EROFS_COMPRESSION_LZ4; + else + pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; + + pcl->clusterbits = EROFS_I(inode)->z_physical_clusterbits[0]; + pcl->clusterbits -= PAGE_SHIFT; + + /* new pclusters should be claimed as type 1, primary and followed */ + pcl->next = clt->owned_head; + clt->mode = COLLECT_PRIMARY_FOLLOWED; + + cl = z_erofs_primarycollection(pcl); + cl->pageofs = map->m_la & ~PAGE_MASK; + + /* + * lock all primary followed works before visible to others + * and mutex_trylock *never* fails for a new pcluster. 
+ */ + mutex_trylock(&cl->lock); + + err = erofs_register_workgroup(inode->i_sb, &pcl->obj); + if (err) { + mutex_unlock(&cl->lock); + kmem_cache_free(pcluster_cachep, pcl); + return -EAGAIN; + } + /* used to check tail merging loop due to corrupted images */ + if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) + clt->tailpcl = pcl; + clt->owned_head = &pcl->next; + clt->pcl = pcl; + clt->cl = cl; + return 0; +} + +static int z_erofs_collector_begin(struct z_erofs_collector *clt, + struct inode *inode, + struct erofs_map_blocks *map) +{ + int ret; + + DBG_BUGON(clt->cl); + + /* must be Z_EROFS_PCLUSTER_TAIL or pointed to previous collection */ + DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_NIL); + DBG_BUGON(clt->owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); + + if (!PAGE_ALIGNED(map->m_pa)) { + DBG_BUGON(1); + return -EINVAL; + } + +repeat: + ret = z_erofs_lookup_collection(clt, inode, map); + if (ret == -ENOENT) { + ret = z_erofs_register_collection(clt, inode, map); + + /* someone registered at the same time, give another try */ + if (ret == -EAGAIN) { + cond_resched(); + goto repeat; + } + } + + if (ret) + return ret; + + z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS, + clt->cl->pagevec, clt->cl->vcnt); + + clt->compressedpages = clt->pcl->compressed_pages; + if (clt->mode <= COLLECT_PRIMARY) /* cannot do in-place I/O */ + clt->compressedpages += Z_EROFS_CLUSTER_MAX_PAGES; + return 0; +} + +/* + * keep in mind that referenced pclusters will not be freed + * until after an RCU grace period. + */ +static void z_erofs_rcu_callback(struct rcu_head *head) +{ + struct z_erofs_collection *const cl = + container_of(head, struct z_erofs_collection, rcu); + + kmem_cache_free(pcluster_cachep, + container_of(cl, struct z_erofs_pcluster, + primary_collection)); +} + +void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) +{ + struct z_erofs_pcluster *const pcl = + container_of(grp, struct z_erofs_pcluster, obj); + struct z_erofs_collection *const cl = z_erofs_primarycollection(pcl); + + call_rcu(&cl->rcu, z_erofs_rcu_callback); +} + +static void z_erofs_collection_put(struct z_erofs_collection *cl) +{ + struct z_erofs_pcluster *const pcl = + container_of(cl, struct z_erofs_pcluster, primary_collection); + + erofs_workgroup_put(&pcl->obj); +} + +static bool z_erofs_collector_end(struct z_erofs_collector *clt) +{ + struct z_erofs_collection *cl = clt->cl; + + if (!cl) + return false; + + z_erofs_pagevec_ctor_exit(&clt->vector, false); + mutex_unlock(&cl->lock); + + /* + * if all pending pages are added, don't hold its reference + * any longer if the pcluster isn't hosted by ourselves.
+ */ + if (clt->mode < COLLECT_PRIMARY_FOLLOWED_NOINPLACE) + z_erofs_collection_put(cl); + + clt->cl = NULL; + return true; +} + +static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe, + unsigned int cachestrategy, + erofs_off_t la) +{ + if (cachestrategy <= EROFS_ZIP_CACHE_DISABLED) + return false; + + if (fe->backmost) + return true; + + return cachestrategy >= EROFS_ZIP_CACHE_READAROUND && + la < fe->headoffset; +} + +static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, + struct page *page) +{ + struct inode *const inode = fe->inode; + struct erofs_sb_info *const sbi = EROFS_I_SB(inode); + struct erofs_map_blocks *const map = &fe->map; + struct z_erofs_collector *const clt = &fe->clt; + const loff_t offset = page_offset(page); + bool tight = true; + + enum z_erofs_cache_alloctype cache_strategy; + enum z_erofs_page_type page_type; + unsigned int cur, end, spiltted, index; + int err = 0; + + /* register locked file pages as online pages in pack */ + z_erofs_onlinepage_init(page); + + spiltted = 0; + end = PAGE_SIZE; +repeat: + cur = end - 1; + + /* lucky, within the range of the current map_blocks */ + if (offset + cur >= map->m_la && + offset + cur < map->m_la + map->m_llen) { + /* didn't get a valid collection previously (very rare) */ + if (!clt->cl) + goto restart_now; + goto hitted; + } + + /* go ahead the next map_blocks */ + erofs_dbg("%s: [out-of-range] pos %llu", __func__, offset + cur); + + if (z_erofs_collector_end(clt)) + fe->backmost = false; + + map->m_la = offset + cur; + map->m_llen = 0; + err = z_erofs_map_blocks_iter(inode, map, 0); + if (err) + goto err_out; + +restart_now: + if (!(map->m_flags & EROFS_MAP_MAPPED)) + goto hitted; + + err = z_erofs_collector_begin(clt, inode, map); + if (err) + goto err_out; + + /* preload all compressed pages (maybe downgrade role if necessary) */ + if (should_alloc_managed_pages(fe, sbi->cache_strategy, map->m_la)) + cache_strategy = DELAYEDALLOC; + else + cache_strategy = DONTALLOC; + + preload_compressed_pages(clt, MNGD_MAPPING(sbi), cache_strategy); + +hitted: + /* + * Ensure the current partial page belongs to this submit chain rather + * than other concurrent submit chains or the noio(bypass) chain since + * those chains are handled asynchronously thus the page cannot be used + * for inplace I/O or pagevec (should be processed in strict order.) + */ + tight &= (clt->mode >= COLLECT_PRIMARY_HOOKED && + clt->mode != COLLECT_PRIMARY_FOLLOWED_NOINPLACE); + + cur = end - min_t(unsigned int, offset + end - map->m_la, end); + if (!(map->m_flags & EROFS_MAP_MAPPED)) { + zero_user_segment(page, cur, end); + goto next_part; + } + + /* let's derive page type */ + page_type = cur ? Z_EROFS_VLE_PAGE_TYPE_HEAD : + (!spiltted ? Z_EROFS_PAGE_TYPE_EXCLUSIVE : + (tight ? 
Z_EROFS_PAGE_TYPE_EXCLUSIVE : + Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED)); + + if (cur) + tight &= (clt->mode >= COLLECT_PRIMARY_FOLLOWED); + +retry: + err = z_erofs_attach_page(clt, page, page_type); + /* should allocate an additional staging page for pagevec */ + if (err == -EAGAIN) { + struct page *const newpage = + alloc_page(GFP_NOFS | __GFP_NOFAIL); + + newpage->mapping = Z_EROFS_MAPPING_STAGING; + err = z_erofs_attach_page(clt, newpage, + Z_EROFS_PAGE_TYPE_EXCLUSIVE); + if (!err) + goto retry; + } + + if (err) + goto err_out; + + index = page->index - (map->m_la >> PAGE_SHIFT); + + z_erofs_onlinepage_fixup(page, index, true); + + /* bump up the number of spiltted parts of a page */ + ++spiltted; + /* also update nr_pages */ + clt->cl->nr_pages = max_t(pgoff_t, clt->cl->nr_pages, index + 1); +next_part: + /* can be used for verification */ + map->m_llen = offset + cur - map->m_la; + + end = cur; + if (end > 0) + goto repeat; + +out: + z_erofs_onlinepage_endio(page); + + erofs_dbg("%s, finish page: %pK spiltted: %u map->m_llen %llu", + __func__, page, spiltted, map->m_llen); + return err; + + /* if some error occurred while processing this page */ +err_out: + SetPageError(page); + goto out; +} + +static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, + bool sync, int bios) +{ + /* wake up the caller thread for sync decompression */ + if (sync) { + unsigned long flags; + + spin_lock_irqsave(&io->u.wait.lock, flags); + if (!atomic_add_return(bios, &io->pending_bios)) + wake_up_locked(&io->u.wait); + spin_unlock_irqrestore(&io->u.wait.lock, flags); + return; + } + + if (!atomic_add_return(bios, &io->pending_bios)) + queue_work(z_erofs_workqueue, &io->u.work); +} + +static void z_erofs_decompressqueue_endio(struct bio *bio) +{ + tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private); + struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t); + blk_status_t err = bio->bi_status; + struct bio_vec *bvec; + unsigned int i; + + bio_for_each_segment_all(bvec, bio, i) { + struct page *page = bvec->bv_page; + + DBG_BUGON(PageUptodate(page)); + DBG_BUGON(!page->mapping); + + if (err) + SetPageError(page); + + if (erofs_page_is_managed(EROFS_SB(q->sb), page)) { + if (!err) + SetPageUptodate(page); + unlock_page(page); + } + } + z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1); + bio_put(bio); +} + +static int z_erofs_decompress_pcluster(struct super_block *sb, + struct z_erofs_pcluster *pcl, + struct list_head *pagepool) +{ + struct erofs_sb_info *const sbi = EROFS_SB(sb); + const unsigned int clusterpages = BIT(pcl->clusterbits); + struct z_erofs_pagevec_ctor ctor; + unsigned int i, outputsize, llen, nr_pages; + struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES]; + struct page **pages, **compressed_pages, *page; + + enum z_erofs_page_type page_type; + bool overlapped, partial; + struct z_erofs_collection *cl; + int err; + + might_sleep(); + cl = z_erofs_primarycollection(pcl); + DBG_BUGON(!READ_ONCE(cl->nr_pages)); + + mutex_lock(&cl->lock); + nr_pages = cl->nr_pages; + + if (nr_pages <= Z_EROFS_VMAP_ONSTACK_PAGES) { + pages = pages_onstack; + } else if (nr_pages <= Z_EROFS_VMAP_GLOBAL_PAGES && + mutex_trylock(&z_pagemap_global_lock)) { + pages = z_pagemap_global; + } else { + gfp_t gfp_flags = GFP_KERNEL; + + if (nr_pages > Z_EROFS_VMAP_GLOBAL_PAGES) + gfp_flags |= __GFP_NOFAIL; + + pages = kvmalloc_array(nr_pages, sizeof(struct page *), + gfp_flags); + + /* fallback to global pagemap for the lowmem scenario */ + if (!pages) { + mutex_lock(&z_pagemap_global_lock); + 
pages = z_pagemap_global; + } + } + + for (i = 0; i < nr_pages; ++i) + pages[i] = NULL; + + err = 0; + z_erofs_pagevec_ctor_init(&ctor, Z_EROFS_NR_INLINE_PAGEVECS, + cl->pagevec, 0); + + for (i = 0; i < cl->vcnt; ++i) { + unsigned int pagenr; + + page = z_erofs_pagevec_dequeue(&ctor, &page_type); + + /* all pages in pagevec ought to be valid */ + DBG_BUGON(!page); + DBG_BUGON(!page->mapping); + + if (z_erofs_put_stagingpage(pagepool, page)) + continue; + + if (page_type == Z_EROFS_VLE_PAGE_TYPE_HEAD) + pagenr = 0; + else + pagenr = z_erofs_onlinepage_index(page); + + DBG_BUGON(pagenr >= nr_pages); + + /* + * currently EROFS doesn't support multiref(dedup), + * so error out here on a multiref page. + */ + if (pages[pagenr]) { + DBG_BUGON(1); + SetPageError(pages[pagenr]); + z_erofs_onlinepage_endio(pages[pagenr]); + err = -EFSCORRUPTED; + } + pages[pagenr] = page; + } + z_erofs_pagevec_ctor_exit(&ctor, true); + + overlapped = false; + compressed_pages = pcl->compressed_pages; + + for (i = 0; i < clusterpages; ++i) { + unsigned int pagenr; + + page = compressed_pages[i]; + + /* all compressed pages ought to be valid */ + DBG_BUGON(!page); + DBG_BUGON(!page->mapping); + + if (!z_erofs_page_is_staging(page)) { + if (erofs_page_is_managed(sbi, page)) { + if (!PageUptodate(page)) + err = -EIO; + continue; + } + + /* + * only a non-head page can be selected + * for inplace decompression + */ + pagenr = z_erofs_onlinepage_index(page); + + DBG_BUGON(pagenr >= nr_pages); + if (pages[pagenr]) { + DBG_BUGON(1); + SetPageError(pages[pagenr]); + z_erofs_onlinepage_endio(pages[pagenr]); + err = -EFSCORRUPTED; + } + pages[pagenr] = page; + + overlapped = true; + } + + /* PG_error needs checking for inplaced and staging pages */ + if (PageError(page)) { + DBG_BUGON(PageUptodate(page)); + err = -EIO; + } + } + + if (err) + goto out; + + llen = pcl->length >> Z_EROFS_PCLUSTER_LENGTH_BIT; + if (nr_pages << PAGE_SHIFT >= cl->pageofs + llen) { + outputsize = llen; + partial = !(pcl->length & Z_EROFS_PCLUSTER_FULL_LENGTH); + } else { + outputsize = (nr_pages << PAGE_SHIFT) - cl->pageofs; + partial = true; + } + + err = z_erofs_decompress(&(struct z_erofs_decompress_req) { + .sb = sb, + .in = compressed_pages, + .out = pages, + .pageofs_out = cl->pageofs, + .inputsize = PAGE_SIZE, + .outputsize = outputsize, + .alg = pcl->algorithmformat, + .inplace_io = overlapped, + .partial_decoding = partial + }, pagepool); + +out: + /* must handle all compressed pages before ending pages */ + for (i = 0; i < clusterpages; ++i) { + page = compressed_pages[i]; + + if (erofs_page_is_managed(sbi, page)) + continue; + + /* recycle all individual staging pages */ + (void)z_erofs_put_stagingpage(pagepool, page); + + WRITE_ONCE(compressed_pages[i], NULL); + } + + for (i = 0; i < nr_pages; ++i) { + page = pages[i]; + if (!page) + continue; + + DBG_BUGON(!page->mapping); + + /* recycle all individual staging pages */ + if (z_erofs_put_stagingpage(pagepool, page)) + continue; + + if (err < 0) + SetPageError(page); + + z_erofs_onlinepage_endio(page); + } + + if (pages == z_pagemap_global) + mutex_unlock(&z_pagemap_global_lock); + else if (pages != pages_onstack) + kvfree(pages); + + cl->nr_pages = 0; + cl->vcnt = 0; + + /* all cl locks MUST be taken before the following line */ + WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_NIL); + + /* all cl locks SHOULD be released right now */ + mutex_unlock(&cl->lock); + + z_erofs_collection_put(cl); + return err; +}
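The llen/partial computation above consumes the packed pcl->length field. A minimal user-space sketch of that encoding, using the Z_EROFS_PCLUSTER_* constants defined later in zdata.h (the llen value below is hypothetical, purely for illustration): the decompressed length is shifted up by one bit and the low bit records whether it is the full length of the pcluster.

#include <stdio.h>

#define Z_EROFS_PCLUSTER_FULL_LENGTH	0x00000001
#define Z_EROFS_PCLUSTER_LENGTH_BIT	1

int main(void)
{
	unsigned int llen = 20480;	/* hypothetical decompressed bytes */
	unsigned int length = (llen << Z_EROFS_PCLUSTER_LENGTH_BIT) |
			      Z_EROFS_PCLUSTER_FULL_LENGTH;

	/* unpack exactly as z_erofs_decompress_pcluster() does */
	printf("llen=%u full=%u\n",
	       length >> Z_EROFS_PCLUSTER_LENGTH_BIT,
	       length & Z_EROFS_PCLUSTER_FULL_LENGTH);
	return 0;
}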
+ +static void z_erofs_decompress_queue(const struct z_erofs_decompressqueue *io, + struct list_head *pagepool) +{ + z_erofs_next_pcluster_t owned = io->head; + + while (owned != Z_EROFS_PCLUSTER_TAIL_CLOSED) { + struct z_erofs_pcluster *pcl; + + /* impossible that 'owned' equals Z_EROFS_PCLUSTER_TAIL */ + DBG_BUGON(owned == Z_EROFS_PCLUSTER_TAIL); + + /* impossible that 'owned' equals NULL */ + DBG_BUGON(owned == Z_EROFS_PCLUSTER_NIL); + + pcl = container_of(owned, struct z_erofs_pcluster, next); + owned = READ_ONCE(pcl->next); + + z_erofs_decompress_pcluster(io->sb, pcl, pagepool); + } +} + +static void z_erofs_decompressqueue_work(struct work_struct *work) +{ + struct z_erofs_decompressqueue *bgq = + container_of(work, struct z_erofs_decompressqueue, u.work); + LIST_HEAD(pagepool); + + DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED); + z_erofs_decompress_queue(bgq, &pagepool); + + put_pages_list(&pagepool); + kvfree(bgq); +} + +static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, + unsigned int nr, + struct list_head *pagepool, + struct address_space *mc, + gfp_t gfp) +{ + const pgoff_t index = pcl->obj.index; + bool tocache = false; + + struct address_space *mapping; + struct page *oldpage, *page; + + compressed_page_t t; + int justfound; + +repeat: + page = READ_ONCE(pcl->compressed_pages[nr]); + oldpage = page; + + if (!page) + goto out_allocpage; + + /* + * the cached page has not been allocated and + * a placeholder is out there, prepare it now. + */ + if (page == PAGE_UNALLOCATED) { + tocache = true; + goto out_allocpage; + } + + /* process the target tagged pointer */ + t = tagptr_init(compressed_page_t, page); + justfound = tagptr_unfold_tags(t); + page = tagptr_unfold_ptr(t); + + mapping = READ_ONCE(page->mapping); + + /* + * unmanaged (file) pages are all locked solidly, + * therefore it is impossible for `mapping' to be NULL. + */ + if (mapping && mapping != mc) + /* ought to be unmanaged pages */ + goto out; + + lock_page(page); + + /* only true if page reclaim goes wrong, should never happen */ + DBG_BUGON(justfound && PagePrivate(page)); + + /* the page is still in manage cache */ + if (page->mapping == mc) { + WRITE_ONCE(pcl->compressed_pages[nr], page); + + ClearPageError(page); + if (!PagePrivate(page)) { + /* + * impossible to be !PagePrivate(page) for + * the current restriction as well if + * the page is already in compressed_pages[]. + */ + DBG_BUGON(!justfound); + + justfound = 0; + set_page_private(page, (unsigned long)pcl); + SetPagePrivate(page); + } + + /* no need to submit io if it is already up-to-date */ + if (PageUptodate(page)) { + unlock_page(page); + page = NULL; + } + goto out; + } + + /* + * the managed page has been truncated, it's unsafe to + * reuse this one, let's allocate a new cache-managed page.
+ */ + DBG_BUGON(page->mapping); + DBG_BUGON(!justfound); + + tocache = true; + unlock_page(page); + put_page(page); +out_allocpage: + page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); + if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) { + /* non-LRU / non-movable temporary page is needed */ + page->mapping = Z_EROFS_MAPPING_STAGING; + tocache = false; + } + + if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) { + if (tocache) { + /* since it added to managed cache successfully */ + unlock_page(page); + put_page(page); + } else { + list_add(&page->lru, pagepool); + } + cond_resched(); + goto repeat; + } + + if (tocache) { + set_page_private(page, (unsigned long)pcl); + SetPagePrivate(page); + } +out: /* the only exit (for tracing and debugging) */ + return page; +} + +static struct z_erofs_decompressqueue * +jobqueue_init(struct super_block *sb, + struct z_erofs_decompressqueue *fgq, bool *fg) +{ + struct z_erofs_decompressqueue *q; + + if (fg && !*fg) { + q = kvzalloc(sizeof(*q), GFP_KERNEL | __GFP_NOWARN); + if (!q) { + *fg = true; + goto fg_out; + } + INIT_WORK(&q->u.work, z_erofs_decompressqueue_work); + } else { +fg_out: + q = fgq; + init_waitqueue_head(&fgq->u.wait); + atomic_set(&fgq->pending_bios, 0); + } + q->sb = sb; + q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED; + return q; +} + +/* define decompression jobqueue types */ +enum { + JQ_BYPASS, + JQ_SUBMIT, + NR_JOBQUEUES, +}; + +static void *jobqueueset_init(struct super_block *sb, + struct z_erofs_decompressqueue *q[], + struct z_erofs_decompressqueue *fgq, bool *fg) +{ + /* + * if managed cache is enabled, bypass jobqueue is needed, + * no need to read from device for all pclusters in this queue. + */ + q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL); + q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, fg); + + return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], *fg)); +} + +static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, + z_erofs_next_pcluster_t qtail[], + z_erofs_next_pcluster_t owned_head) +{ + z_erofs_next_pcluster_t *const submit_qtail = qtail[JQ_SUBMIT]; + z_erofs_next_pcluster_t *const bypass_qtail = qtail[JQ_BYPASS]; + + DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); + if (owned_head == Z_EROFS_PCLUSTER_TAIL) + owned_head = Z_EROFS_PCLUSTER_TAIL_CLOSED; + + WRITE_ONCE(pcl->next, Z_EROFS_PCLUSTER_TAIL_CLOSED); + + WRITE_ONCE(*submit_qtail, owned_head); + WRITE_ONCE(*bypass_qtail, &pcl->next); + + qtail[JQ_BYPASS] = &pcl->next; +} + +static void z_erofs_submit_queue(struct super_block *sb, + struct z_erofs_decompress_frontend *f, + struct list_head *pagepool, + struct z_erofs_decompressqueue *fgq, + bool *force_fg) +{ + struct erofs_sb_info *const sbi = EROFS_SB(sb); + z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; + struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; + void *bi_private; + z_erofs_next_pcluster_t owned_head = f->clt.owned_head; + /* since bio will be NULL, no need to initialize last_index */ + pgoff_t last_index; + unsigned int nr_bios = 0; + struct bio *bio = NULL; + + bi_private = jobqueueset_init(sb, q, fgq, force_fg); + qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; + qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head; + + /* by default, all need io submission */ + q[JQ_SUBMIT]->head = owned_head; + + do { + struct z_erofs_pcluster *pcl; + pgoff_t cur, end; + unsigned int i = 0; + bool bypass = true; + + /* no possible 'owned_head' equals the following */ + DBG_BUGON(owned_head == Z_EROFS_PCLUSTER_TAIL_CLOSED); + DBG_BUGON(owned_head == 
Z_EROFS_PCLUSTER_NIL); + + pcl = container_of(owned_head, struct z_erofs_pcluster, next); + + cur = pcl->obj.index; + end = cur + BIT(pcl->clusterbits); + + /* close the main owned chain at first */ + owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, + Z_EROFS_PCLUSTER_TAIL_CLOSED); + + do { + struct page *page; + + page = pickup_page_for_submission(pcl, i++, pagepool, + MNGD_MAPPING(sbi), + GFP_NOFS); + if (!page) + continue; + + if (bio && cur != last_index + 1) { +submit_bio_retry: + submit_bio(bio); + bio = NULL; + } + + if (!bio) { + bio = bio_alloc(GFP_NOIO, BIO_MAX_PAGES); + + bio->bi_end_io = z_erofs_decompressqueue_endio; + bio_set_dev(bio, sb->s_bdev); + bio->bi_iter.bi_sector = (sector_t)cur << + LOG_SECTORS_PER_BLOCK; + bio->bi_private = bi_private; + bio->bi_opf = REQ_OP_READ; + if (f->readahead) + bio->bi_opf |= REQ_RAHEAD; + ++nr_bios; + } + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) + goto submit_bio_retry; + + last_index = cur; + bypass = false; + } while (++cur < end); + + if (!bypass) + qtail[JQ_SUBMIT] = &pcl->next; + else + move_to_bypass_jobqueue(pcl, qtail, owned_head); + } while (owned_head != Z_EROFS_PCLUSTER_TAIL); + + if (bio) + submit_bio(bio); + + /* + * although background is preferred, no one is pending for submission. + * don't issue workqueue for decompression but drop it directly instead. + */ + if (!*force_fg && !nr_bios) { + kvfree(q[JQ_SUBMIT]); + return; + } + z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios); +} + +static void z_erofs_runqueue(struct super_block *sb, + struct z_erofs_decompress_frontend *f, + struct list_head *pagepool, bool force_fg) +{ + struct z_erofs_decompressqueue io[NR_JOBQUEUES]; + + if (f->clt.owned_head == Z_EROFS_PCLUSTER_TAIL) + return; + z_erofs_submit_queue(sb, f, pagepool, io, &force_fg); + + /* handle bypass queue (no i/o pclusters) immediately */ + z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); + + if (!force_fg) + return; + + /* wait until all bios are completed */ + io_wait_event(io[JQ_SUBMIT].u.wait, + !atomic_read(&io[JQ_SUBMIT].pending_bios)); + + /* handle synchronous decompress queue in the caller context */ + z_erofs_decompress_queue(&io[JQ_SUBMIT], pagepool); +} + +static int z_erofs_readpage(struct file *file, struct page *page) +{ + struct inode *const inode = page->mapping->host; + struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); + int err; + LIST_HEAD(pagepool); + + trace_erofs_readpage(page, false); + + f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; + + err = z_erofs_do_read_page(&f, page); + (void)z_erofs_collector_end(&f.clt); + + /* if some compressed cluster ready, need submit them anyway */ + z_erofs_runqueue(inode->i_sb, &f, &pagepool, true); + + if (err) + erofs_err(inode->i_sb, "failed to read, err [%d]", err); + + if (f.map.mpage) + put_page(f.map.mpage); + + /* clean up the remaining free pages */ + put_pages_list(&pagepool); + return err; +} + +static int z_erofs_readpages(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned int nr_pages) +{ + struct inode *const inode = mapping->host; + struct erofs_sb_info *const sbi = EROFS_I_SB(inode); + + bool sync = (nr_pages <= sbi->max_sync_decompress_pages); + struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); + gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); + struct page *head = NULL; + LIST_HEAD(pagepool); + + trace_erofs_readpages(mapping->host, lru_to_page(pages), + nr_pages, false); + + f.headoffset = 
(erofs_off_t)lru_to_page(pages)->index << PAGE_SHIFT; + + for (; nr_pages; --nr_pages) { + struct page *page = lru_to_page(pages); + + prefetchw(&page->flags); + list_del(&page->lru); + + /* + * A pure asynchronous readahead is indicated if + * a PG_readahead marked page is hit first. + * Let's also do asynchronous decompression for this case. + */ + sync &= !(PageReadahead(page) && !head); + + if (add_to_page_cache_lru(page, mapping, page->index, gfp)) { + list_add(&page->lru, &pagepool); + continue; + } + + set_page_private(page, (unsigned long)head); + head = page; + } + + while (head) { + struct page *page = head; + int err; + + /* traversal in reverse order */ + head = (void *)page_private(page); + + err = z_erofs_do_read_page(&f, page); + if (err) + erofs_err(inode->i_sb, + "readahead error at page %lu @ nid %llu", + page->index, EROFS_I(inode)->nid); + put_page(page); + } + + (void)z_erofs_collector_end(&f.clt); + + z_erofs_runqueue(inode->i_sb, &f, &pagepool, sync); + + if (f.map.mpage) + put_page(f.map.mpage); + + /* clean up the remaining free pages */ + put_pages_list(&pagepool); + return 0; +} + +const struct address_space_operations z_erofs_aops = { + .readpage = z_erofs_readpage, + .readpages = z_erofs_readpages, +}; diff --git a/drivers/staging/erofs/zdata.h b/drivers/staging/erofs/zdata.h new file mode 100644 index 000000000000..68da309d5ad7 --- /dev/null +++ b/drivers/staging/erofs/zdata.h @@ -0,0 +1,188 @@ +/* SPDX-License-Identifier: GPL-2.0-only */ +/* + * Copyright (C) 2018 HUAWEI, Inc. + * https://www.huawei.com/ + */ +#ifndef __EROFS_FS_ZDATA_H +#define __EROFS_FS_ZDATA_H + +#include "internal.h" +#include "zpvec.h" + +#define Z_EROFS_NR_INLINE_PAGEVECS 3 + +/* + * Structure fields follow one of the following exclusion rules. + * + * I: Modifiable by initialization/destruction paths and read-only + * for everyone else; + * + * L: Field should be protected by pageset lock; + * + * A: Field should be accessed / updated in atomic for parallelized code. + */ +struct z_erofs_collection { + struct mutex lock; + + /* I: page offset of start position of decompression */ + unsigned short pageofs; + + /* L: maximum relative page index in pagevec[] */ + unsigned short nr_pages; + + /* L: total number of pages in pagevec[] */ + unsigned int vcnt; + + union { + /* L: inline a certain number of pagevecs for bootstrap */ + erofs_vtptr_t pagevec[Z_EROFS_NR_INLINE_PAGEVECS]; + + /* I: can be used to free the pcluster by RCU. */ + struct rcu_head rcu; + }; +}; + +#define Z_EROFS_PCLUSTER_FULL_LENGTH 0x00000001 +#define Z_EROFS_PCLUSTER_LENGTH_BIT 1 + +/* + * let's leave a type here in case of introducing + * another tagged pointer later.
+ */ +typedef void *z_erofs_next_pcluster_t; + +struct z_erofs_pcluster { + struct erofs_workgroup obj; + struct z_erofs_collection primary_collection; + + /* A: point to next chained pcluster or TAILs */ + z_erofs_next_pcluster_t next; + + /* A: compressed pages (including multi-usage pages) */ + struct page *compressed_pages[Z_EROFS_CLUSTER_MAX_PAGES]; + + /* A: lower limit of decompressed length and if full length or not */ + unsigned int length; + + /* I: compression algorithm format */ + unsigned char algorithmformat; + /* I: bit shift of physical cluster size */ + unsigned char clusterbits; +}; + +#define z_erofs_primarycollection(pcluster) (&(pcluster)->primary_collection) + +/* let's avoid the valid 32-bit kernel addresses */ + +/* the chained workgroup hasn't submitted io (still open) */ +#define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE) +/* the chained workgroup has already submitted io */ +#define Z_EROFS_PCLUSTER_TAIL_CLOSED ((void *)0x5F0EDEAD) + +#define Z_EROFS_PCLUSTER_NIL (NULL) + +#define Z_EROFS_WORKGROUP_SIZE sizeof(struct z_erofs_pcluster) + +struct z_erofs_decompressqueue { + struct super_block *sb; + atomic_t pending_bios; + z_erofs_next_pcluster_t head; + + union { + wait_queue_head_t wait; + struct work_struct work; + } u; +}; + +#define MNGD_MAPPING(sbi) ((sbi)->managed_cache->i_mapping) +static inline bool erofs_page_is_managed(const struct erofs_sb_info *sbi, + struct page *page) +{ + return page->mapping == MNGD_MAPPING(sbi); +} + +#define Z_EROFS_ONLINEPAGE_COUNT_BITS 2 +#define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1) +#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS) + +/* + * waiters (aka. ongoing_packs): # to unlock the page + * sub-index: 0 - for partial page, >= 1 full page sub-index + */ +typedef atomic_t z_erofs_onlinepage_t; + +/* type punning */ +union z_erofs_onlinepage_converter { + z_erofs_onlinepage_t *o; + unsigned long *v; +}; + +static inline unsigned int z_erofs_onlinepage_index(struct page *page) +{ + union z_erofs_onlinepage_converter u; + + DBG_BUGON(!PagePrivate(page)); + u.v = &page_private(page); + + return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; +} + +static inline void z_erofs_onlinepage_init(struct page *page) +{ + union { + z_erofs_onlinepage_t o; + unsigned long v; + /* keep from being unlocked in advance */ + } u = { .o = ATOMIC_INIT(1) }; + + set_page_private(page, u.v); + smp_wmb(); + SetPagePrivate(page); +} + +static inline void z_erofs_onlinepage_fixup(struct page *page, + uintptr_t index, bool down) +{ + union z_erofs_onlinepage_converter u = { .v = &page_private(page) }; + int orig, orig_index, val; + +repeat: + orig = atomic_read(u.o); + orig_index = orig >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT; + if (orig_index) { + if (!index) + return; + + DBG_BUGON(orig_index != index); + } + + val = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) | + ((orig & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down); + if (atomic_cmpxchg(u.o, orig, val) != orig) + goto repeat; +} + +static inline void z_erofs_onlinepage_endio(struct page *page) +{ + union z_erofs_onlinepage_converter u; + unsigned int v; + + DBG_BUGON(!PagePrivate(page)); + u.v = &page_private(page); + + v = atomic_dec_return(u.o); + if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) { + ClearPagePrivate(page); + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); + } + erofs_dbg("%s, page %p value %x", __func__, page, atomic_read(u.o)); +} + +#define Z_EROFS_VMAP_ONSTACK_PAGES \ + min_t(unsigned int, THREAD_SIZE / 8 / sizeof(struct page *), 96U) +#define Z_EROFS_VMAP_GLOBAL_PAGES 2048 + +#endif +
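The onlinepage helpers above pack a per-page sub-index and a small pending-I/O count into one atomic word. A standalone user-space sketch of just the bit layout (the index and count values below are hypothetical), which mirrors how z_erofs_onlinepage_index() and z_erofs_onlinepage_endio() split the word:

#include <stdio.h>

#define Z_EROFS_ONLINEPAGE_COUNT_BITS	2
#define Z_EROFS_ONLINEPAGE_COUNT_MASK	((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)

int main(void)
{
	/* hypothetical: sub-index 5 with 2 pending I/Os on the page */
	unsigned int v = (5u << Z_EROFS_ONLINEPAGE_COUNT_BITS) | 2;

	printf("index=%u pending=%u\n",
	       v >> Z_EROFS_ONLINEPAGE_COUNT_BITS,
	       v & Z_EROFS_ONLINEPAGE_COUNT_MASK);
	return 0;
}

Once the pending count drops to zero, the real kernel helper unlocks the page and marks it up-to-date; the sketch only shows the arithmetic.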
diff --git a/drivers/staging/erofs/zmap.c b/drivers/staging/erofs/zmap.c new file mode 100644 index 000000000000..a13f69cf4026 --- /dev/null +++ b/drivers/staging/erofs/zmap.c @@ -0,0 +1,477 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 2018-2019 HUAWEI, Inc. + * https://www.huawei.com/ + */ +#include "internal.h" +#include <asm/unaligned.h> +#include <trace/events/erofs.h> + +int z_erofs_fill_inode(struct inode *inode) +{ + struct erofs_inode *const vi = EROFS_I(inode); + + if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) { + vi->z_advise = 0; + vi->z_algorithmtype[0] = 0; + vi->z_algorithmtype[1] = 0; + vi->z_logical_clusterbits = LOG_BLOCK_SIZE; + vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits; + vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits; + set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); + } + + inode->i_mapping->a_ops = &z_erofs_aops; + return 0; +} + +static int z_erofs_fill_inode_lazy(struct inode *inode) +{ + struct erofs_inode *const vi = EROFS_I(inode); + struct super_block *const sb = inode->i_sb; + int err; + erofs_off_t pos; + struct page *page; + void *kaddr; + struct z_erofs_map_header *h; + + if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) { + /* + * paired with smp_mb() at the end of the function to ensure + * fields will only be observed after the bit is set. + */ + smp_mb(); + return 0; + } + + if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE)) + return -ERESTARTSYS; + + err = 0; + if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) + goto out_unlock; + + DBG_BUGON(vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY); + + pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + + vi->xattr_isize, 8); + page = erofs_get_meta_page(sb, erofs_blknr(pos)); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out_unlock; + } + + kaddr = kmap_atomic(page); + + h = kaddr + erofs_blkoff(pos); + vi->z_advise = le16_to_cpu(h->h_advise); + vi->z_algorithmtype[0] = h->h_algorithmtype & 15; + vi->z_algorithmtype[1] = h->h_algorithmtype >> 4; + + if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX) { + erofs_err(sb, "unknown compression format %u for nid %llu, please upgrade kernel", + vi->z_algorithmtype[0], vi->nid); + err = -EOPNOTSUPP; + goto unmap_done; + } + + vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7); + vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits + + ((h->h_clusterbits >> 3) & 3); + + if (vi->z_physical_clusterbits[0] != LOG_BLOCK_SIZE) { + erofs_err(sb, "unsupported physical clusterbits %u for nid %llu, please upgrade kernel", + vi->z_physical_clusterbits[0], vi->nid); + err = -EOPNOTSUPP; + goto unmap_done; + } + + vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits + + ((h->h_clusterbits >> 5) & 7); + /* paired with smp_mb() at the beginning of the function */ + smp_mb(); + set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); +unmap_done: + kunmap_atomic(kaddr); + unlock_page(page); + put_page(page); +out_unlock: + clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags); + return err; +} + +struct z_erofs_maprecorder { + struct inode *inode; + struct erofs_map_blocks *map; + void *kaddr; + + unsigned long lcn; + /* compression extent information gathered */ + u8 type; + u16 clusterofs; + u16 delta[2]; + erofs_blk_t pblk; +}; + +static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m, + erofs_blk_t eblk) +{ + struct super_block *const sb = m->inode->i_sb; + struct erofs_map_blocks *const map = m->map; + struct page *mpage =
map->mpage; + + if (mpage) { + if (mpage->index == eblk) { + if (!m->kaddr) + m->kaddr = kmap_atomic(mpage); + return 0; + } + + if (m->kaddr) { + kunmap_atomic(m->kaddr); + m->kaddr = NULL; + } + put_page(mpage); + } + + mpage = erofs_get_meta_page(sb, eblk); + if (IS_ERR(mpage)) { + map->mpage = NULL; + return PTR_ERR(mpage); + } + m->kaddr = kmap_atomic(mpage); + unlock_page(mpage); + map->mpage = mpage; + return 0; +} + +static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, + unsigned long lcn) +{ + struct inode *const inode = m->inode; + struct erofs_inode *const vi = EROFS_I(inode); + const erofs_off_t ibase = iloc(EROFS_I_SB(inode), vi->nid); + const erofs_off_t pos = + Z_EROFS_VLE_LEGACY_INDEX_ALIGN(ibase + vi->inode_isize + + vi->xattr_isize) + + lcn * sizeof(struct z_erofs_vle_decompressed_index); + struct z_erofs_vle_decompressed_index *di; + unsigned int advise, type; + int err; + + err = z_erofs_reload_indexes(m, erofs_blknr(pos)); + if (err) + return err; + + m->lcn = lcn; + di = m->kaddr + erofs_blkoff(pos); + + advise = le16_to_cpu(di->di_advise); + type = (advise >> Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT) & + ((1 << Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS) - 1); + switch (type) { + case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: + m->clusterofs = 1 << vi->z_logical_clusterbits; + m->delta[0] = le16_to_cpu(di->di_u.delta[0]); + m->delta[1] = le16_to_cpu(di->di_u.delta[1]); + break; + case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + m->clusterofs = le16_to_cpu(di->di_clusterofs); + m->pblk = le32_to_cpu(di->di_u.blkaddr); + break; + default: + DBG_BUGON(1); + return -EOPNOTSUPP; + } + m->type = type; + return 0; +} + +static unsigned int decode_compactedbits(unsigned int lobits, + unsigned int lomask, + u8 *in, unsigned int pos, u8 *type) +{ + const unsigned int v = get_unaligned_le32(in + pos / 8) >> (pos & 7); + const unsigned int lo = v & lomask; + + *type = (v >> lobits) & 3; + return lo; +} + +static int unpack_compacted_index(struct z_erofs_maprecorder *m, + unsigned int amortizedshift, + unsigned int eofs) +{ + struct erofs_inode *const vi = EROFS_I(m->inode); + const unsigned int lclusterbits = vi->z_logical_clusterbits; + const unsigned int lomask = (1 << lclusterbits) - 1; + unsigned int vcnt, base, lo, encodebits, nblk; + int i; + u8 *in, type; + + if (1 << amortizedshift == 4) + vcnt = 2; + else if (1 << amortizedshift == 2 && lclusterbits == 12) + vcnt = 16; + else + return -EOPNOTSUPP; + + encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; + base = round_down(eofs, vcnt << amortizedshift); + in = m->kaddr + base; + + i = (eofs - base) >> amortizedshift; + + lo = decode_compactedbits(lclusterbits, lomask, + in, encodebits * i, &type); + m->type = type; + if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { + m->clusterofs = 1 << lclusterbits; + if (i + 1 != vcnt) { + m->delta[0] = lo; + return 0; + } + /* + * since the last lcluster in the pack is special, + * of which lo saves delta[1] rather than delta[0]. + * Hence, get delta[0] by the previous lcluster indirectly. 
+ */ + lo = decode_compactedbits(lclusterbits, lomask, + in, encodebits * (i - 1), &type); + if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) + lo = 0; + m->delta[0] = lo + 1; + return 0; + } + m->clusterofs = lo; + m->delta[0] = 0; + /* figure out blkaddr (pblk) for HEAD lclusters */ + nblk = 1; + while (i > 0) { + --i; + lo = decode_compactedbits(lclusterbits, lomask, + in, encodebits * i, &type); + if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) + i -= lo; + + if (i >= 0) + ++nblk; + } + in += (vcnt << amortizedshift) - sizeof(__le32); + m->pblk = le32_to_cpu(*(__le32 *)in) + nblk; + return 0; +}
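To make the compacted encoding above concrete, here is a standalone user-space sketch of the decode_compactedbits() bit extraction (assuming a little-endian host, and with the packed sample value chosen purely for illustration): each lcluster entry stores lclusterbits of "lo" plus a 2-bit cluster type, packed LSB-first into a byte stream and read back through an unaligned 32-bit load.

#include <stdio.h>
#include <string.h>
#include <stdint.h>

static unsigned int decode(unsigned int lobits, unsigned int lomask,
			   const uint8_t *in, unsigned int pos, uint8_t *type)
{
	uint32_t v;

	/* emulates get_unaligned_le32() on a little-endian host */
	memcpy(&v, in + pos / 8, sizeof(v));
	v >>= pos & 7;
	*type = (v >> lobits) & 3;
	return v & lomask;
}

int main(void)
{
	uint8_t buf[8] = { 0 };
	uint8_t type;
	/* pack lo = 0x234 with type = 1 at bit position 0 (lclusterbits = 12) */
	uint32_t word = (1u << 12) | 0x234;

	memcpy(buf, &word, sizeof(word));
	printf("lo=%#x type=%u\n", decode(12, 0xfff, buf, 0, &type), type);
	return 0;
}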
+ +static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m, + unsigned long lcn) +{ + struct inode *const inode = m->inode; + struct erofs_inode *const vi = EROFS_I(inode); + const unsigned int lclusterbits = vi->z_logical_clusterbits; + const erofs_off_t ebase = ALIGN(iloc(EROFS_I_SB(inode), vi->nid) + + vi->inode_isize + vi->xattr_isize, 8) + + sizeof(struct z_erofs_map_header); + const unsigned int totalidx = DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ); + unsigned int compacted_4b_initial, compacted_2b; + unsigned int amortizedshift; + erofs_off_t pos; + int err; + + if (lclusterbits != 12) + return -EOPNOTSUPP; + + if (lcn >= totalidx) + return -EINVAL; + + m->lcn = lcn; + /* used to align to 32-byte (compacted_2b) alignment */ + compacted_4b_initial = (32 - ebase % 32) / 4; + if (compacted_4b_initial == 32 / 4) + compacted_4b_initial = 0; + + if (vi->z_advise & Z_EROFS_ADVISE_COMPACTED_2B) + compacted_2b = rounddown(totalidx - compacted_4b_initial, 16); + else + compacted_2b = 0; + + pos = ebase; + if (lcn < compacted_4b_initial) { + amortizedshift = 2; + goto out; + } + pos += compacted_4b_initial * 4; + lcn -= compacted_4b_initial; + + if (lcn < compacted_2b) { + amortizedshift = 1; + goto out; + } + pos += compacted_2b * 2; + lcn -= compacted_2b; + amortizedshift = 2; +out: + pos += lcn * (1 << amortizedshift); + err = z_erofs_reload_indexes(m, erofs_blknr(pos)); + if (err) + return err; + return unpack_compacted_index(m, amortizedshift, erofs_blkoff(pos)); +} + +static int z_erofs_load_cluster_from_disk(struct z_erofs_maprecorder *m, + unsigned int lcn) +{ + const unsigned int datamode = EROFS_I(m->inode)->datalayout; + + if (datamode == EROFS_INODE_FLAT_COMPRESSION_LEGACY) + return legacy_load_cluster_from_disk(m, lcn); + + if (datamode == EROFS_INODE_FLAT_COMPRESSION) + return compacted_load_cluster_from_disk(m, lcn); + + return -EINVAL; +} + +static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, + unsigned int lookback_distance) +{ + struct erofs_inode *const vi = EROFS_I(m->inode); + struct erofs_map_blocks *const map = m->map; + const unsigned int lclusterbits = vi->z_logical_clusterbits; + unsigned long lcn = m->lcn; + int err; + + if (lcn < lookback_distance) { + erofs_err(m->inode->i_sb, + "bogus lookback distance @ nid %llu", vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + + /* load extent head logical cluster if needed */ + lcn -= lookback_distance; + err = z_erofs_load_cluster_from_disk(m, lcn); + if (err) + return err; + + switch (m->type) { + case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: + if (!m->delta[0]) { + erofs_err(m->inode->i_sb, + "invalid lookback distance 0 @ nid %llu", + vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + return z_erofs_extent_lookback(m, m->delta[0]); + case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: + map->m_flags &= ~EROFS_MAP_ZIPPED; + /* fallthrough */ + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + map->m_la = (lcn << lclusterbits) | m->clusterofs; + break; + default: + erofs_err(m->inode->i_sb, + "unknown type %u @ lcn %lu of nid %llu", + m->type, lcn, vi->nid); + DBG_BUGON(1); + return -EOPNOTSUPP; + } + return 0; +} + +int z_erofs_map_blocks_iter(struct inode *inode, + struct erofs_map_blocks *map, + int flags) +{ + struct erofs_inode *const vi = EROFS_I(inode); + struct z_erofs_maprecorder m = { + .inode = inode, + .map = map, + }; + int err = 0; + unsigned int lclusterbits, endoff; + unsigned long long ofs, end; + + trace_z_erofs_map_blocks_iter_enter(inode, map, flags); + + /* when trying to read beyond EOF, leave it unmapped */ + if (map->m_la >= inode->i_size) { + map->m_llen = map->m_la + 1 - inode->i_size; + map->m_la = inode->i_size; + map->m_flags = 0; + goto out; + } + + err = z_erofs_fill_inode_lazy(inode); + if (err) + goto out; + + lclusterbits = vi->z_logical_clusterbits; + ofs = map->m_la; + m.lcn = ofs >> lclusterbits; + endoff = ofs & ((1 << lclusterbits) - 1); + + err = z_erofs_load_cluster_from_disk(&m, m.lcn); + if (err) + goto unmap_out; + + map->m_flags = EROFS_MAP_ZIPPED; /* by default, compressed */ + end = (m.lcn + 1ULL) << lclusterbits; + + switch (m.type) { + case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: + if (endoff >= m.clusterofs) + map->m_flags &= ~EROFS_MAP_ZIPPED; + /* fallthrough */ + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + if (endoff >= m.clusterofs) { + map->m_la = (m.lcn << lclusterbits) | m.clusterofs; + break; + } + /* m.lcn should be >= 1 if endoff < m.clusterofs */ + if (!m.lcn) { + erofs_err(inode->i_sb, + "invalid logical cluster 0 at nid %llu", + vi->nid); + err = -EFSCORRUPTED; + goto unmap_out; + } + end = (m.lcn << lclusterbits) | m.clusterofs; + map->m_flags |= EROFS_MAP_FULL_MAPPED; + m.delta[0] = 1; + /* fallthrough */ + case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: + /* get the corresponding first chunk */ + err = z_erofs_extent_lookback(&m, m.delta[0]); + if (err) + goto unmap_out; + break; + default: + erofs_err(inode->i_sb, + "unknown type %u @ offset %llu of nid %llu", + m.type, ofs, vi->nid); + err = -EOPNOTSUPP; + goto unmap_out; + } + + map->m_llen = end - map->m_la; + map->m_plen = 1 << lclusterbits; + map->m_pa = blknr_to_addr(m.pblk); + map->m_flags |= EROFS_MAP_MAPPED; + +unmap_out: + if (m.kaddr) + kunmap_atomic(m.kaddr); + +out: + erofs_dbg("%s, m_la %llu m_pa %llu m_llen %llu m_plen %llu m_flags 0%o", + __func__, map->m_la, map->m_pa, + map->m_llen, map->m_plen, map->m_flags); + + trace_z_erofs_map_blocks_iter_exit(inode, map, flags, err); + + /* aggressively BUG_ON iff CONFIG_EROFS_FS_DEBUG is on */ + DBG_BUGON(err < 0 && err != -ENOMEM); + return err; +} diff --git a/drivers/staging/erofs/unzip_pagevec.h b/drivers/staging/erofs/zpvec.h similarity index 67% rename from drivers/staging/erofs/unzip_pagevec.h rename to drivers/staging/erofs/zpvec.h index 23856ba2742d..bfc12ecbbe83 100644 --- a/drivers/staging/erofs/unzip_pagevec.h +++ b/drivers/staging/erofs/zpvec.h @@ -1,21 +1,14 @@ -/* SPDX-License-Identifier: GPL-2.0 - * - * linux/drivers/staging/erofs/unzip_pagevec.h - * +/* SPDX-License-Identifier: GPL-2.0-only */ +/* * Copyright (C) 2018 HUAWEI, Inc. - * http://www.huawei.com/ - * Created by Gao Xiang - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file COPYING in the main directory of the Linux - * distribution for more details.
+ * https://www.huawei.com/ */ -#ifndef __EROFS_UNZIP_PAGEVEC_H -#define __EROFS_UNZIP_PAGEVEC_H +#ifndef __EROFS_FS_ZPVEC_H +#define __EROFS_FS_ZPVEC_H -#include <linux/tagptr.h> +#include "tagptr.h" -/* page type in pagevec for unzip subsystem */ +/* page type in pagevec for decompress subsystem */ enum z_erofs_page_type { /* including Z_EROFS_VLE_PAGE_TAIL_EXCLUSIVE */ Z_EROFS_PAGE_TYPE_EXCLUSIVE, @@ -43,7 +36,7 @@ struct z_erofs_pagevec_ctor { static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor, bool atomic) { - if (ctor->curr == NULL) + if (!ctor->curr) return; if (atomic) @@ -54,25 +47,22 @@ static inline void z_erofs_pagevec_ctor_exit(struct z_erofs_pagevec_ctor *ctor, static inline struct page * z_erofs_pagevec_ctor_next_page(struct z_erofs_pagevec_ctor *ctor, - unsigned nr) + unsigned int nr) { - unsigned index; + unsigned int index; /* keep away from occupied pages */ - if (ctor->next != NULL) + if (ctor->next) return ctor->next; for (index = 0; index < nr; ++index) { const erofs_vtptr_t t = ctor->pages[index]; - const unsigned tags = tagptr_unfold_tags(t); + const unsigned int tags = tagptr_unfold_tags(t); if (tags == Z_EROFS_PAGE_TYPE_EXCLUSIVE) return tagptr_unfold_ptr(t); } - - if (unlikely(nr >= ctor->nr)) - BUG(); - + DBG_BUGON(nr >= ctor->nr); return NULL; } @@ -94,8 +84,9 @@ z_erofs_pagevec_ctor_pagedown(struct z_erofs_pagevec_ctor *ctor, } static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor, - unsigned nr, - erofs_vtptr_t *pages, unsigned i) + unsigned int nr, + erofs_vtptr_t *pages, + unsigned int i) { ctor->nr = nr; ctor->curr = ctor->next = NULL; @@ -109,23 +100,21 @@ static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor, z_erofs_pagevec_ctor_pagedown(ctor, false); } } - ctor->next = z_erofs_pagevec_ctor_next_page(ctor, i); ctor->index = i; } -static inline bool -z_erofs_pagevec_ctor_enqueue(struct z_erofs_pagevec_ctor *ctor, - struct page *page, - enum z_erofs_page_type type, - bool *occupied) +static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor, + struct page *page, + enum z_erofs_page_type type, + bool *occupied) { *occupied = false; - if (unlikely(ctor->next == NULL && type)) + if (!ctor->next && type) if (ctor->index + 1 == ctor->nr) return false; - if (unlikely(ctor->index >= ctor->nr)) + if (ctor->index >= ctor->nr) z_erofs_pagevec_ctor_pagedown(ctor, false); /* exclusive page type must be 0 */ @@ -137,19 +126,17 @@ z_erofs_pagevec_ctor_enqueue(struct z_erofs_pagevec_ctor *ctor, ctor->next = page; *occupied = true; } - - ctor->pages[ctor->index++] = - tagptr_fold(erofs_vtptr_t, page, type); + ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, page, type); return true; } static inline struct page * -z_erofs_pagevec_ctor_dequeue(struct z_erofs_pagevec_ctor *ctor, - enum z_erofs_page_type *type) +z_erofs_pagevec_dequeue(struct z_erofs_pagevec_ctor *ctor, + enum z_erofs_page_type *type) { erofs_vtptr_t t; - if (unlikely(ctor->index >= ctor->nr)) { + if (ctor->index >= ctor->nr) { DBG_BUGON(!ctor->next); z_erofs_pagevec_ctor_pagedown(ctor, true); } @@ -162,11 +149,7 @@ z_erofs_pagevec_ctor_dequeue(struct z_erofs_pagevec_ctor *ctor, if (*type == (uintptr_t)ctor->next) ctor->next = tagptr_unfold_ptr(t); - ctor->pages[ctor->index++] = - tagptr_fold(erofs_vtptr_t, NULL, 0); - + ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, NULL, 0); return tagptr_unfold_ptr(t); } - #endif - diff --git a/drivers/staging/erofs/include/trace/events/erofs.h b/include/trace/events/erofs.h
similarity index 85% rename from drivers/staging/erofs/include/trace/events/erofs.h rename to include/trace/events/erofs.h index 5aead93a762f..27f5caa6299a 100644 --- a/drivers/staging/erofs/include/trace/events/erofs.h +++ b/include/trace/events/erofs.h @@ -1,4 +1,4 @@ -/* SPDX-License-Identifier: GPL-2.0 */ +/* SPDX-License-Identifier: GPL-2.0-only */ #undef TRACE_SYSTEM #define TRACE_SYSTEM erofs @@ -6,6 +6,9 @@ #define _TRACE_EROFS_H #include +#include + +struct erofs_map_blocks; #define show_dev(dev) MAJOR(dev), MINOR(dev) #define show_dev_nid(entry) show_dev(entry->dev), entry->nid @@ -38,7 +41,7 @@ TRACE_EVENT(erofs_lookup, TP_fast_assign( __entry->dev = dir->i_sb->s_dev; - __entry->nid = EROFS_V(dir)->nid; + __entry->nid = EROFS_I(dir)->nid; __entry->name = dentry->d_name.name; __entry->flags = flags; ), @@ -63,7 +66,7 @@ TRACE_EVENT(erofs_fill_inode, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; - __entry->nid = EROFS_V(inode)->nid; + __entry->nid = EROFS_I(inode)->nid; __entry->blkaddr = erofs_blknr(iloc(EROFS_I_SB(inode), __entry->nid)); __entry->ofs = erofs_blkoff(iloc(EROFS_I_SB(inode), __entry->nid)); __entry->isdir = isdir; @@ -92,7 +95,7 @@ TRACE_EVENT(erofs_readpage, TP_fast_assign( __entry->dev = page->mapping->host->i_sb->s_dev; - __entry->nid = EROFS_V(page->mapping->host)->nid; + __entry->nid = EROFS_I(page->mapping->host)->nid; __entry->dir = S_ISDIR(page->mapping->host->i_mode); __entry->index = page->index; __entry->uptodate = PageUptodate(page); @@ -125,7 +128,7 @@ TRACE_EVENT(erofs_readpages, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; - __entry->nid = EROFS_V(inode)->nid; + __entry->nid = EROFS_I(inode)->nid; __entry->start = page->index; __entry->nrpage = nrpage; __entry->raw = raw; @@ -154,7 +157,7 @@ DECLARE_EVENT_CLASS(erofs__map_blocks_enter, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; - __entry->nid = EROFS_V(inode)->nid; + __entry->nid = EROFS_I(inode)->nid; __entry->la = map->m_la; __entry->llen = map->m_llen; __entry->flags = flags; @@ -162,7 +165,8 @@ DECLARE_EVENT_CLASS(erofs__map_blocks_enter, TP_printk("dev = (%d,%d), nid = %llu, la %llu llen %llu flags %s", show_dev_nid(__entry), - __entry->la, __entry->llen, show_map_flags(__entry->flags)) + __entry->la, __entry->llen, + __entry->flags ? show_map_flags(__entry->flags) : "NULL") ); DEFINE_EVENT(erofs__map_blocks_enter, erofs_map_blocks_flatmode_enter, @@ -172,6 +176,13 @@ DEFINE_EVENT(erofs__map_blocks_enter, erofs_map_blocks_flatmode_enter, TP_ARGS(inode, map, flags) ); +DEFINE_EVENT(erofs__map_blocks_enter, z_erofs_map_blocks_iter_enter, + TP_PROTO(struct inode *inode, struct erofs_map_blocks *map, + unsigned int flags), + + TP_ARGS(inode, map, flags) +); + DECLARE_EVENT_CLASS(erofs__map_blocks_exit, TP_PROTO(struct inode *inode, struct erofs_map_blocks *map, unsigned int flags, int ret), @@ -192,7 +203,7 @@ DECLARE_EVENT_CLASS(erofs__map_blocks_exit, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; - __entry->nid = EROFS_V(inode)->nid; + __entry->nid = EROFS_I(inode)->nid; __entry->flags = flags; __entry->la = map->m_la; __entry->pa = map->m_pa; @@ -204,7 +215,8 @@ DECLARE_EVENT_CLASS(erofs__map_blocks_exit, TP_printk("dev = (%d,%d), nid = %llu, flags %s " "la %llu pa %llu llen %llu plen %llu mflags %s ret %d", - show_dev_nid(__entry), show_map_flags(__entry->flags), + show_dev_nid(__entry), + __entry->flags ? 
show_map_flags(__entry->flags) : "NULL", __entry->la, __entry->pa, __entry->llen, __entry->plen, show_mflags(__entry->mflags), __entry->ret) ); @@ -216,6 +228,13 @@ DEFINE_EVENT(erofs__map_blocks_exit, erofs_map_blocks_flatmode_exit, TP_ARGS(inode, map, flags, ret) ); +DEFINE_EVENT(erofs__map_blocks_exit, z_erofs_map_blocks_iter_exit, + TP_PROTO(struct inode *inode, struct erofs_map_blocks *map, + unsigned int flags, int ret), + + TP_ARGS(inode, map, flags, ret) +); + TRACE_EVENT(erofs_destroy_inode, TP_PROTO(struct inode *inode), @@ -228,7 +247,7 @@ TRACE_EVENT(erofs_destroy_inode, TP_fast_assign( __entry->dev = inode->i_sb->s_dev; - __entry->nid = EROFS_V(inode)->nid; + __entry->nid = EROFS_I(inode)->nid; ), TP_printk("dev = (%d,%d), nid = %llu", show_dev_nid(__entry)) From e6ac4b0ae8bd6f7042e15a2496c1274149dd1e05 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 30 Aug 2021 11:44:53 +0800 Subject: [PATCH 05/34] erofs: move from drivers/staging/ to fs/ Since 5.4, erofs has been moved into fs/. Signed-off-by: Gao Xiang --- drivers/staging/Kconfig | 2 -- drivers/staging/Makefile | 1 - fs/Kconfig | 1 + fs/Makefile | 1 + {drivers/staging => fs}/erofs/Kconfig | 0 {drivers/staging => fs}/erofs/Makefile | 0 {drivers/staging => fs}/erofs/TODO | 0 {drivers/staging => fs}/erofs/compress.h | 0 {drivers/staging => fs}/erofs/data.c | 0 {drivers/staging => fs}/erofs/decompressor.c | 0 {drivers/staging => fs}/erofs/dir.c | 0 {drivers/staging => fs}/erofs/erofs_fs.h | 0 {drivers/staging => fs}/erofs/inode.c | 0 {drivers/staging => fs}/erofs/internal.h | 0 {drivers/staging => fs}/erofs/namei.c | 0 {drivers/staging => fs}/erofs/super.c | 0 {drivers/staging => fs}/erofs/tagptr.h | 0 {drivers/staging => fs}/erofs/utils.c | 0 {drivers/staging => fs}/erofs/xattr.c | 0 {drivers/staging => fs}/erofs/xattr.h | 0 {drivers/staging => fs}/erofs/zdata.c | 0 {drivers/staging => fs}/erofs/zdata.h | 0 {drivers/staging => fs}/erofs/zmap.c | 0 {drivers/staging => fs}/erofs/zpvec.h | 0 24 files changed, 2 insertions(+), 3 deletions(-) rename {drivers/staging => fs}/erofs/Kconfig (100%) rename {drivers/staging => fs}/erofs/Makefile (100%) rename {drivers/staging => fs}/erofs/TODO (100%) rename {drivers/staging => fs}/erofs/compress.h (100%) rename {drivers/staging => fs}/erofs/data.c (100%) rename {drivers/staging => fs}/erofs/decompressor.c (100%) rename {drivers/staging => fs}/erofs/dir.c (100%) rename {drivers/staging => fs}/erofs/erofs_fs.h (100%) rename {drivers/staging => fs}/erofs/inode.c (100%) rename {drivers/staging => fs}/erofs/internal.h (100%) rename {drivers/staging => fs}/erofs/namei.c (100%) rename {drivers/staging => fs}/erofs/super.c (100%) rename {drivers/staging => fs}/erofs/tagptr.h (100%) rename {drivers/staging => fs}/erofs/utils.c (100%) rename {drivers/staging => fs}/erofs/xattr.c (100%) rename {drivers/staging => fs}/erofs/xattr.h (100%) rename {drivers/staging => fs}/erofs/zdata.c (100%) rename {drivers/staging => fs}/erofs/zdata.h (100%) rename {drivers/staging => fs}/erofs/zmap.c (100%) rename {drivers/staging => fs}/erofs/zpvec.h (100%) diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig index 58e8140c1f57..2705c72515c5 100644 --- a/drivers/staging/Kconfig +++ b/drivers/staging/Kconfig @@ -122,6 +122,4 @@ source "drivers/staging/gasket/Kconfig" source "drivers/staging/axis-fifo/Kconfig" -source "drivers/staging/erofs/Kconfig" - endif # STAGING diff --git a/drivers/staging/Makefile b/drivers/staging/Makefile index 6f3065569fa0..cac453fcd774 100644 --- a/drivers/staging/Makefile +++ 
b/drivers/staging/Makefile @@ -51,4 +51,3 @@ obj-$(CONFIG_SOC_MT7621) += mt7621-eth/ obj-$(CONFIG_SOC_MT7621) += mt7621-dts/ obj-$(CONFIG_STAGING_GASKET_FRAMEWORK) += gasket/ obj-$(CONFIG_XIL_AXIS_FIFO) += axis-fifo/ -obj-$(CONFIG_EROFS_FS) += erofs/ diff --git a/fs/Kconfig b/fs/Kconfig index ac474a61be37..d9e20290da75 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -255,6 +255,7 @@ source "fs/pstore/Kconfig" source "fs/sysv/Kconfig" source "fs/ufs/Kconfig" source "fs/exofs/Kconfig" +source "fs/erofs/Kconfig" endif # MISC_FILESYSTEMS diff --git a/fs/Makefile b/fs/Makefile index 293733f61594..0f6bad3ac103 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -128,3 +128,4 @@ obj-y += exofs/ # Multiple modules obj-$(CONFIG_CEPH_FS) += ceph/ obj-$(CONFIG_PSTORE) += pstore/ obj-$(CONFIG_EFIVAR_FS) += efivarfs/ +obj-$(CONFIG_EROFS_FS) += erofs/ diff --git a/drivers/staging/erofs/Kconfig b/fs/erofs/Kconfig similarity index 100% rename from drivers/staging/erofs/Kconfig rename to fs/erofs/Kconfig diff --git a/drivers/staging/erofs/Makefile b/fs/erofs/Makefile similarity index 100% rename from drivers/staging/erofs/Makefile rename to fs/erofs/Makefile diff --git a/drivers/staging/erofs/TODO b/fs/erofs/TODO similarity index 100% rename from drivers/staging/erofs/TODO rename to fs/erofs/TODO diff --git a/drivers/staging/erofs/compress.h b/fs/erofs/compress.h similarity index 100% rename from drivers/staging/erofs/compress.h rename to fs/erofs/compress.h diff --git a/drivers/staging/erofs/data.c b/fs/erofs/data.c similarity index 100% rename from drivers/staging/erofs/data.c rename to fs/erofs/data.c diff --git a/drivers/staging/erofs/decompressor.c b/fs/erofs/decompressor.c similarity index 100% rename from drivers/staging/erofs/decompressor.c rename to fs/erofs/decompressor.c diff --git a/drivers/staging/erofs/dir.c b/fs/erofs/dir.c similarity index 100% rename from drivers/staging/erofs/dir.c rename to fs/erofs/dir.c diff --git a/drivers/staging/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h similarity index 100% rename from drivers/staging/erofs/erofs_fs.h rename to fs/erofs/erofs_fs.h diff --git a/drivers/staging/erofs/inode.c b/fs/erofs/inode.c similarity index 100% rename from drivers/staging/erofs/inode.c rename to fs/erofs/inode.c diff --git a/drivers/staging/erofs/internal.h b/fs/erofs/internal.h similarity index 100% rename from drivers/staging/erofs/internal.h rename to fs/erofs/internal.h diff --git a/drivers/staging/erofs/namei.c b/fs/erofs/namei.c similarity index 100% rename from drivers/staging/erofs/namei.c rename to fs/erofs/namei.c diff --git a/drivers/staging/erofs/super.c b/fs/erofs/super.c similarity index 100% rename from drivers/staging/erofs/super.c rename to fs/erofs/super.c diff --git a/drivers/staging/erofs/tagptr.h b/fs/erofs/tagptr.h similarity index 100% rename from drivers/staging/erofs/tagptr.h rename to fs/erofs/tagptr.h diff --git a/drivers/staging/erofs/utils.c b/fs/erofs/utils.c similarity index 100% rename from drivers/staging/erofs/utils.c rename to fs/erofs/utils.c diff --git a/drivers/staging/erofs/xattr.c b/fs/erofs/xattr.c similarity index 100% rename from drivers/staging/erofs/xattr.c rename to fs/erofs/xattr.c diff --git a/drivers/staging/erofs/xattr.h b/fs/erofs/xattr.h similarity index 100% rename from drivers/staging/erofs/xattr.h rename to fs/erofs/xattr.h diff --git a/drivers/staging/erofs/zdata.c b/fs/erofs/zdata.c similarity index 100% rename from drivers/staging/erofs/zdata.c rename to fs/erofs/zdata.c diff --git a/drivers/staging/erofs/zdata.h b/fs/erofs/zdata.h 
similarity index 100% rename from drivers/staging/erofs/zdata.h rename to fs/erofs/zdata.h diff --git a/drivers/staging/erofs/zmap.c b/fs/erofs/zmap.c similarity index 100% rename from drivers/staging/erofs/zmap.c rename to fs/erofs/zmap.c diff --git a/drivers/staging/erofs/zpvec.h b/fs/erofs/zpvec.h similarity index 100% rename from drivers/staging/erofs/zpvec.h rename to fs/erofs/zpvec.h From d4cd715771c3f919dc9089bbcca12d82774b2f97 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 25 Oct 2021 15:43:11 +0800 Subject: [PATCH 06/34] erofs: don't trigger WARN() when decompression fails [ Upstream commit a0961f351d82d43ab0b845304caa235dfe249ae9 ] syzbot reported a WARNING [1] due to corrupted compressed data. As Dmitry said, "If this is not a kernel bug, then the code should not use WARN. WARN is for kernel bugs and is recognized as such by all testing systems and humans." [1] https://lore.kernel.org/r/000000000000b3586105cf0ff45e@google.com Link: https://lore.kernel.org/r/20211025074311.130395-1-hsiangkao@linux.alibaba.com Cc: Dmitry Vyukov Reviewed-by: Chao Yu Reported-by: syzbot+d8aaffc3719597e8cfb4@syzkaller.appspotmail.com Signed-off-by: Gao Xiang Signed-off-by: Sasha Levin --- fs/erofs/decompressor.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 7822ab4a8262..1893910f489a 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -169,7 +169,6 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]", ret, inlen, inputmargin, rq->outputsize); - WARN_ON(1); print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET, 16, 1, src + inputmargin, inlen, true); print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET, From b0429df102fbfdf220a929aa73caafe3e449f14d Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Tue, 16 Nov 2021 09:08:18 +0800 Subject: [PATCH 07/34] erofs: remove the occupied parameter from z_erofs_pagevec_enqueue() commit 7dea3de7d384f4c8156e8bd93112ba6db1eb276c upstream. The variable occupied is never used in z_erofs_attach_page(), which is the only caller of z_erofs_pagevec_enqueue(), so remove this unneeded parameter. Link: https://lore.kernel.org/r/20210419102623.2015-1-zbestahu@gmail.com Signed-off-by: Yue Hu Reviewed-by: Gao Xiang Signed-off-by: Gao Xiang Signed-off-by: Greg Kroah-Hartman --- fs/erofs/zdata.c | 4 +--- fs/erofs/zpvec.h | 5 +---- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 37c3e28b1e91..6dcb069b16da 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -291,7 +291,6 @@ static int z_erofs_attach_page(struct z_erofs_collector *clt, enum z_erofs_page_type type) { int ret; - bool occupied; /* give priority for inplaceio */ if (clt->mode >= COLLECT_PRIMARY && @@ -299,8 +298,7 @@ static int z_erofs_attach_page(struct z_erofs_collector *clt, z_erofs_try_inplace_io(clt, page)) return 0; - ret = z_erofs_pagevec_enqueue(&clt->vector, - page, type, &occupied); + ret = z_erofs_pagevec_enqueue(&clt->vector, page, type); clt->cl->vcnt += (unsigned int)ret; return ret ?
0 : -EAGAIN; diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h index bfc12ecbbe83..dfd7fe0503bb 100644 --- a/fs/erofs/zpvec.h +++ b/fs/erofs/zpvec.h @@ -106,10 +106,8 @@ static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor, static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor, struct page *page, - enum z_erofs_page_type type, - bool *occupied) + enum z_erofs_page_type type) { - *occupied = false; if (!ctor->next && type) if (ctor->index + 1 == ctor->nr) return false; @@ -124,7 +122,6 @@ static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor, /* should remind that collector->next never equal to 1, 2 */ if (type == (uintptr_t)ctor->next) { ctor->next = page; - *occupied = true; } ctor->pages[ctor->index++] = tagptr_fold(erofs_vtptr_t, page, type); return true; From e0bca84ae60e9676ed0741815b03b0b0eb0af0e1 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 16 Nov 2021 09:08:19 +0800 Subject: [PATCH 08/34] erofs: fix unsafe pagevec reuse of hooked pclusters commit 86432a6dca9bed79111990851df5756d3eb5f57c upstream. There are pclusters in runtime marked with Z_EROFS_PCLUSTER_TAIL before actual I/O submission. Thus, the decompression chain can be extended if the following pcluster chain hooks such tail pcluster. As the related comment mentioned, if some page is made of a hooked pcluster and another followed pcluster, it can be reused for in-place I/O (since I/O should be submitted anyway): _______________________________________________________________ | tail (partial) page | head (partial) page | |_____PRIMARY_HOOKED___|____________PRIMARY_FOLLOWED____________| However, it's by no means safe to reuse it as a pagevec, since such PRIMARY_HOOKED pclusters may finally move into the bypass chain without I/O submission. It's somewhat hard to reproduce with LZ4 and I just found it (general protection fault) by ro_fsstressing an LZMA image for a long time. I'm going to actively clean up related code together with multi-page folio adaptation in the next few months. Let's address it directly for easier backporting for now.
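To make the new rule concrete, here is a minimal standalone C model of the downgrade decision (the names are simplified stand-ins for the kernel ones in the zpvec.h hunk below; this is userspace C for illustration, not kernel code):

	#include <stdbool.h>

	enum page_type {                /* models z_erofs_page_type */
		PAGE_TYPE_EXCLUSIVE,    /* may be reused to back the pagevec */
		PAGE_TYPE_TAIL_SHARED,  /* shared page, never reused as pagevec */
	};

	/*
	 * An EXCLUSIVE page may only back the pagevec when its pcluster is
	 * guaranteed to go through I/O submission (PRIMARY_FOLLOWED); hooked
	 * pclusters can end up in the bypass queue, so their pages must be
	 * downgraded before being enqueued.
	 */
	static enum page_type classify(enum page_type type, bool pvec_safereuse)
	{
		if (type == PAGE_TYPE_EXCLUSIVE && !pvec_safereuse)
			return PAGE_TYPE_TAIL_SHARED;
		return type;
	}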
Call trace for reference: z_erofs_decompress_pcluster+0x10a/0x8a0 [erofs] z_erofs_decompress_queue.isra.36+0x3c/0x60 [erofs] z_erofs_runqueue+0x5f3/0x840 [erofs] z_erofs_readahead+0x1e8/0x320 [erofs] read_pages+0x91/0x270 page_cache_ra_unbounded+0x18b/0x240 filemap_get_pages+0x10a/0x5f0 filemap_read+0xa9/0x330 new_sync_read+0x11b/0x1a0 vfs_read+0xf1/0x190 Link: https://lore.kernel.org/r/20211103182006.4040-1-xiang@kernel.org Fixes: 3883a79abd02 ("staging: erofs: introduce VLE decompression support") Cc: # 4.19+ Reviewed-by: Chao Yu Signed-off-by: Gao Xiang Signed-off-by: Greg Kroah-Hartman --- fs/erofs/zdata.c | 13 +++++++------ fs/erofs/zpvec.h | 13 ++++++++++--- 2 files changed, 17 insertions(+), 9 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 6dcb069b16da..e0659382857d 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -287,8 +287,8 @@ static inline bool z_erofs_try_inplace_io(struct z_erofs_collector *clt, /* callers must be with collection lock held */ static int z_erofs_attach_page(struct z_erofs_collector *clt, - struct page *page, - enum z_erofs_page_type type) + struct page *page, enum z_erofs_page_type type, + bool pvec_safereuse) { int ret; @@ -298,9 +298,9 @@ static int z_erofs_attach_page(struct z_erofs_collector *clt, z_erofs_try_inplace_io(clt, page)) return 0; - ret = z_erofs_pagevec_enqueue(&clt->vector, page, type); + ret = z_erofs_pagevec_enqueue(&clt->vector, page, type, + pvec_safereuse); clt->cl->vcnt += (unsigned int)ret; - return ret ? 0 : -EAGAIN; } @@ -642,7 +642,8 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, tight &= (clt->mode >= COLLECT_PRIMARY_FOLLOWED); retry: - err = z_erofs_attach_page(clt, page, page_type); + err = z_erofs_attach_page(clt, page, page_type, + clt->mode >= COLLECT_PRIMARY_FOLLOWED); /* should allocate an additional staging page for pagevec */ if (err == -EAGAIN) { struct page *const newpage = @@ -650,7 +651,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, newpage->mapping = Z_EROFS_MAPPING_STAGING; err = z_erofs_attach_page(clt, newpage, - Z_EROFS_PAGE_TYPE_EXCLUSIVE); + Z_EROFS_PAGE_TYPE_EXCLUSIVE, true); if (!err) goto retry; } diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h index dfd7fe0503bb..b05464f4a808 100644 --- a/fs/erofs/zpvec.h +++ b/fs/erofs/zpvec.h @@ -106,11 +106,18 @@ static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor, static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor, struct page *page, - enum z_erofs_page_type type) + enum z_erofs_page_type type, + bool pvec_safereuse) { - if (!ctor->next && type) - if (ctor->index + 1 == ctor->nr) + if (!ctor->next) { + /* some pages cannot be reused as pvec safely without I/O */ + if (type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && !pvec_safereuse) + type = Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED; + + if (type != Z_EROFS_PAGE_TYPE_EXCLUSIVE && + ctor->index + 1 == ctor->nr) return false; + } if (ctor->index >= ctor->nr) z_erofs_pagevec_ctor_pagedown(ctor, false); From 814be7844d0f95b9f0643d5e190013747edb728f Mon Sep 17 00:00:00 2001 From: Vladimir Zapolskiy Date: Fri, 30 Oct 2020 14:28:39 +0200 Subject: [PATCH 09/34] erofs: remove a void EROFS_VERSION macro set in Makefile Since commit 4f761fa253b4 ("erofs: rename errln/infoln/debugln to erofs_{err, info, dbg}") the defined macro EROFS_VERSION has no effect, therefore removing it from the Makefile is a non-functional change.
Link: https://lore.kernel.org/r/20201030122839.25431-1-vladimir@tuxera.com Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Signed-off-by: Vladimir Zapolskiy Signed-off-by: Gao Xiang --- fs/erofs/Makefile | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index 46f2aa4ba46c..af159539fc1b 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -1,11 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only -EROFS_VERSION = "1.0" - -ccflags-y += -DEROFS_VERSION=\"$(EROFS_VERSION)\" - obj-$(CONFIG_EROFS_FS) += erofs.o erofs-objs := super.o inode.o data.o namei.o dir.o utils.o erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o - From 77e2119ba87a9924780bcaee4365c98da10f17d8 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 8 Dec 2020 17:58:32 +0800 Subject: [PATCH 10/34] erofs: get rid of magical Z_EROFS_MAPPING_STAGING Previously, we played around with magical page->mapping for short-lived temporary pages since we need to identify different types of pages in the same pcluster but both invalidated and short-lived temporary pages can have page->mapping == NULL. It was considered safe because temporary pages are all non-LRU / non-movable pages. This patch uses specific page->private values to identify short-lived pages instead, so it won't rely on page->mapping anymore. Details are described in "compress.h" as well. Link: https://lore.kernel.org/r/20201208095834.3133565-1-hsiangkao@redhat.com Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/compress.h | 57 +++++++++++++++++++++++++++++------------ fs/erofs/decompressor.c | 2 +- fs/erofs/zdata.c | 44 +++++++++++++++++++------------ fs/erofs/zdata.h | 1 + 4 files changed, 70 insertions(+), 34 deletions(-) diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index e9b1aa0afe0e..07cb63d098d9 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -25,30 +25,55 @@ struct z_erofs_decompress_req { bool inplace_io, partial_decoding; }; +/* some special page->private (unsigned long, see below) */ +#define Z_EROFS_SHORTLIVED_PAGE (-1UL << 2) + /* - * - 0x5A110C8D ('sallocated', Z_EROFS_MAPPING_STAGING) - - * used to mark temporary allocated pages from other - * file/cached pages and NULL mapping pages. + * For all pages in a pcluster, page->private should be one of + * Type Last 2bits page->private + * short-lived page 00 Z_EROFS_SHORTLIVED_PAGE + * cached/managed page 00 pointer to z_erofs_pcluster + * online page (file-backed, 01/10/11 sub-index << 2 | count + * some pages can be used for inplace I/O) + * + * page->mapping should be one of + * Type page->mapping + * short-lived page NULL + * cached/managed page non-NULL or NULL (invalidated/truncated page) + * online page non-NULL + * + * For all managed pages, PG_private should be set with 1 extra refcount, + * which is used for page reclaim / migration. */ -#define Z_EROFS_MAPPING_STAGING ((void *)0x5A110C8D) -/* check if a page is marked as staging */ -static inline bool z_erofs_page_is_staging(struct page *page) +/* + * short-lived pages are pages directly from buddy system with specific + * page->private (no need to set PagePrivate since these are non-LRU / + * non-movable pages and bypass reclaim / migration code).
+ */ +static inline bool z_erofs_is_shortlived_page(struct page *page) { - return page->mapping == Z_EROFS_MAPPING_STAGING; -} - -static inline bool z_erofs_put_stagingpage(struct list_head *pagepool, - struct page *page) -{ - if (!z_erofs_page_is_staging(page)) + if (page->private != Z_EROFS_SHORTLIVED_PAGE) return false; - /* staging pages should not be used by others at the same time */ - if (page_ref_count(page) > 1) + DBG_BUGON(page->mapping); + return true; +} + +static inline bool z_erofs_put_shortlivedpage(struct list_head *pagepool, + struct page *page) +{ + if (!z_erofs_is_shortlived_page(page)) + return false; + + /* short-lived pages should not be used by others at the same time */ + if (page_ref_count(page) > 1) { put_page(page); - else + } else { + /* follow the pcluster rule above. */ + set_page_private(page, 0); list_add(&page->lru, pagepool); + } return true; } diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 1893910f489a..d2853876e91d 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -75,7 +75,7 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, victim = erofs_allocpage(pagepool, GFP_KERNEL); if (!victim) return -ENOMEM; - victim->mapping = Z_EROFS_MAPPING_STAGING; + set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE); } rq->out[i] = victim; } diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index e0659382857d..0cf4c6d2bf5b 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -235,9 +235,9 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, /* barrier is implied in the following 'unlock_page' */ WRITE_ONCE(pcl->compressed_pages[i], NULL); + set_page_private(page, 0); ClearPagePrivate(page); - unlock_page(page); put_page(page); } @@ -264,6 +264,7 @@ int erofs_try_to_free_cached_page(struct address_space *mapping, erofs_workgroup_unfreeze(&pcl->obj, 1); if (ret) { + set_page_private(page, 0); ClearPagePrivate(page); put_page(page); } @@ -644,12 +645,12 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, retry: err = z_erofs_attach_page(clt, page, page_type, clt->mode >= COLLECT_PRIMARY_FOLLOWED); - /* should allocate an additional staging page for pagevec */ + /* should allocate an additional short-lived page for pagevec */ if (err == -EAGAIN) { struct page *const newpage = alloc_page(GFP_NOFS | __GFP_NOFAIL); - newpage->mapping = Z_EROFS_MAPPING_STAGING; + set_page_private(newpage, Z_EROFS_SHORTLIVED_PAGE); err = z_erofs_attach_page(clt, newpage, Z_EROFS_PAGE_TYPE_EXCLUSIVE, true); if (!err) @@ -706,6 +707,11 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, queue_work(z_erofs_workqueue, &io->u.work); } +static bool z_erofs_page_is_invalidated(struct page *page) +{ + return !page->mapping && !z_erofs_is_shortlived_page(page); +} + static void z_erofs_decompressqueue_endio(struct bio *bio) { tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private); @@ -718,7 +724,7 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) struct page *page = bvec->bv_page; DBG_BUGON(PageUptodate(page)); - DBG_BUGON(!page->mapping); + DBG_BUGON(z_erofs_page_is_invalidated(page)); if (err) SetPageError(page); @@ -791,9 +797,9 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, /* all pages in pagevec ought to be valid */ DBG_BUGON(!page); - DBG_BUGON(!page->mapping); + DBG_BUGON(z_erofs_page_is_invalidated(page)); - if (z_erofs_put_stagingpage(pagepool, page)) + if (z_erofs_put_shortlivedpage(pagepool, page)) continue; if (page_type == 
Z_EROFS_VLE_PAGE_TYPE_HEAD) @@ -827,9 +833,9 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, /* all compressed pages ought to be valid */ DBG_BUGON(!page); - DBG_BUGON(!page->mapping); + DBG_BUGON(z_erofs_page_is_invalidated(page)); - if (!z_erofs_page_is_staging(page)) { + if (!z_erofs_is_shortlived_page(page)) { if (erofs_page_is_managed(sbi, page)) { if (!PageUptodate(page)) err = -EIO; @@ -854,7 +860,7 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, overlapped = true; } - /* PG_error needs checking for inplaced and staging pages */ + /* PG_error needs checking for all non-managed pages */ if (PageError(page)) { DBG_BUGON(PageUptodate(page)); err = -EIO; @@ -893,8 +899,8 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, if (erofs_page_is_managed(sbi, page)) continue; - /* recycle all individual staging pages */ - (void)z_erofs_put_stagingpage(pagepool, page); + /* recycle all individual short-lived pages */ + (void)z_erofs_put_shortlivedpage(pagepool, page); WRITE_ONCE(compressed_pages[i], NULL); } @@ -904,10 +910,10 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, if (!page) continue; - DBG_BUGON(!page->mapping); + DBG_BUGON(z_erofs_page_is_invalidated(page)); - /* recycle all individual staging pages */ - if (z_erofs_put_stagingpage(pagepool, page)) + /* recycle all individual short-lived pages */ + if (z_erofs_put_shortlivedpage(pagepool, page)) continue; if (err < 0) @@ -1007,13 +1013,17 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, mapping = READ_ONCE(page->mapping); /* - * unmanaged (file) pages are all locked solidly, + * file-backed online pages in the pcluster are all locked steadily, + * therefore it is impossible for `mapping' to be NULL. */ if (mapping && mapping != mc) /* ought to be unmanaged pages */ goto out; + /* directly return for shortlived page as well */ + if (z_erofs_is_shortlived_page(page)) + goto out; + lock_page(page); /* only true if page reclaim goes wrong, should never happen */ @@ -1058,8 +1068,8 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, out_allocpage: page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) { - /* non-LRU / non-movable temporary page is needed */ - page->mapping = Z_EROFS_MAPPING_STAGING; + /* turn into temporary page if fails */ + set_page_private(page, Z_EROFS_SHORTLIVED_PAGE); tocache = false; } diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 68da309d5ad7..658bda5efcda 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -172,6 +172,7 @@ static inline void z_erofs_onlinepage_endio(struct page *page) v = atomic_dec_return(u.o); if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) { + set_page_private(page, 0); ClearPagePrivate(page); if (!PageError(page)) SetPageUptodate(page); From d25241a02c9683c2cee9275c4ccf3ac81db2944b Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 8 Dec 2020 17:58:33 +0800 Subject: [PATCH 11/34] erofs: insert to managed cache after adding to pcl Previously, there could be some concern about calling add_to_page_cache_lru() with page->mapping == Z_EROFS_MAPPING_STAGING (!= NULL). In contrast, page->private is used instead now, so partially revert commit 5ddcee1f3a1c ("erofs: get rid of __stagingpage_alloc helper") with some adaptation for simplicity.
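In other words, the page is now attached to the pcluster first and only inserted into the managed cache afterwards; a rough sketch of the resulting order (condensed from the zdata.c hunk below, error handling omitted):

	page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL);
	if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) {
		/* lost the race: recycle the page into pagepool and retry */
	} else if (add_to_page_cache_lru(page, mc, index + nr, gfp)) {
		/* cache insertion failed: fall back to a short-lived page */
	} else {
		/* managed cache page: page->private points to the pcluster */
	}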
Link: https://lore.kernel.org/r/20201208095834.3133565-2-hsiangkao@redhat.com Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 24 ++++++++---------------- 1 file changed, 8 insertions(+), 16 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 0cf4c6d2bf5b..3cfd10481bda 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1067,28 +1067,20 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, put_page(page); out_allocpage: page = erofs_allocpage(pagepool, gfp | __GFP_NOFAIL); - if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) { - /* turn into temporary page if fails */ - set_page_private(page, Z_EROFS_SHORTLIVED_PAGE); - tocache = false; - } - if (oldpage != cmpxchg(&pcl->compressed_pages[nr], oldpage, page)) { - if (tocache) { - /* since it added to managed cache successfully */ - unlock_page(page); - put_page(page); - } else { - list_add(&page->lru, pagepool); - } + list_add(&page->lru, pagepool); cond_resched(); goto repeat; } - if (tocache) { - set_page_private(page, (unsigned long)pcl); - SetPagePrivate(page); + if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) { + /* turn into temporary page if fails (1 ref) */ + set_page_private(page, Z_EROFS_SHORTLIVED_PAGE); + goto out; } + set_page_private(page, (unsigned long)pcl); + SetPagePrivate(page); + out: /* the only exit (for tracing and debugging) */ return page; } From 6fdf3375a10262f74a9bb66812a724eddae65528 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 8 Dec 2020 17:58:34 +0800 Subject: [PATCH 12/34] erofs: simplify try_to_claim_pcluster() simplify try_to_claim_pcluster() by directly using cmpxchg() here (the retry loop caused more overhead.) Also, move the chain loop detection in and rename it to z_erofs_try_to_claim_pcluster(). Link: https://lore.kernel.org/r/20201208095834.3133565-3-hsiangkao@redhat.com Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 53 +++++++++++++++++++++++------------------------- 1 file changed, 25 insertions(+), 28 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 3cfd10481bda..f5372cd96ea9 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -305,34 +305,33 @@ static int z_erofs_attach_page(struct z_erofs_collector *clt, return ret ? 0 : -EAGAIN; } -static enum z_erofs_collectmode -try_to_claim_pcluster(struct z_erofs_pcluster *pcl, - z_erofs_next_pcluster_t *owned_head) +static void z_erofs_try_to_claim_pcluster(struct z_erofs_collector *clt) { - /* let's claim these following types of pclusters */ -retry: - if (pcl->next == Z_EROFS_PCLUSTER_NIL) { - /* type 1, nil pcluster */ - if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL, - *owned_head) != Z_EROFS_PCLUSTER_NIL) - goto retry; + struct z_erofs_pcluster *pcl = clt->pcl; + z_erofs_next_pcluster_t *owned_head = &clt->owned_head; + /* type 1, nil pcluster (this pcluster doesn't belong to any chain.) */ + if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_NIL, + *owned_head) == Z_EROFS_PCLUSTER_NIL) { *owned_head = &pcl->next; - /* lucky, I am the followee :) */ - return COLLECT_PRIMARY_FOLLOWED; - } else if (pcl->next == Z_EROFS_PCLUSTER_TAIL) { - /* - * type 2, link to the end of a existing open chain, - * be careful that its submission itself is governed - * by the original owned chain. - */ - if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, - *owned_head) != Z_EROFS_PCLUSTER_TAIL) - goto retry; - *owned_head = Z_EROFS_PCLUSTER_TAIL; - return COLLECT_PRIMARY_HOOKED; + /* so we can attach this pcluster to our submission chain. 
*/ + clt->mode = COLLECT_PRIMARY_FOLLOWED; + return; } - - return COLLECT_PRIMARY; /* :( better luck next time */ + + /* + * type 2, link to the end of an existing open chain, be careful + * that its submission is controlled by the original attached chain. + */ + if (cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, + *owned_head) == Z_EROFS_PCLUSTER_TAIL) { + *owned_head = Z_EROFS_PCLUSTER_TAIL; + clt->mode = COLLECT_PRIMARY_HOOKED; + clt->tailpcl = NULL; + return; + } + /* type 3, it belongs to a chain, but it isn't the end of the chain */ + clt->mode = COLLECT_PRIMARY; } static int z_erofs_lookup_collection(struct z_erofs_collector *clt, @@ -385,10 +384,8 @@ static int z_erofs_lookup_collection(struct z_erofs_collector *clt, /* used to check tail merging loop due to corrupted images */ if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) clt->tailpcl = pcl; - clt->mode = try_to_claim_pcluster(pcl, &clt->owned_head); - /* clean tailpcl if the current owned_head is Z_EROFS_PCLUSTER_TAIL */ - if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) - clt->tailpcl = NULL; + clt->pcl = pcl; + z_erofs_try_to_claim_pcluster(clt); clt->pcl = pcl; clt->cl = cl; return 0; From 33e7e223d8bf0c1fbf93545396be6c5ddb0a83e9 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 9 Dec 2020 20:37:17 +0800 Subject: [PATCH 13/34] erofs: force inplace I/O under low memory scenario Try to forcibly switch to inplace I/O under low-memory scenarios in order to avoid direct memory reclaim due to cached page allocation. Link: https://lore.kernel.org/r/20201209123717.12430-1-hsiangkao@aol.com Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/compress.h | 3 +++ fs/erofs/zdata.c | 48 +++++++++++++++++++++++++++++++++++++-------- 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 07cb63d098d9..3701c72bacb2 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -27,11 +27,13 @@ struct z_erofs_decompress_req { /* some special page->private (unsigned long, see below) */ #define Z_EROFS_SHORTLIVED_PAGE (-1UL << 2) +#define Z_EROFS_PREALLOCATED_PAGE (-2UL << 2) /* * For all pages in a pcluster, page->private should be one of * Type Last 2bits page->private * short-lived page 00 Z_EROFS_SHORTLIVED_PAGE + * preallocated page (tryalloc) 00 Z_EROFS_PREALLOCATED_PAGE * cached/managed page 00 pointer to z_erofs_pcluster * online page (file-backed, 01/10/11 sub-index << 2 | count * some pages can be used for inplace I/O) @@ -39,6 +41,7 @@ struct z_erofs_decompress_req { * page->mapping should be one of * Type page->mapping * short-lived page NULL + * preallocated page NULL + * cached/managed page non-NULL or NULL (invalidated/truncated page) * online page non-NULL * diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index f5372cd96ea9..d4cf12dbc683 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -19,6 +19,11 @@ enum z_erofs_cache_alloctype { DONTALLOC, /* don't allocate any cached pages */ DELAYEDALLOC, /* delayed allocation (at the time of submitting io) */ + /* + * try to use cached I/O if page allocation succeeds or fallback + * to in-place I/O instead to avoid any direct reclaim.
+ */ + TRYALLOC, }; /* @@ -163,13 +168,16 @@ static DEFINE_MUTEX(z_pagemap_global_lock); static void preload_compressed_pages(struct z_erofs_collector *clt, struct address_space *mc, - enum z_erofs_cache_alloctype type) + enum z_erofs_cache_alloctype type, + struct list_head *pagepool) { const struct z_erofs_pcluster *pcl = clt->pcl; const unsigned int clusterpages = BIT(pcl->clusterbits); struct page **pages = clt->compressedpages; pgoff_t index = pcl->obj.index + (pages - pcl->compressed_pages); bool standalone = true; + gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) | + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; if (clt->mode < COLLECT_PRIMARY_FOLLOWED) return; @@ -177,6 +185,7 @@ static void preload_compressed_pages(struct z_erofs_collector *clt, for (; pages < pcl->compressed_pages + clusterpages; ++pages) { struct page *page; compressed_page_t t; + struct page *newpage = NULL; /* the compressed page was loaded before */ if (READ_ONCE(*pages)) @@ -188,7 +197,15 @@ static void preload_compressed_pages(struct z_erofs_collector *clt, t = tag_compressed_page_justfound(page); } else if (type == DELAYEDALLOC) { t = tagptr_init(compressed_page_t, PAGE_UNALLOCATED); + } else if (type == TRYALLOC) { + newpage = erofs_allocpage(pagepool, gfp); + if (!newpage) + goto dontalloc; + + set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); + t = tag_compressed_page_justfound(newpage); } else { /* DONTALLOC */ +dontalloc: if (standalone) clt->compressedpages = pages; standalone = false; @@ -198,8 +215,12 @@ static void preload_compressed_pages(struct z_erofs_collector *clt, if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t))) continue; - if (page) + if (page) { put_page(page); + } else if (newpage) { + set_page_private(newpage, 0); + list_add(&newpage->lru, pagepool); + } } if (standalone) /* downgrade to PRIMARY_FOLLOWED_NOINPLACE */ @@ -555,7 +576,7 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe, } static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, - struct page *page) + struct page *page, struct list_head *pagepool) { struct inode *const inode = fe->inode; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); @@ -608,11 +629,12 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, /* preload all compressed pages (maybe downgrade role if necessary) */ if (should_alloc_managed_pages(fe, sbi->cache_strategy, map->m_la)) - cache_strategy = DELAYEDALLOC; + cache_strategy = TRYALLOC; else cache_strategy = DONTALLOC; - preload_compressed_pages(clt, MNGD_MAPPING(sbi), cache_strategy); + preload_compressed_pages(clt, MNGD_MAPPING(sbi), + cache_strategy, pagepool); hitted: /* @@ -1007,6 +1029,16 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, justfound = tagptr_unfold_tags(t); page = tagptr_unfold_ptr(t); + /* + * preallocated cached pages, which is used to avoid direct reclaim + * otherwise, it will go inplace I/O path instead. 
+ */ + if (page->private == Z_EROFS_PREALLOCATED_PAGE) { + WRITE_ONCE(pcl->compressed_pages[nr], page); + set_page_private(page, 0); + tocache = true; + goto out_tocache; + } mapping = READ_ONCE(page->mapping); /* @@ -1069,7 +1101,7 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, cond_resched(); goto repeat; } - +out_tocache: if (!tocache || add_to_page_cache_lru(page, mc, index + nr, gfp)) { /* turn into temporary page if fails (1 ref) */ set_page_private(page, Z_EROFS_SHORTLIVED_PAGE); @@ -1279,7 +1311,7 @@ static int z_erofs_readpage(struct file *file, struct page *page) f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; - err = z_erofs_do_read_page(&f, page); + err = z_erofs_do_read_page(&f, page, &pagepool); (void)z_erofs_collector_end(&f.clt); /* if some compressed cluster ready, need submit them anyway */ @@ -1342,7 +1374,7 @@ static int z_erofs_readpages(struct file *filp, struct address_space *mapping, /* traversal in reverse order */ head = (void *)page_private(page); - err = z_erofs_do_read_page(&f, page); + err = z_erofs_do_read_page(&f, page, &pagepool); if (err) erofs_err(inode->i_sb, "readahead error at page %lu @ nid %llu", From 55a37faecea0e46326035416fcfda54a5f9d7ba7 Mon Sep 17 00:00:00 2001 From: Huang Jianan Date: Tue, 16 Mar 2021 11:15:14 +0800 Subject: [PATCH 14/34] erofs: avoid memory allocation failure during rolling decompression Currently, a failed memory allocation would be treated as an I/O error. Therefore, it'd be better to ensure memory allocation during rolling decompression to avoid such I/O errors. In the long term, we might consider adding another !Uptodate case for such a case. Link: https://lore.kernel.org/r/20210316031515.90954-1-huangjianan@oppo.com Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Signed-off-by: Huang Jianan Signed-off-by: Guo Weichao Signed-off-by: Gao Xiang --- fs/erofs/decompressor.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index d2853876e91d..cebe62a0fb4e 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -72,9 +72,8 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, victim = availables[--top]; get_page(victim); } else { - victim = erofs_allocpage(pagepool, GFP_KERNEL); - if (!victim) - return -ENOMEM; + victim = erofs_allocpage(pagepool, + GFP_KERNEL | __GFP_NOFAIL); set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE); } rq->out[i] = victim; From 076924e0ba769d5da8903a4d8fb30472c04efab0 Mon Sep 17 00:00:00 2001 From: Huang Jianan Date: Wed, 17 Mar 2021 11:54:47 +0800 Subject: [PATCH 15/34] erofs: use workqueue decompression for atomic contexts only z_erofs_decompressqueue_endio may not be executed in the atomic context, for example, when dm-verity is turned on. In this scenario, data can be decompressed directly to get rid of additional kworker scheduling overhead.
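As a side note, the dispatch itself stays tiny; a hedged userspace model of z_erofs_decompress_kickoff() after this patch (C11 stdatomic stands in for the kernel atomics, and atomic_ctx models in_atomic() || irqs_disabled()):

	#include <stdatomic.h>
	#include <stdbool.h>
	#include <stdio.h>

	struct dqueue { atomic_int pending_bios; };

	static void decompress_inline(struct dqueue *io) { (void)io; puts("decompress inline"); }
	static void queue_to_worker(struct dqueue *io) { (void)io; puts("defer to kworker"); }

	static void kickoff(struct dqueue *io, bool atomic_ctx, int bios)
	{
		/* only the last completing bio kicks off the decompression */
		if (atomic_fetch_add(&io->pending_bios, bios) + bios)
			return;
		if (atomic_ctx)		/* cannot sleep here: defer to a worker */
			queue_to_worker(io);
		else			/* e.g. dm-verity endio: decompress directly */
			decompress_inline(io);
	}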
Link: https://lore.kernel.org/r/20210317035448.13921-2-huangjianan@oppo.com Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Signed-off-by: Huang Jianan Signed-off-by: Guo Weichao Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index d4cf12dbc683..086e28ec5a51 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -708,6 +708,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, goto out; } +static void z_erofs_decompressqueue_work(struct work_struct *work); static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, bool sync, int bios) { @@ -722,8 +723,14 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, return; } - if (!atomic_add_return(bios, &io->pending_bios)) + if (atomic_add_return(bios, &io->pending_bios)) + return; + /* Use workqueue decompression for atomic contexts only */ + if (in_atomic() || irqs_disabled()) { queue_work(z_erofs_workqueue, &io->u.work); + return; + } + z_erofs_decompressqueue_work(&io->u.work); } static bool z_erofs_page_is_invalidated(struct page *page) From 9b3398c32d3b89705afddb99d71374120f84d747 Mon Sep 17 00:00:00 2001 From: Huang Jianan Date: Wed, 17 Mar 2021 11:54:48 +0800 Subject: [PATCH 16/34] erofs: use sync decompression for atomic contexts only Sync decompression was introduced to get rid of additional kworker scheduling overhead. But there is no such overhead in non-atomic contexts. Therefore, it should be better to turn off sync decompression to avoid the current thread waiting in z_erofs_runqueue. Link: https://lore.kernel.org/r/20210317035448.13921-3-huangjianan@oppo.com Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Signed-off-by: Huang Jianan Signed-off-by: Guo Weichao Signed-off-by: Gao Xiang --- fs/erofs/internal.h | 3 +++ fs/erofs/super.c | 1 + fs/erofs/zdata.c | 8 ++++++-- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 5ee2368a18d5..eaa2ebcab926 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -54,6 +54,9 @@ struct erofs_sb_info { /* the dedicated workstation for compression */ struct radix_tree_root workstn_tree; + /* strategy of sync decompression (false - auto, true - force on) */ + bool readahead_sync_decompress; + /* threshold for decompression synchronously */ unsigned int max_sync_decompress_pages; diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 13977976c719..18e31aad02f1 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -239,6 +239,7 @@ static void erofs_default_options(struct erofs_sb_info *sbi) #ifdef CONFIG_EROFS_FS_ZIP sbi->cache_strategy = EROFS_ZIP_CACHE_READAROUND; sbi->max_sync_decompress_pages = 3; + sbi->readahead_sync_decompress = false; #endif #ifdef CONFIG_EROFS_FS_XATTR set_opt(sbi, XATTR_USER); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 086e28ec5a51..4dd180a1130c 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -712,6 +712,8 @@ static void z_erofs_decompressqueue_work(struct work_struct *work); static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, bool sync, int bios) { + struct erofs_sb_info *const sbi = EROFS_SB(io->sb); + /* wake up the caller thread for sync decompression */ if (sync) { unsigned long flags; @@ -725,9 +727,10 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, if (atomic_add_return(bios, &io->pending_bios)) return; - /* Use workqueue decompression for atomic contexts only */ + 
/* Use workqueue and sync decompression for atomic contexts only */ if (in_atomic() || irqs_disabled()) { queue_work(z_erofs_workqueue, &io->u.work); + sbi->readahead_sync_decompress = true; return; } z_erofs_decompressqueue_work(&io->u.work); @@ -1341,7 +1344,8 @@ static int z_erofs_readpages(struct file *filp, struct address_space *mapping, struct inode *const inode = mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); - bool sync = (nr_pages <= sbi->max_sync_decompress_pages); + bool sync = (sbi->readahead_sync_decompress && + nr_pages <= sbi->max_sync_decompress_pages); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); gfp_t gfp = mapping_gfp_constraint(mapping, GFP_KERNEL); struct page *head = NULL; From eb68e9eefb7231c8431ccd1aca6481c4f0360d72 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 22 Mar 2021 02:32:27 +0800 Subject: [PATCH 17/34] erofs: complete a missing case for inplace I/O Add a missing case which could cause unnecessary page allocation instead of directly using inplace I/O, which increases the runtime memory footprint. The detail is, considering an online file-backed page, the right half of the page is chosen to be cached (e.g. the end page of a readahead request) and some of its data doesn't exist in managed cache, so the pcluster will be definitely kept in the submission chain. (IOWs, it cannot be decompressed without I/O, e.g., due to the bypass queue). Currently, DELAYEDALLOC/TRYALLOC cases can be downgraded to NOINPLACE, which stops online pages from inplace I/O. After this patch, unneeded page allocations won't be observed in pickup_page_for_submission() anymore. Link: https://lore.kernel.org/r/20210321183227.5182-1-hsiangkao@aol.com Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 44 +++++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 15 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 4dd180a1130c..216b074e1b19 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -113,6 +113,12 @@ enum z_erofs_collectmode { * |_______PRIMARY_FOLLOWED_______|________PRIMARY_HOOKED___________| */ COLLECT_PRIMARY_HOOKED, + /* + * a weak form of COLLECT_PRIMARY_FOLLOWED, the difference is that it + * could be dispatched into bypass queue later due to up-to-date managed + * pages. All related online pages cannot be reused for inplace I/O (or + * pagevec) since it can be directly decoded without I/O submission.
+ */ COLLECT_PRIMARY_FOLLOWED_NOINPLACE, /* * The current collection has been linked with the owned chain, and @@ -195,21 +201,25 @@ static void preload_compressed_pages(struct z_erofs_collector *clt, if (page) { t = tag_compressed_page_justfound(page); - } else if (type == DELAYEDALLOC) { - t = tagptr_init(compressed_page_t, PAGE_UNALLOCATED); - } else if (type == TRYALLOC) { - newpage = erofs_allocpage(pagepool, gfp); - if (!newpage) - goto dontalloc; - - set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); - t = tag_compressed_page_justfound(newpage); - } else { /* DONTALLOC */ -dontalloc: - if (standalone) - clt->compressedpages = pages; + } else { + /* I/O is needed, not possible to decompress directly */ standalone = false; - continue; + switch (type) { + case DELAYEDALLOC: + t = tagptr_init(compressed_page_t, + PAGE_UNALLOCATED); + break; + case TRYALLOC: + newpage = erofs_allocpage(pagepool, gfp); + if (!newpage) + continue; + set_page_private(newpage, + Z_EROFS_PREALLOCATED_PAGE); + t = tag_compressed_page_justfound(newpage); + break; + default: /* DONTALLOC */ + continue; + } } if (!cmpxchg_relaxed(pages, NULL, tagptr_cast_ptr(t))) continue; @@ -223,7 +233,11 @@ static void preload_compressed_pages(struct z_erofs_collector *clt, } } - if (standalone) /* downgrade to PRIMARY_FOLLOWED_NOINPLACE */ + /* + * don't do inplace I/O if all compressed pages are available in + * managed cache since it can be moved to the bypass queue instead. + */ + if (standalone) clt->mode = COLLECT_PRIMARY_FOLLOWED_NOINPLACE; } From b0a1e9660543a54467461a53c708cef93c7f3e18 Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Thu, 25 Mar 2021 15:10:08 +0800 Subject: [PATCH 18/34] erofs: don't use erofs_map_blocks() any more Currently, erofs_map_blocks() will be called only from erofs_{bmap, read_raw_page}, which are both for uncompressed files. So, the compression branch in erofs_map_blocks() is pointless. Let's remove it and use erofs_map_blocks_flatmode() directly. Also update related comments.
Link: https://lore.kernel.org/r/20210325071008.573-1-zbestahu@gmail.com Reviewed-by: Chao Yu Signed-off-by: Yue Hu Signed-off-by: Gao Xiang --- fs/erofs/data.c | 19 ++----------------- fs/erofs/internal.h | 6 ++---- 2 files changed, 4 insertions(+), 21 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 4f305b68497e..dc264a819dc2 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -108,21 +108,6 @@ static int erofs_map_blocks_flatmode(struct inode *inode, return err; } -int erofs_map_blocks(struct inode *inode, - struct erofs_map_blocks *map, int flags) -{ - if (erofs_inode_is_data_compressed(EROFS_I(inode)->datalayout)) { - int err = z_erofs_map_blocks_iter(inode, map, flags); - - if (map->mpage) { - put_page(map->mpage); - map->mpage = NULL; - } - return err; - } - return erofs_map_blocks_flatmode(inode, map, flags); -} - static inline struct bio *erofs_read_raw_page(struct bio *bio, struct address_space *mapping, struct page *page, @@ -158,7 +143,7 @@ static inline struct bio *erofs_read_raw_page(struct bio *bio, erofs_blk_t blknr; unsigned int blkoff; - err = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); + err = erofs_map_blocks_flatmode(inode, &map, EROFS_GET_BLOCKS_RAW); if (err) goto err_out; @@ -336,7 +321,7 @@ static sector_t erofs_bmap(struct address_space *mapping, sector_t block) return 0; } - if (!erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW)) + if (!erofs_map_blocks_flatmode(inode, &map, EROFS_GET_BLOCKS_RAW)) return erofs_blknr(map.m_pa); return 0; diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index eaa2ebcab926..ef55cfae4cd6 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -284,7 +284,7 @@ extern const struct address_space_operations erofs_raw_access_aops; extern const struct address_space_operations z_erofs_aops; /* - * Logical to physical block mapping, used by erofs_map_blocks() + * Logical to physical block mapping * * Different with other file systems, it is used for 2 access modes: * @@ -331,7 +331,7 @@ struct erofs_map_blocks { struct page *mpage; }; -/* Flags used by erofs_map_blocks() */ +/* Flags used by erofs_map_blocks_flatmode() */ #define EROFS_GET_BLOCKS_RAW 0x0001 /* zmap.c */ @@ -353,8 +353,6 @@ static inline int z_erofs_map_blocks_iter(struct inode *inode, /* data.c */ struct page *erofs_get_meta_page(struct super_block *sb, erofs_blk_t blkaddr); -int erofs_map_blocks(struct inode *, struct erofs_map_blocks *, int); - /* inode.c */ static inline unsigned long erofs_inode_hash(erofs_nid_t nid) { From 3877bc67a9c285a97653a7e297c718d9f4d95722 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 29 Mar 2021 09:23:05 +0800 Subject: [PATCH 19/34] erofs: introduce erofs_sb_has_xxx() helpers Introduce erofs_sb_has_xxx() to make long checks short, especially for later big pcluster & LZMA features. 
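For reference, each EROFS_FEATURE_FUNCS() instance merely token-pastes a bitmask test; a self-contained sketch (flag values as defined in erofs_fs.h, the superblock info structure trimmed down to its two feature words):

	#include <stdbool.h>
	#include <stdio.h>

	#define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING	0x00000001
	#define EROFS_FEATURE_COMPAT_SB_CHKSUM		0x00000001

	struct erofs_sb_info {
		unsigned int feature_compat;
		unsigned int feature_incompat;
	};

	/* same generator as in the internal.h hunk below */
	#define EROFS_FEATURE_FUNCS(name, compat, feature) \
	static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \
	{ \
		return sbi->feature_##compat & EROFS_FEATURE_##feature; \
	}

	EROFS_FEATURE_FUNCS(lz4_0padding, incompat, INCOMPAT_LZ4_0PADDING)
	EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM)

	int main(void)
	{
		struct erofs_sb_info sbi = {
			.feature_incompat = EROFS_FEATURE_INCOMPAT_LZ4_0PADDING,
		};

		/* prints "1 0": 0padding is set, superblock checksum is not */
		printf("%d %d\n", erofs_sb_has_lz4_0padding(&sbi),
		       erofs_sb_has_sb_chksum(&sbi));
		return 0;
	}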
Link: https://lore.kernel.org/r/20210329012308.28743-2-hsiangkao@aol.com Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/decompressor.c | 3 +-- fs/erofs/internal.h | 9 +++++++++ fs/erofs/super.c | 2 +- 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index cebe62a0fb4e..510b8b3b5271 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -123,8 +123,7 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out) support_0padding = false; /* decompression inplace is only safe when 0padding is enabled */ - if (EROFS_SB(rq->sb)->feature_incompat & - EROFS_FEATURE_INCOMPAT_LZ4_0PADDING) { + if (erofs_sb_has_lz4_0padding(EROFS_SB(rq->sb))) { support_0padding = true; while (!src[inputmargin & ~PAGE_MASK]) diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index ef55cfae4cd6..aa1a7ce8c1aa 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -213,6 +213,15 @@ static inline erofs_off_t iloc(struct erofs_sb_info *sbi, erofs_nid_t nid) return blknr_to_addr(sbi->meta_blkaddr) + (nid << sbi->islotbits); } +#define EROFS_FEATURE_FUNCS(name, compat, feature) \ +static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \ +{ \ + return sbi->feature_##compat & EROFS_FEATURE_##feature; \ +} + +EROFS_FEATURE_FUNCS(lz4_0padding, incompat, INCOMPAT_LZ4_0PADDING) +EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) + /* atomic flag definitions */ #define EROFS_I_EA_INITED_BIT 0 #define EROFS_I_Z_INITED_BIT 1 diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 18e31aad02f1..6cac5d1489b6 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -152,7 +152,7 @@ static int erofs_read_superblock(struct super_block *sb) } sbi->feature_compat = le32_to_cpu(dsb->feature_compat); - if (sbi->feature_compat & EROFS_FEATURE_COMPAT_SB_CHKSUM) { + if (erofs_sb_has_sb_chksum(sbi)) { ret = erofs_superblock_csum_verify(sb, data); if (ret) goto out; From c4040ba892191f981753fa6759ea6584a372ec1a Mon Sep 17 00:00:00 2001 From: Huang Jianan Date: Mon, 29 Mar 2021 09:23:06 +0800 Subject: [PATCH 20/34] erofs: support adjusted lz4 history window size lz4 uses LZ4_DISTANCE_MAX to record history preservation. When using rolling decompression, a block with a higher compression ratio will cause a larger memory allocation (up to 64k). It may cause a large resource burden in extreme cases on devices with small memory and a large number of concurrent IOs. So appropriately reducing this value can improve performance. Decreasing this value will reduce the compression ratio (except when input_size is smaller than the adjusted window size). Signed-off-by: Huang Jianan Signed-off-by: Guo Weichao [ Gao Xiang: introduce struct erofs_sb_lz4_info for configurations. ] Signed-off-by: Gao Xiang --- fs/erofs/decompressor.c | 21 +++++++++++++++++---- fs/erofs/erofs_fs.h | 4 +++- fs/erofs/internal.h | 19 +++++++++++++++++++ fs/erofs/super.c | 4 +++- 4 files changed, 42 insertions(+), 6 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 510b8b3b5271..8ee4fa6ecd20 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -27,6 +27,17 @@ struct z_erofs_decompressor { char *name; }; +int z_erofs_load_lz4_config(struct super_block *sb, + struct erofs_super_block *dsb) +{ + u16 distance = le16_to_cpu(dsb->lz4_max_distance); + + EROFS_SB(sb)->lz4.max_distance_pages = distance ?
+ DIV_ROUND_UP(distance, PAGE_SIZE) + 1 : + LZ4_MAX_DISTANCE_PAGES; + return 0; +} + static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, struct list_head *pagepool) { @@ -35,6 +46,8 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL }; unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES, BITS_PER_LONG)] = { 0 }; + unsigned int lz4_max_distance_pages = + EROFS_SB(rq->sb)->lz4.max_distance_pages; void *kaddr = NULL; unsigned int i, j, top; @@ -43,14 +56,14 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, struct page *const page = rq->out[i]; struct page *victim; - if (j >= LZ4_MAX_DISTANCE_PAGES) + if (j >= lz4_max_distance_pages) j = 0; /* 'valid' bounced can only be tested after a complete round */ if (test_bit(j, bounced)) { - DBG_BUGON(i < LZ4_MAX_DISTANCE_PAGES); - DBG_BUGON(top >= LZ4_MAX_DISTANCE_PAGES); - availables[top++] = rq->out[i - LZ4_MAX_DISTANCE_PAGES]; + DBG_BUGON(i < lz4_max_distance_pages); + DBG_BUGON(top >= lz4_max_distance_pages); + availables[top++] = rq->out[i - lz4_max_distance_pages]; } if (page) { diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 011c3e281f7f..a1b212f6812b 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -39,7 +39,9 @@ struct erofs_super_block { __u8 uuid[16]; /* 128-bit uuid for volume */ __u8 volume_name[16]; /* volume name */ __le32 feature_incompat; - __u8 reserved2[44]; + /* customized lz4 sliding window size instead of 64k by default */ + __le16 lz4_max_distance; + __u8 reserved2[42]; }; /* diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index aa1a7ce8c1aa..4c15f0b8261e 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -45,6 +45,12 @@ typedef u64 erofs_off_t; /* data type for filesystem-wide blocks number */ typedef u32 erofs_blk_t; +/* all filesystem-wide lz4 configurations */ +struct erofs_sb_lz4_info { + /* # of pages needed for EROFS lz4 rolling decompression */ + u16 max_distance_pages; +}; + struct erofs_sb_info { #ifdef CONFIG_EROFS_FS_ZIP /* list for all registered superblocks, mainly for shrinker */ @@ -67,6 +73,8 @@ struct erofs_sb_info { /* pseudo inode to manage cached pages */ struct inode *managed_cache; + + struct erofs_sb_lz4_info lz4; #endif /* CONFIG_EROFS_FS_ZIP */ u32 blocks; u32 meta_blkaddr; @@ -424,6 +432,8 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, struct erofs_workgroup *egrp); int erofs_try_to_free_cached_page(struct address_space *mapping, struct page *page); +int z_erofs_load_lz4_config(struct super_block *sb, + struct erofs_super_block *dsb); #else static inline void erofs_shrinker_register(struct super_block *sb) {} static inline void erofs_shrinker_unregister(struct super_block *sb) {} @@ -431,6 +441,15 @@ static inline int erofs_init_shrinker(void) { return 0; } static inline void erofs_exit_shrinker(void) {} static inline int z_erofs_init_zip_subsystem(void) { return 0; } static inline void z_erofs_exit_zip_subsystem(void) {} +static inline int z_erofs_load_lz4_config(struct super_block *sb, + struct erofs_super_block *dsb) +{ + if (dsb->lz4_max_distance) { + erofs_err(sb, "lz4 algorithm isn't enabled"); + return -EINVAL; + } + return 0; +} #endif /* !CONFIG_EROFS_FS_ZIP */ #define EFSCORRUPTED EUCLEAN /* Filesystem is corrupted */ diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 6cac5d1489b6..599463632bd3 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -191,7 +191,9 @@ static int 
erofs_read_superblock(struct super_block *sb) ret = -EFSCORRUPTED; goto out; } - ret = 0; + + /* parse on-disk compression configurations */ + ret = z_erofs_load_lz4_config(sb, dsb); out: kunmap(page); put_page(page); From 4c219cefe47c98e54dd9c83822d02d0a2b6bcfce Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 29 Mar 2021 09:23:07 +0800 Subject: [PATCH 21/34] erofs: introduce on-disk lz4 fs configurations Introduce z_erofs_lz4_cfgs to store all lz4 configurations. Currently it's only max_distance, but will be used for new features later. Link: https://lore.kernel.org/r/20210329012308.28743-4-hsiangkao@aol.com Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/decompressor.c | 15 +++++++++++++-- fs/erofs/erofs_fs.h | 6 ++++++ fs/erofs/internal.h | 8 +++++--- fs/erofs/super.c | 2 +- 4 files changed, 25 insertions(+), 6 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 8ee4fa6ecd20..4a8a3779f9cd 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -28,9 +28,20 @@ struct z_erofs_decompressor { }; int z_erofs_load_lz4_config(struct super_block *sb, - struct erofs_super_block *dsb) + struct erofs_super_block *dsb, + struct z_erofs_lz4_cfgs *lz4, int size) { - u16 distance = le16_to_cpu(dsb->lz4_max_distance); + u16 distance; + + if (lz4) { + if (size < sizeof(struct z_erofs_lz4_cfgs)) { + erofs_err(sb, "invalid lz4 cfgs, size=%u", size); + return -EINVAL; + } + distance = le16_to_cpu(lz4->max_distance); + } else { + distance = le16_to_cpu(dsb->lz4_max_distance); + } EROFS_SB(sb)->lz4.max_distance_pages = distance ? DIV_ROUND_UP(distance, PAGE_SIZE) + 1 : diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index a1b212f6812b..383cc5c8f98b 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -200,6 +200,12 @@ enum { Z_EROFS_COMPRESSION_MAX }; +/* 14 bytes (+ length field = 16 bytes) */ +struct z_erofs_lz4_cfgs { + __le16 max_distance; + u8 reserved[12]; +} __packed; + /* * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on) * e.g. 
for 4k logical cluster size, 4B if compacted 2B is off; diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 4c15f0b8261e..5785a0631a1d 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -433,7 +433,8 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, int erofs_try_to_free_cached_page(struct address_space *mapping, struct page *page); int z_erofs_load_lz4_config(struct super_block *sb, - struct erofs_super_block *dsb); + struct erofs_super_block *dsb, + struct z_erofs_lz4_cfgs *lz4, int len); #else static inline void erofs_shrinker_register(struct super_block *sb) {} static inline void erofs_shrinker_unregister(struct super_block *sb) {} @@ -442,9 +443,10 @@ static inline void erofs_exit_shrinker(void) {} static inline int z_erofs_init_zip_subsystem(void) { return 0; } static inline void z_erofs_exit_zip_subsystem(void) {} static inline int z_erofs_load_lz4_config(struct super_block *sb, - struct erofs_super_block *dsb) + struct erofs_super_block *dsb, + struct z_erofs_lz4_cfgs *lz4, int len) { - if (dsb->lz4_max_distance) { + if (lz4 || dsb->lz4_max_distance) { erofs_err(sb, "lz4 algorithm isn't enabled"); return -EINVAL; } diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 599463632bd3..495a4e2b720d 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -193,7 +193,7 @@ static int erofs_read_superblock(struct super_block *sb) } /* parse on-disk compression configurations */ - ret = z_erofs_load_lz4_config(sb, dsb); + ret = z_erofs_load_lz4_config(sb, dsb, NULL, 0); out: kunmap(page); put_page(page); From 16c996732f31abd4e5f74c7225c4635203c7df07 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Mon, 29 Mar 2021 18:00:12 +0800 Subject: [PATCH 22/34] erofs: add on-disk compression configurations Add a bitmap for available compression algorithms and a variable-sized on-disk table for compression options in preparation for upcoming big pcluster and LZMA algorithm, which follows the end of super block. To parse the compression options, the bitmap is scanned one by one. For each available algorithm, there is data followed by 2-byte `length' correspondingly (it's enough for most cases, or entire fs blocks should be used.) With such available algorithm bitmap, kernel itself can also refuse to mount such filesystem if any unsupported compression algorithm exists. Note that COMPR_CFGS feature will be enabled with BIG_PCLUSTER. Link: https://lore.kernel.org/r/20210329100012.12980-1-hsiangkao@aol.com Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/decompressor.c | 2 +- fs/erofs/erofs_fs.h | 16 +++-- fs/erofs/internal.h | 5 +- fs/erofs/super.c | 141 +++++++++++++++++++++++++++++++++++++++- 4 files changed, 157 insertions(+), 7 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 4a8a3779f9cd..6e7a74ede63b 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -40,7 +40,7 @@ int z_erofs_load_lz4_config(struct super_block *sb, } distance = le16_to_cpu(lz4->max_distance); } else { - distance = le16_to_cpu(dsb->lz4_max_distance); + distance = le16_to_cpu(dsb->u1.lz4_max_distance); } EROFS_SB(sb)->lz4.max_distance_pages = distance ? diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 383cc5c8f98b..39e24fda8b95 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -18,15 +18,18 @@ * be incompatible with this kernel version. 
*/ #define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING 0x00000001 +#define EROFS_FEATURE_INCOMPAT_COMPR_CFGS 0x00000002 #define EROFS_ALL_FEATURE_INCOMPAT EROFS_FEATURE_INCOMPAT_LZ4_0PADDING -/* 128-byte erofs on-disk super block */ +#define EROFS_SB_EXTSLOT_SIZE 16 + +/* erofs on-disk super block (currently 128 bytes) */ struct erofs_super_block { __le32 magic; /* file system magic number */ __le32 checksum; /* crc32c(super_block) */ __le32 feature_compat; __u8 blkszbits; /* support block_size == PAGE_SIZE only */ - __u8 reserved; + __u8 sb_extslots; /* superblock size = 128 + sb_extslots * 16 */ __le16 root_nid; /* nid of root directory */ __le64 inos; /* total valid ino # (== f_files - f_favail) */ @@ -39,8 +42,12 @@ struct erofs_super_block { __u8 uuid[16]; /* 128-bit uuid for volume */ __u8 volume_name[16]; /* volume name */ __le32 feature_incompat; - /* customized lz4 sliding window size instead of 64k by default */ - __le16 lz4_max_distance; + union { + /* bitmap for available compression algorithms */ + __le16 available_compr_algs; + /* customized sliding window size instead of 64k by default */ + __le16 lz4_max_distance; + } __packed u1; __u8 reserved2[42]; }; @@ -199,6 +206,7 @@ enum { Z_EROFS_COMPRESSION_LZ4 = 0, Z_EROFS_COMPRESSION_MAX }; +#define Z_EROFS_ALL_COMPR_ALGS (1 << (Z_EROFS_COMPRESSION_MAX - 1)) /* 14 bytes (+ length field = 16 bytes) */ struct z_erofs_lz4_cfgs { diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 5785a0631a1d..1dc54ef5f9ba 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -67,6 +67,7 @@ struct erofs_sb_info { unsigned int max_sync_decompress_pages; unsigned int shrinker_run_no; + u16 available_compr_algs; /* current strategy of how to use managed cache */ unsigned char cache_strategy; @@ -85,6 +86,7 @@ struct erofs_sb_info { /* inode slot unit size in bit shift */ unsigned char islotbits; + u32 sb_size; /* total superblock size */ u32 build_time_nsec; u64 build_time; @@ -228,6 +230,7 @@ static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \ } EROFS_FEATURE_FUNCS(lz4_0padding, incompat, INCOMPAT_LZ4_0PADDING) +EROFS_FEATURE_FUNCS(compr_cfgs, incompat, INCOMPAT_COMPR_CFGS) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) /* atomic flag definitions */ @@ -446,7 +449,7 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, struct z_erofs_lz4_cfgs *lz4, int len) { - if (lz4 || dsb->lz4_max_distance) { + if (lz4 || dsb->u1.lz4_max_distance) { erofs_err(sb, "lz4 algorithm isn't enabled"); return -EINVAL; } diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 495a4e2b720d..ba63008b2c28 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -125,6 +125,136 @@ static bool check_layout_compatibility(struct super_block *sb, return true; } +#ifdef CONFIG_EROFS_FS_ZIP +/* read variable-sized metadata, offset will be aligned by 4-byte */ +static void *erofs_read_metadata(struct super_block *sb, struct page **pagep, + erofs_off_t *offset, int *lengthp) +{ + struct page *page = *pagep; + u8 *buffer, *ptr; + int len, i, cnt; + erofs_blk_t blk; + + *offset = round_up(*offset, 4); + blk = erofs_blknr(*offset); + + if (!page || page->index != blk) { + if (page) { + unlock_page(page); + put_page(page); + } + page = erofs_get_meta_page(sb, blk); + if (IS_ERR(page)) + goto err_nullpage; + } + + ptr = kmap(page); + len = le16_to_cpu(*(__le16 *)&ptr[erofs_blkoff(*offset)]); + if (!len) + len = U16_MAX + 1; + buffer = kmalloc(len, GFP_KERNEL); + if (!buffer) { + buffer = ERR_PTR(-ENOMEM); + goto 
out; + } + *offset += sizeof(__le16); + *lengthp = len; + + for (i = 0; i < len; i += cnt) { + cnt = min(EROFS_BLKSIZ - (int)erofs_blkoff(*offset), len - i); + blk = erofs_blknr(*offset); + + if (!page || page->index != blk) { + if (page) { + kunmap(page); + unlock_page(page); + put_page(page); + } + page = erofs_get_meta_page(sb, blk); + if (IS_ERR(page)) { + kfree(buffer); + goto err_nullpage; + } + ptr = kmap(page); + } + memcpy(buffer + i, ptr + erofs_blkoff(*offset), cnt); + *offset += cnt; + } +out: + kunmap(page); + *pagep = page; + return buffer; +err_nullpage: + *pagep = NULL; + return page; +} + +static int erofs_load_compr_cfgs(struct super_block *sb, + struct erofs_super_block *dsb) +{ + struct erofs_sb_info *sbi; + struct page *page; + unsigned int algs, alg; + erofs_off_t offset; + int size, ret; + + sbi = EROFS_SB(sb); + sbi->available_compr_algs = le16_to_cpu(dsb->u1.available_compr_algs); + + if (sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS) { + erofs_err(sb, "try to load compressed fs with unsupported algorithms %x", + sbi->available_compr_algs & ~Z_EROFS_ALL_COMPR_ALGS); + return -EINVAL; + } + + offset = EROFS_SUPER_OFFSET + sbi->sb_size; + page = NULL; + alg = 0; + ret = 0; + + for (algs = sbi->available_compr_algs; algs; algs >>= 1, ++alg) { + void *data; + + if (!(algs & 1)) + continue; + + data = erofs_read_metadata(sb, &page, &offset, &size); + if (IS_ERR(data)) { + ret = PTR_ERR(data); + goto err; + } + + switch (alg) { + case Z_EROFS_COMPRESSION_LZ4: + ret = z_erofs_load_lz4_config(sb, dsb, data, size); + break; + default: + DBG_BUGON(1); + ret = -EFAULT; + } + kfree(data); + if (ret) + goto err; + } +err: + if (page) { + unlock_page(page); + put_page(page); + } + return ret; +} +#else +static int erofs_load_compr_cfgs(struct super_block *sb, + struct erofs_super_block *dsb) +{ + if (dsb->u1.available_compr_algs) { + erofs_err(sb, "try to load compressed fs when compression is disabled"); + return -EINVAL; + } + return 0; +} +#endif + static int erofs_read_superblock(struct super_block *sb) { struct erofs_sb_info *sbi; @@ -170,6 +300,12 @@ static int erofs_read_superblock(struct super_block *sb) if (!check_layout_compatibility(sb, dsb)) goto out; + sbi->sb_size = 128 + dsb->sb_extslots * EROFS_SB_EXTSLOT_SIZE; + if (sbi->sb_size > EROFS_BLKSIZ) { + erofs_err(sb, "invalid sb_extslots %u (more than a fs block)", + sbi->sb_size); + goto out; + } sbi->blocks = le32_to_cpu(dsb->blocks); sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr); #ifdef CONFIG_EROFS_FS_XATTR @@ -193,7 +329,10 @@ static int erofs_read_superblock(struct super_block *sb) } /* parse on-disk compression configurations */ - ret = z_erofs_load_lz4_config(sb, dsb, NULL, 0); + if (erofs_sb_has_compr_cfgs(sbi)) + ret = erofs_load_compr_cfgs(sb, dsb); + else + ret = z_erofs_load_lz4_config(sb, dsb, NULL, 0); out: kunmap(page); put_page(page); From c9996699b46105476c14ca1adeb8b2fa096b1abe Mon Sep 17 00:00:00 2001 From: Ruiqi Gong Date: Wed, 31 Mar 2021 05:39:20 -0400 Subject: [PATCH 23/34] erofs: Clean up spelling mistakes found in fs/erofs zmap.c: s/correspoinding/corresponding zdata.c: s/endding/ending Link: https://lore.kernel.org/r/20210331093920.31923-1-gongruiqi1@huawei.com Reported-by: Hulk Robot Signed-off-by: Ruiqi Gong Reviewed-by: Gao Xiang Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 2 +- fs/erofs/zmap.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 216b074e1b19..477163b69c04 100644 --- a/fs/erofs/zdata.c +++ 
b/fs/erofs/zdata.c @@ -935,7 +935,7 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, }, pagepool); out: - /* must handle all compressed pages before endding pages */ + /* must handle all compressed pages before ending pages */ for (i = 0; i < clusterpages; ++i) { page = compressed_pages[i]; diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index a13f69cf4026..bc3cfb6fb957 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -442,7 +442,7 @@ int z_erofs_map_blocks_iter(struct inode *inode, m.delta[0] = 1; /* fallthrough */ case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: - /* get the correspoinding first chunk */ + /* get the corresponding first chunk */ err = z_erofs_extent_lookback(&m, m.delta[0]); if (err) goto unmap_out; From 5eb979af027198bf0ce3838ccfd55d878ed5fae2 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 7 Apr 2021 12:39:18 +0800 Subject: [PATCH 24/34] erofs: reserve physical_clusterbits[] The formal big pcluster design is actually more powerful / flexible than previously thought: pclustersize used to be fixed as power-of-2 blocks, which was obviously inefficient and space-wasting. Instead, pclustersize can now be set independently for each pcluster, so various pcluster sizes can also be used together in one file if mkfs wants (for example, according to data type and/or compression ratio). Let's get rid of the previous physical_clusterbits[] setting (also notice that the corresponding on-disk fields are still 0 for now). Therefore, head1/2 can be used for at most 2 different algorithms in one file and, again, pclustersize is now independent of these. Link: https://lore.kernel.org/r/20210407043927.10623-2-xiang@kernel.org Acked-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/erofs_fs.h | 4 +--- fs/erofs/internal.h | 1 - fs/erofs/zdata.c | 3 +-- fs/erofs/zmap.c | 15 --------------- 4 files changed, 2 insertions(+), 21 deletions(-) diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 39e24fda8b95..04010fb6b370 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -233,9 +233,7 @@ struct z_erofs_map_header { __u8 h_algorithmtype; /* * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096; - * bit 3-4 : (physical - logical) cluster bits of head 1: - * For example, if logical clustersize = 4096, 1 for 8192. - * bit 5-7 : (physical - logical) cluster bits of head 2. + * bit 3-7 : reserved.
*/ __u8 h_clusterbits; }; diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 1dc54ef5f9ba..821d910cd1cc 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -261,7 +261,6 @@ struct erofs_inode { unsigned short z_advise; unsigned char z_algorithmtype[2]; unsigned char z_logical_clusterbits; - unsigned char z_physical_clusterbits[2]; }; #endif /* CONFIG_EROFS_FS_ZIP */ }; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 477163b69c04..f93a23cfe257 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -451,8 +451,7 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, else pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; - pcl->clusterbits = EROFS_I(inode)->z_physical_clusterbits[0]; - pcl->clusterbits -= PAGE_SHIFT; + pcl->clusterbits = 0; /* new pclusters should be claimed as type 1, primary and followed */ pcl->next = clt->owned_head; diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index bc3cfb6fb957..4f504708c558 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -16,11 +16,8 @@ int z_erofs_fill_inode(struct inode *inode) vi->z_algorithmtype[0] = 0; vi->z_algorithmtype[1] = 0; vi->z_logical_clusterbits = LOG_BLOCK_SIZE; - vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits; - vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits; set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); } - inode->i_mapping->a_ops = &z_erofs_aops; return 0; } @@ -76,18 +73,6 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) } vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7); - vi->z_physical_clusterbits[0] = vi->z_logical_clusterbits + - ((h->h_clusterbits >> 3) & 3); - - if (vi->z_physical_clusterbits[0] != LOG_BLOCK_SIZE) { - erofs_err(sb, "unsupported physical clusterbits %u for nid %llu, please upgrade kernel", - vi->z_physical_clusterbits[0], vi->nid); - err = -EOPNOTSUPP; - goto unmap_done; - } - - vi->z_physical_clusterbits[1] = vi->z_logical_clusterbits + - ((h->h_clusterbits >> 5) & 7); /* paired with smp_mb() at the beginning of the function */ smp_mb(); set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); From 092ced045a27369516a938aa862904f4405d8d69 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sat, 10 Apr 2021 03:06:30 +0800 Subject: [PATCH 25/34] erofs: introduce multipage per-CPU buffers To deal with the cases in which inplace decompression is infeasible for some inplace I/O, per-CPU buffers were introduced to get rid of page allocation latency and thrashing for low-latency decompression algorithms such as lz4. For the big pcluster feature, introduce multipage per-CPU buffers to keep such inplace I/O pclusters temporarily as well, but note that such per-CPU pages are only virtually consecutive. When a new big pcluster fs is mounted, its max pclustersize will be read and the per-CPU buffers can be grown if needed. Shrinking adjustable per-CPU buffers is more complex (because we don't know whether such a size is still in use), so currently just release them all when unloading.
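[ Editor's note: a minimal usage sketch of the per-CPU buffer interface added by this patch; example_use_pcpubuf() is a hypothetical caller, not part of the patch itself. erofs_get_pcpubuf() returns the buffer with pcb->lock held (or NULL if the buffer has not been grown far enough), so every successful call must be paired with erofs_put_pcpubuf(): ]

static int example_use_pcpubuf(unsigned int nrpages)
{
	/* takes the per-CPU buffer lock and disables preemption */
	void *dst = erofs_get_pcpubuf(nrpages);

	if (!dst)	/* erofs_pcpubuf_growsize() hasn't reached nrpages yet */
		return -EOPNOTSUPP;
	/* ... decompress into the virtually consecutive buffer here ... */
	erofs_put_pcpubuf(dst);	/* drops the lock again */
	return 0;
}

A mount path would call erofs_pcpubuf_growsize() first so that later erofs_get_pcpubuf() calls can succeed for larger pclusters.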
Link: https://lore.kernel.org/r/20210409190630.19569-1-xiang@kernel.org Acked-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/Makefile | 2 +- fs/erofs/decompressor.c | 8 ++- fs/erofs/internal.h | 25 ++----- fs/erofs/pcpubuf.c | 148 ++++++++++++++++++++++++++++++++++++++++ fs/erofs/super.c | 2 + fs/erofs/utils.c | 12 ---- 6 files changed, 163 insertions(+), 34 deletions(-) create mode 100644 fs/erofs/pcpubuf.c diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index af159539fc1b..1f9aced49070 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -1,6 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_EROFS_FS) += erofs.o -erofs-objs := super.o inode.o data.o namei.o dir.o utils.o +erofs-objs := super.o inode.o data.o namei.o dir.o utils.o pcpubuf.o erofs-$(CONFIG_EROFS_FS_XATTR) += xattr.o erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 6e7a74ede63b..7292b1b130c6 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -46,7 +46,9 @@ int z_erofs_load_lz4_config(struct super_block *sb, EROFS_SB(sb)->lz4.max_distance_pages = distance ? DIV_ROUND_UP(distance, PAGE_SIZE) + 1 : LZ4_MAX_DISTANCE_PAGES; - return 0; + + /* TODO: use max pclusterblks after bigpcluster is enabled */ + return erofs_pcpubuf_growsize(1); } static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, @@ -113,7 +115,7 @@ static void *generic_copy_inplace_data(struct z_erofs_decompress_req *rq, * pages should be copied in order to avoid being overlapped. */ struct page **in = rq->in; - u8 *const tmp = erofs_get_pcpubuf(0); + u8 *const tmp = erofs_get_pcpubuf(1); u8 *tmpp = tmp; unsigned int inlen = rq->inputsize - pageofs_in; unsigned int count = min_t(uint, inlen, PAGE_SIZE - pageofs_in); @@ -269,7 +271,7 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq, * compressed data is preferred. 
*/ if (rq->outputsize <= PAGE_SIZE * 7 / 8) { - dst = erofs_get_pcpubuf(0); + dst = erofs_get_pcpubuf(1); if (IS_ERR(dst)) return PTR_ERR(dst); diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 821d910cd1cc..c88daf2b10c3 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -192,9 +192,6 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) /* hard limit of pages per compressed cluster */ #define Z_EROFS_CLUSTER_MAX_PAGES (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT) -#define EROFS_PCPUBUF_NR_PAGES Z_EROFS_CLUSTER_MAX_PAGES -#else -#define EROFS_PCPUBUF_NR_PAGES 0 #endif /* !CONFIG_EROFS_FS_ZIP */ /* we strictly follow PAGE_SIZE and no buffer head yet */ @@ -399,24 +396,16 @@ int erofs_namei(struct inode *dir, struct qstr *name, /* dir.c */ extern const struct file_operations erofs_dir_fops; +/* pcpubuf.c */ +void *erofs_get_pcpubuf(unsigned int requiredpages); +void erofs_put_pcpubuf(void *ptr); +int erofs_pcpubuf_growsize(unsigned int nrpages); +void erofs_pcpubuf_init(void); +void erofs_pcpubuf_exit(void); + /* utils.c / zdata.c */ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp); -#if (EROFS_PCPUBUF_NR_PAGES > 0) -void *erofs_get_pcpubuf(unsigned int pagenr); -#define erofs_put_pcpubuf(buf) do { \ - (void)&(buf); \ - preempt_enable(); \ -} while (0) -#else -static inline void *erofs_get_pcpubuf(unsigned int pagenr) -{ - return ERR_PTR(-EOPNOTSUPP); -} - -#define erofs_put_pcpubuf(buf) do {} while (0) -#endif - #ifdef CONFIG_EROFS_FS_ZIP int erofs_workgroup_put(struct erofs_workgroup *grp); struct erofs_workgroup *erofs_find_workgroup(struct super_block *sb, diff --git a/fs/erofs/pcpubuf.c b/fs/erofs/pcpubuf.c new file mode 100644 index 000000000000..6c885575128a --- /dev/null +++ b/fs/erofs/pcpubuf.c @@ -0,0 +1,148 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) Gao Xiang + * + * For low-latency decompression algorithms (e.g. lz4), reserve consecutive + * per-CPU virtual memory (in pages) in advance to store such inplace I/O + * data if inplace decompression is failed (due to unmet inplace margin for + * example). 
+ */ +#include "internal.h" + +struct erofs_pcpubuf { + raw_spinlock_t lock; + void *ptr; + struct page **pages; + unsigned int nrpages; +}; + +static DEFINE_PER_CPU(struct erofs_pcpubuf, erofs_pcb); + +void *erofs_get_pcpubuf(unsigned int requiredpages) + __acquires(pcb->lock) +{ + struct erofs_pcpubuf *pcb = &get_cpu_var(erofs_pcb); + + raw_spin_lock(&pcb->lock); + /* check if the per-CPU buffer is too small */ + if (requiredpages > pcb->nrpages) { + raw_spin_unlock(&pcb->lock); + put_cpu_var(erofs_pcb); + /* (for sparse checker) pretend pcb->lock is still taken */ + __acquire(pcb->lock); + return NULL; + } + return pcb->ptr; +} + +void erofs_put_pcpubuf(void *ptr) __releases(pcb->lock) +{ + struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, smp_processor_id()); + + DBG_BUGON(pcb->ptr != ptr); + raw_spin_unlock(&pcb->lock); + put_cpu_var(erofs_pcb); +} + +/* the next step: support per-CPU page buffers hotplug */ +int erofs_pcpubuf_growsize(unsigned int nrpages) +{ + static DEFINE_MUTEX(pcb_resize_mutex); + static unsigned int pcb_nrpages; + LIST_HEAD(pagepool); + int delta, cpu, ret, i; + + mutex_lock(&pcb_resize_mutex); + delta = nrpages - pcb_nrpages; + ret = 0; + /* avoid shrinking pcpubuf, since no idea how many fses rely on */ + if (delta <= 0) + goto out; + + for_each_possible_cpu(cpu) { + struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu); + struct page **pages, **oldpages; + void *ptr, *old_ptr; + + pages = kmalloc_array(nrpages, sizeof(*pages), GFP_KERNEL); + if (!pages) { + ret = -ENOMEM; + break; + } + + for (i = 0; i < nrpages; ++i) { + pages[i] = erofs_allocpage(&pagepool, GFP_KERNEL); + if (!pages[i]) { + ret = -ENOMEM; + oldpages = pages; + goto free_pagearray; + } + } + ptr = vmap(pages, nrpages, VM_MAP, PAGE_KERNEL); + if (!ptr) { + ret = -ENOMEM; + oldpages = pages; + goto free_pagearray; + } + raw_spin_lock(&pcb->lock); + old_ptr = pcb->ptr; + pcb->ptr = ptr; + oldpages = pcb->pages; + pcb->pages = pages; + i = pcb->nrpages; + pcb->nrpages = nrpages; + raw_spin_unlock(&pcb->lock); + + if (!oldpages) { + DBG_BUGON(old_ptr); + continue; + } + + if (old_ptr) + vunmap(old_ptr); +free_pagearray: + while (i) + list_add(&oldpages[--i]->lru, &pagepool); + kfree(oldpages); + if (ret) + break; + } + pcb_nrpages = nrpages; + put_pages_list(&pagepool); +out: + mutex_unlock(&pcb_resize_mutex); + return ret; +} + +void erofs_pcpubuf_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu); + + raw_spin_lock_init(&pcb->lock); + } +} + +void erofs_pcpubuf_exit(void) +{ + int cpu, i; + + for_each_possible_cpu(cpu) { + struct erofs_pcpubuf *pcb = &per_cpu(erofs_pcb, cpu); + + if (pcb->ptr) { + vunmap(pcb->ptr); + pcb->ptr = NULL; + } + if (!pcb->pages) + continue; + + for (i = 0; i < pcb->nrpages; ++i) + if (pcb->pages[i]) + put_page(pcb->pages[i]); + kfree(pcb->pages); + pcb->pages = NULL; + } +} diff --git a/fs/erofs/super.c b/fs/erofs/super.c index ba63008b2c28..f5c0be845bd7 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -668,6 +668,7 @@ static int __init erofs_module_init(void) if (err) goto shrinker_err; + erofs_pcpubuf_init(); err = z_erofs_init_zip_subsystem(); if (err) goto zip_err; @@ -697,6 +698,7 @@ static void __exit erofs_module_exit(void) /* Ensure all RCU free inodes are safe before cache is destroyed. 
*/ rcu_barrier(); kmem_cache_destroy(erofs_inode_cachep); + erofs_pcpubuf_exit(); } /* get filesystem statistics */ diff --git a/fs/erofs/utils.c b/fs/erofs/utils.c index aab4a607f028..160554accdfb 100644 --- a/fs/erofs/utils.c +++ b/fs/erofs/utils.c @@ -20,18 +20,6 @@ struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp) return page; } -#if (EROFS_PCPUBUF_NR_PAGES > 0) -static struct { - u8 data[PAGE_SIZE * EROFS_PCPUBUF_NR_PAGES]; -} ____cacheline_aligned_in_smp erofs_pcpubuf[NR_CPUS]; - -void *erofs_get_pcpubuf(unsigned int pagenr) -{ - preempt_disable(); - return &erofs_pcpubuf[smp_processor_id()].data[pagenr * PAGE_SIZE]; -} -#endif - #ifdef CONFIG_EROFS_FS_ZIP /* global shrink count (for all mounted EROFS instances) */ static atomic_long_t erofs_global_shrink_cnt; From e4d6726a4fc8c47d935d0eb30f6baa40aa4c601f Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 7 Apr 2021 12:39:20 +0800 Subject: [PATCH 26/34] erofs: introduce physical cluster slab pools Since multiple pcluster sizes could be used at once, the number of compressed pages will become a variable factor. It's necessary to introduce slab pools rather than a single slab cache now. This limits the pclustersize to 1M (Z_EROFS_PCLUSTER_MAX_SIZE), and gets rid of the obsolete EROFS_FS_CLUSTER_PAGE_LIMIT, which has no use now. Link: https://lore.kernel.org/r/20210407043927.10623-4-xiang@kernel.org Acked-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/Kconfig | 14 ---- fs/erofs/erofs_fs.h | 3 + fs/erofs/internal.h | 3 - fs/erofs/zdata.c | 179 +++++++++++++++++++++++++++++--------------- fs/erofs/zdata.h | 14 ++-- 5 files changed, 127 insertions(+), 86 deletions(-) diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 74b0aaa7114c..858b3339f381 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -76,17 +76,3 @@ config EROFS_FS_ZIP If you don't want to enable compression feature, say N. -config EROFS_FS_CLUSTER_PAGE_LIMIT - int "EROFS Cluster Pages Hard Limit" - depends on EROFS_FS_ZIP - range 1 256 - default "1" - help - Indicates maximum # of pages of a compressed - physical cluster. - - For example, if files in a image were compressed - into 8k-unit, hard limit should not be configured - less than 2. Otherwise, the image will be refused - to mount on this kernel.
- diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 04010fb6b370..b766e65c9164 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -201,6 +201,9 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e) e->e_name_len + le16_to_cpu(e->e_value_size)); } +/* maximum supported size of a physical compression cluster */ +#define Z_EROFS_PCLUSTER_MAX_SIZE (1024 * 1024) + /* available compression algorithm types (for h_algorithmtype) */ enum { Z_EROFS_COMPRESSION_LZ4 = 0, diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index c88daf2b10c3..8c43fe34b92a 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -189,9 +189,6 @@ static inline int erofs_wait_on_workgroup_freezed(struct erofs_workgroup *grp) return v; } #endif /* !CONFIG_SMP */ - -/* hard limit of pages per compressed cluster */ -#define Z_EROFS_CLUSTER_MAX_PAGES (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT) #endif /* !CONFIG_EROFS_FS_ZIP */ /* we strictly follow PAGE_SIZE and no buffer head yet */ diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index f93a23cfe257..2d65ef4253fe 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -9,6 +9,93 @@ #include +/* + * since pclustersize is variable for big pcluster feature, introduce slab + * pools implementation for different pcluster sizes. + */ +struct z_erofs_pcluster_slab { + struct kmem_cache *slab; + unsigned int maxpages; + char name[48]; +}; + +#define _PCLP(n) { .maxpages = n } + +static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = { + _PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128), + _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES) +}; + +static void z_erofs_destroy_pcluster_pool(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { + if (!pcluster_pool[i].slab) + continue; + kmem_cache_destroy(pcluster_pool[i].slab); + pcluster_pool[i].slab = NULL; + } +} + +static int z_erofs_create_pcluster_pool(void) +{ + struct z_erofs_pcluster_slab *pcs; + struct z_erofs_pcluster *a; + unsigned int size; + + for (pcs = pcluster_pool; + pcs < pcluster_pool + ARRAY_SIZE(pcluster_pool); ++pcs) { + size = struct_size(a, compressed_pages, pcs->maxpages); + + sprintf(pcs->name, "erofs_pcluster-%u", pcs->maxpages); + pcs->slab = kmem_cache_create(pcs->name, size, 0, + SLAB_RECLAIM_ACCOUNT, NULL); + if (pcs->slab) + continue; + + z_erofs_destroy_pcluster_pool(); + return -ENOMEM; + } + return 0; +} + +static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int nrpages) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { + struct z_erofs_pcluster_slab *pcs = pcluster_pool + i; + struct z_erofs_pcluster *pcl; + + if (nrpages > pcs->maxpages) + continue; + + pcl = kmem_cache_zalloc(pcs->slab, GFP_NOFS); + if (!pcl) + return ERR_PTR(-ENOMEM); + pcl->pclusterpages = nrpages; + return pcl; + } + return ERR_PTR(-EINVAL); +} + +static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(pcluster_pool); ++i) { + struct z_erofs_pcluster_slab *pcs = pcluster_pool + i; + + if (pcl->pclusterpages > pcs->maxpages) + continue; + + kmem_cache_free(pcs->slab, pcl); + return; + } + DBG_BUGON(1); +} + /* * a compressed_pages[] placeholder in order to avoid * being filled with file pages for in-place decompression. 
@@ -36,12 +123,11 @@ typedef tagptr1_t compressed_page_t; tagptr_fold(compressed_page_t, page, 1) static struct workqueue_struct *z_erofs_workqueue __read_mostly; -static struct kmem_cache *pcluster_cachep __read_mostly; void z_erofs_exit_zip_subsystem(void) { destroy_workqueue(z_erofs_workqueue); - kmem_cache_destroy(pcluster_cachep); + z_erofs_destroy_pcluster_pool(); } static inline int z_erofs_init_workqueue(void) @@ -58,42 +144,16 @@ static inline int z_erofs_init_workqueue(void) return z_erofs_workqueue ? 0 : -ENOMEM; } -static void z_erofs_pcluster_init_once(void *ptr) -{ - struct z_erofs_pcluster *pcl = ptr; - struct z_erofs_collection *cl = z_erofs_primarycollection(pcl); - unsigned int i; - - mutex_init(&cl->lock); - cl->nr_pages = 0; - cl->vcnt = 0; - for (i = 0; i < Z_EROFS_CLUSTER_MAX_PAGES; ++i) - pcl->compressed_pages[i] = NULL; -} - -static void z_erofs_pcluster_init_always(struct z_erofs_pcluster *pcl) -{ - struct z_erofs_collection *cl = z_erofs_primarycollection(pcl); - - atomic_set(&pcl->obj.refcount, 1); - - DBG_BUGON(cl->nr_pages); - DBG_BUGON(cl->vcnt); -} - int __init z_erofs_init_zip_subsystem(void) { - pcluster_cachep = kmem_cache_create("erofs_compress", - Z_EROFS_WORKGROUP_SIZE, 0, - SLAB_RECLAIM_ACCOUNT, - z_erofs_pcluster_init_once); - if (pcluster_cachep) { - if (!z_erofs_init_workqueue()) - return 0; + int err = z_erofs_create_pcluster_pool(); - kmem_cache_destroy(pcluster_cachep); - } - return -ENOMEM; + if (err) + return err; + err = z_erofs_init_workqueue(); + if (err) + z_erofs_destroy_pcluster_pool(); + return err; } enum z_erofs_collectmode { @@ -178,7 +238,6 @@ static void preload_compressed_pages(struct z_erofs_collector *clt, struct list_head *pagepool) { const struct z_erofs_pcluster *pcl = clt->pcl; - const unsigned int clusterpages = BIT(pcl->clusterbits); struct page **pages = clt->compressedpages; pgoff_t index = pcl->obj.index + (pages - pcl->compressed_pages); bool standalone = true; @@ -188,7 +247,7 @@ static void preload_compressed_pages(struct z_erofs_collector *clt, if (clt->mode < COLLECT_PRIMARY_FOLLOWED) return; - for (; pages < pcl->compressed_pages + clusterpages; ++pages) { + for (; pages < pcl->compressed_pages + pcl->pclusterpages; ++pages) { struct page *page; compressed_page_t t; struct page *newpage = NULL; @@ -248,14 +307,13 @@ int erofs_try_to_free_all_cached_pages(struct erofs_sb_info *sbi, struct z_erofs_pcluster *const pcl = container_of(grp, struct z_erofs_pcluster, obj); struct address_space *const mapping = MNGD_MAPPING(sbi); - const unsigned int clusterpages = BIT(pcl->clusterbits); int i; /* * refcount of workgroup is now freezed as 1, * therefore no need to worry about available decompression users. 
*/ - for (i = 0; i < clusterpages; ++i) { + for (i = 0; i < pcl->pclusterpages; ++i) { struct page *page = pcl->compressed_pages[i]; if (!page) @@ -283,13 +341,12 @@ int erofs_try_to_free_cached_page(struct address_space *mapping, struct page *page) { struct z_erofs_pcluster *const pcl = (void *)page_private(page); - const unsigned int clusterpages = BIT(pcl->clusterbits); int ret = 0; /* 0 - busy */ if (erofs_workgroup_try_to_freeze(&pcl->obj, 1)) { unsigned int i; - for (i = 0; i < clusterpages; ++i) { + for (i = 0; i < pcl->pclusterpages; ++i) { if (pcl->compressed_pages[i] == page) { WRITE_ONCE(pcl->compressed_pages[i], NULL); ret = 1; @@ -312,9 +369,9 @@ static inline bool z_erofs_try_inplace_io(struct z_erofs_collector *clt, struct page *page) { struct z_erofs_pcluster *const pcl = clt->pcl; - const unsigned int clusterpages = BIT(pcl->clusterbits); - while (clt->compressedpages < pcl->compressed_pages + clusterpages) { + while (clt->compressedpages < + pcl->compressed_pages + pcl->pclusterpages) { if (!cmpxchg(clt->compressedpages++, NULL, page)) return true; } @@ -434,12 +491,12 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, struct z_erofs_collection *cl; int err; - /* no available workgroup, let's allocate one */ - pcl = kmem_cache_alloc(pcluster_cachep, GFP_NOFS); - if (!pcl) - return -ENOMEM; + /* no available pcluster, let's allocate one */ + pcl = z_erofs_alloc_pcluster(map->m_plen >> PAGE_SHIFT); + if (IS_ERR(pcl)) + return PTR_ERR(pcl); - z_erofs_pcluster_init_always(pcl); + atomic_set(&pcl->obj.refcount, 1); pcl->obj.index = map->m_pa >> PAGE_SHIFT; pcl->length = (map->m_llen << Z_EROFS_PCLUSTER_LENGTH_BIT) | @@ -451,8 +508,6 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, else pcl->algorithmformat = Z_EROFS_COMPRESSION_SHIFTED; - pcl->clusterbits = 0; - /* new pclusters should be claimed as type 1, primary and followed */ pcl->next = clt->owned_head; clt->mode = COLLECT_PRIMARY_FOLLOWED; @@ -464,12 +519,13 @@ static int z_erofs_register_collection(struct z_erofs_collector *clt, * lock all primary followed works before visible to others * and mutex_trylock *never* fails for a new pcluster. 
*/ - mutex_trylock(&cl->lock); + mutex_init(&cl->lock); + DBG_BUGON(!mutex_trylock(&cl->lock)); err = erofs_register_workgroup(inode->i_sb, &pcl->obj); if (err) { mutex_unlock(&cl->lock); - kmem_cache_free(pcluster_cachep, pcl); + z_erofs_free_pcluster(pcl); return -EAGAIN; } /* used to check tail merging loop due to corrupted images */ @@ -518,7 +574,7 @@ static int z_erofs_collector_begin(struct z_erofs_collector *clt, clt->compressedpages = clt->pcl->compressed_pages; if (clt->mode <= COLLECT_PRIMARY) /* cannot do in-place I/O */ - clt->compressedpages += Z_EROFS_CLUSTER_MAX_PAGES; + clt->compressedpages += clt->pcl->pclusterpages; return 0; } @@ -531,9 +587,8 @@ static void z_erofs_rcu_callback(struct rcu_head *head) struct z_erofs_collection *const cl = container_of(head, struct z_erofs_collection, rcu); - kmem_cache_free(pcluster_cachep, - container_of(cl, struct z_erofs_pcluster, - primary_collection)); + z_erofs_free_pcluster(container_of(cl, struct z_erofs_pcluster, + primary_collection)); } void erofs_workgroup_free_rcu(struct erofs_workgroup *grp) @@ -786,9 +841,8 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, struct list_head *pagepool) { struct erofs_sb_info *const sbi = EROFS_SB(sb); - const unsigned int clusterpages = BIT(pcl->clusterbits); struct z_erofs_pagevec_ctor ctor; - unsigned int i, outputsize, llen, nr_pages; + unsigned int i, inputsize, outputsize, llen, nr_pages; struct page *pages_onstack[Z_EROFS_VMAP_ONSTACK_PAGES]; struct page **pages, **compressed_pages, *page; @@ -868,7 +922,7 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, overlapped = false; compressed_pages = pcl->compressed_pages; - for (i = 0; i < clusterpages; ++i) { + for (i = 0; i < pcl->pclusterpages; ++i) { unsigned int pagenr; page = compressed_pages[i]; @@ -921,12 +975,13 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, partial = true; } + inputsize = pcl->pclusterpages * PAGE_SIZE; err = z_erofs_decompress(&(struct z_erofs_decompress_req) { .sb = sb, .in = compressed_pages, .out = pages, .pageofs_out = cl->pageofs, - .inputsize = PAGE_SIZE, + .inputsize = inputsize, .outputsize = outputsize, .alg = pcl->algorithmformat, .inplace_io = overlapped, @@ -935,7 +990,7 @@ static int z_erofs_decompress_pcluster(struct super_block *sb, out: /* must handle all compressed pages before ending pages */ - for (i = 0; i < clusterpages; ++i) { + for (i = 0; i < pcl->pclusterpages; ++i) { page = compressed_pages[i]; if (erofs_page_is_managed(sbi, page)) @@ -1237,7 +1292,7 @@ static void z_erofs_submit_queue(struct super_block *sb, pcl = container_of(owned_head, struct z_erofs_pcluster, next); cur = pcl->obj.index; - end = cur + BIT(pcl->clusterbits); + end = cur + pcl->pclusterpages; /* close the main owned chain at first */ owned_head = cmpxchg(&pcl->next, Z_EROFS_PCLUSTER_TAIL, diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 658bda5efcda..3a008f1b9f78 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -9,6 +9,7 @@ #include "internal.h" #include "zpvec.h" +#define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) #define Z_EROFS_NR_INLINE_PAGEVECS 3 /* @@ -58,16 +59,17 @@ struct z_erofs_pcluster { /* A: point to next chained pcluster or TAILs */ z_erofs_next_pcluster_t next; - /* A: compressed pages (including multi-usage pages) */ - struct page *compressed_pages[Z_EROFS_CLUSTER_MAX_PAGES]; - /* A: lower limit of decompressed length and if full length or not */ unsigned int length; + /* I: physical cluster size in pages */ + 
unsigned short pclusterpages; + /* I: compression algorithm format */ unsigned char algorithmformat; - /* I: bit shift of physical cluster size */ - unsigned char clusterbits; + + /* A: compressed pages (can be cached or inplaced pages) */ + struct page *compressed_pages[]; }; #define z_erofs_primarycollection(pcluster) (&(pcluster)->primary_collection) @@ -81,8 +83,6 @@ struct z_erofs_pcluster { #define Z_EROFS_PCLUSTER_NIL (NULL) -#define Z_EROFS_WORKGROUP_SIZE sizeof(struct z_erofs_pcluster) - struct z_erofs_decompressqueue { struct super_block *sb; atomic_t pending_bios; From 57a75ccc50bf3191c189f89c6dca10d641b4377c Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 7 Apr 2021 12:39:21 +0800 Subject: [PATCH 27/34] erofs: fix up inplace I/O pointer for big pcluster When picking up inplace I/O pages, they should be traversed in reverse order, aligned with the traversal order of file-backed online pages (see the sketch below). Also, the index should be updated together when preloading compressed pages. Previously, only page-sized pclustersize was supported, so this was not a problem at all. Also rename `compressedpages' to `icpage_ptr' to reflect its functionality.
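[ Editor's note: a simplified sketch of the reverse pickup described above; claim_inplace_slot() is a hypothetical stand-in for the reworked z_erofs_try_inplace_io() in the diff below: ]

static bool claim_inplace_slot(struct page **slots, unsigned int nr,
			       struct page *page)
{
	struct page **p = slots + nr;

	/* scan backwards so that slot pickup matches the reverse
	 * traversal order of file-backed online pages */
	while (p > slots)
		if (!cmpxchg(--p, NULL, page))
			return true;	/* free (NULL) slot claimed */
	return false;
}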
Link: https://lore.kernel.org/r/20210407043927.10623-5-xiang@kernel.org Acked-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 2d65ef4253fe..2d2cbfac500b 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -203,7 +203,8 @@ struct z_erofs_collector { struct z_erofs_pcluster *pcl, *tailpcl; struct z_erofs_collection *cl; - struct page **compressedpages; + /* a pointer used to pick up inplace I/O pages */ + struct page **icpage_ptr; z_erofs_next_pcluster_t owned_head; enum z_erofs_collectmode mode; @@ -237,17 +238,19 @@ static void preload_compressed_pages(struct z_erofs_collector *clt, enum z_erofs_cache_alloctype type, struct list_head *pagepool) { - const struct z_erofs_pcluster *pcl = clt->pcl; - struct page **pages = clt->compressedpages; - pgoff_t index = pcl->obj.index + (pages - pcl->compressed_pages); + struct z_erofs_pcluster *pcl = clt->pcl; bool standalone = true; gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) | __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; + struct page **pages; + pgoff_t index; if (clt->mode < COLLECT_PRIMARY_FOLLOWED) return; - for (; pages < pcl->compressed_pages + pcl->pclusterpages; ++pages) { + pages = pcl->compressed_pages; + index = pcl->obj.index; + for (; index < pcl->obj.index + pcl->pclusterpages; ++index, ++pages) { struct page *page; compressed_page_t t; struct page *newpage = NULL; @@ -365,16 +368,14 @@ int erofs_try_to_free_cached_page(struct address_space *mapping, } /* page_type must be Z_EROFS_PAGE_TYPE_EXCLUSIVE */ -static inline bool z_erofs_try_inplace_io(struct z_erofs_collector *clt, - struct page *page) +static bool z_erofs_try_inplace_io(struct z_erofs_collector *clt, + struct page *page) { struct z_erofs_pcluster *const pcl = clt->pcl; - while (clt->compressedpages < - pcl->compressed_pages + pcl->pclusterpages) { - if (!cmpxchg(clt->compressedpages++, NULL, page)) + while (clt->icpage_ptr > pcl->compressed_pages) + if (!cmpxchg(--clt->icpage_ptr, NULL, page)) return true; - } return false; } @@ -572,9 +573,8 @@ static int z_erofs_collector_begin(struct z_erofs_collector *clt, z_erofs_pagevec_ctor_init(&clt->vector, Z_EROFS_NR_INLINE_PAGEVECS, clt->cl->pagevec, clt->cl->vcnt); - clt->compressedpages = clt->pcl->compressed_pages; - if (clt->mode <= COLLECT_PRIMARY) /* cannot do in-place I/O */ - clt->compressedpages += clt->pcl->pclusterpages; + /* since file-backed online pages are traversed in reverse order */ + clt->icpage_ptr = clt->pcl->compressed_pages + clt->pcl->pclusterpages; return 0; } From 850be2e6fbc33577dfc8e84d8397cccb893c56eb Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 7 Apr 2021 12:39:22 +0800 Subject: [PATCH 28/34] erofs: add big physical cluster definition Big pcluster indicates that the size of compressed data for each physical pcluster is no longer fixed as block size, but could be more than 1 block (more accurately, 1 logical pcluster). When the big pcluster feature is enabled for head0/1, delta0 of the 1st non-head lcluster index will keep the block count of this pcluster (in lclusters) instead of 1. Otherwise, the compressed size of the pcluster should be 1 lcluster if the pcluster has no non-head lcluster index. Also note that the BIG_PCLUSTER feature reuses the COMPR_CFGS bit since it depends on COMPR_CFGS and will be released together. Link: https://lore.kernel.org/r/20210407043927.10623-6-xiang@kernel.org Acked-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/erofs_fs.h | 19 +++++++++++++++---- fs/erofs/internal.h | 1 + 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index b766e65c9164..f9ac2060af9b 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -19,6 +19,7 @@ */ #define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING 0x00000001 #define EROFS_FEATURE_INCOMPAT_COMPR_CFGS 0x00000002 +#define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER 0x00000002 #define EROFS_ALL_FEATURE_INCOMPAT EROFS_FEATURE_INCOMPAT_LZ4_0PADDING #define EROFS_SB_EXTSLOT_SIZE 16 @@ -214,17 +215,20 @@ enum { /* 14 bytes (+ length field = 16 bytes) */ struct z_erofs_lz4_cfgs { __le16 max_distance; - u8 reserved[12]; + __le16 max_pclusterblks; + u8 reserved[10]; } __packed; /* * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on) * e.g. for 4k logical cluster size, 4B if compacted 2B is off; * (4B) + 2B + (4B) if compacted 2B is on. + * bit 1 : HEAD1 big pcluster (0 - off; 1 - on) + * bit 2 : HEAD2 big pcluster (0 - off; 1 - on) */ -#define Z_EROFS_ADVISE_COMPACTED_2B_BIT 0 - -#define Z_EROFS_ADVISE_COMPACTED_2B (1 << Z_EROFS_ADVISE_COMPACTED_2B_BIT) +#define Z_EROFS_ADVISE_COMPACTED_2B 0x0001 +#define Z_EROFS_ADVISE_BIG_PCLUSTER_1 0x0002 +#define Z_EROFS_ADVISE_BIG_PCLUSTER_2 0x0004 struct z_erofs_map_header { __le32 h_reserved1; @@ -279,6 +283,13 @@ enum { #define Z_EROFS_VLE_DI_CLUSTER_TYPE_BITS 2 #define Z_EROFS_VLE_DI_CLUSTER_TYPE_BIT 0 +/* + * D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the + * compressed block count of a compressed extent (in logical clusters, aka. + * block count of a pcluster).
+ */ +#define Z_EROFS_VLE_DI_D0_CBLKCNT (1 << 11) + struct z_erofs_vle_decompressed_index { __le16 di_advise; /* where to decompress in the head cluster */ diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 8c43fe34b92a..2683709fdacd 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -225,6 +225,7 @@ static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \ EROFS_FEATURE_FUNCS(lz4_0padding, incompat, INCOMPAT_LZ4_0PADDING) EROFS_FEATURE_FUNCS(compr_cfgs, incompat, INCOMPAT_COMPR_CFGS) +EROFS_FEATURE_FUNCS(big_pcluster, incompat, INCOMPAT_BIG_PCLUSTER) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) /* atomic flag definitions */ From 725d21c44d47d04defa4c1d5b52541e991fe9347 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 7 Apr 2021 12:39:23 +0800 Subject: [PATCH 29/34] erofs: adjust per-CPU buffers according to max_pclusterblks Adjust per-CPU buffers on demand since big pcluster definition is available. Also, bail out unsupported pcluster size according to Z_EROFS_PCLUSTER_MAX_SIZE. Link: https://lore.kernel.org/r/20210407043927.10623-7-xiang@kernel.org Acked-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/decompressor.c | 20 ++++++++++++++++---- fs/erofs/internal.h | 2 ++ 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 7292b1b130c6..562e05d7edc2 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -31,6 +31,7 @@ int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, struct z_erofs_lz4_cfgs *lz4, int size) { + struct erofs_sb_info *sbi = EROFS_SB(sb); u16 distance; if (lz4) { @@ -39,16 +40,27 @@ int z_erofs_load_lz4_config(struct super_block *sb, return -EINVAL; } distance = le16_to_cpu(lz4->max_distance); + + sbi->lz4.max_pclusterblks = le16_to_cpu(lz4->max_pclusterblks); + if (!sbi->lz4.max_pclusterblks) { + sbi->lz4.max_pclusterblks = 1; /* reserved case */ + } else if (sbi->lz4.max_pclusterblks > + Z_EROFS_PCLUSTER_MAX_SIZE / EROFS_BLKSIZ) { + erofs_err(sb, "too large lz4 pclusterblks %u", + sbi->lz4.max_pclusterblks); + return -EINVAL; + } else if (sbi->lz4.max_pclusterblks >= 2) { + erofs_info(sb, "EXPERIMENTAL big pcluster feature in use. Use at your own risk!"); + } } else { distance = le16_to_cpu(dsb->u1.lz4_max_distance); + sbi->lz4.max_pclusterblks = 1; } - EROFS_SB(sb)->lz4.max_distance_pages = distance ? + sbi->lz4.max_distance_pages = distance ? 
DIV_ROUND_UP(distance, PAGE_SIZE) + 1 : LZ4_MAX_DISTANCE_PAGES; - - /* TODO: use max pclusterblks after bigpcluster is enabled */ - return erofs_pcpubuf_growsize(1); + return erofs_pcpubuf_growsize(sbi->lz4.max_pclusterblks); } static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq, diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 2683709fdacd..d510f8ff8f60 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -49,6 +49,8 @@ typedef u32 erofs_blk_t; struct erofs_sb_lz4_info { /* # of pages needed for EROFS lz4 rolling decompression */ u16 max_distance_pages; + /* maximum possible blocks for pclusters in the filesystem */ + u16 max_pclusterblks; }; struct erofs_sb_info { From 3cc2aca3bfc40d675d18310f0c0b57e31472b1b7 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 7 Apr 2021 12:39:24 +0800 Subject: [PATCH 30/34] erofs: support parsing big pcluster compress indexes When INCOMPAT_BIG_PCLUSTER sb feature is enabled, legacy compress indexes will also have the same on-disk header compact indexes to keep per-file configurations instead of leaving it zeroed. If ADVISE_BIG_PCLUSTER is set for a file, CBLKCNT will be loaded for each pcluster in this file by parsing 1st non-head lcluster. Link: https://lore.kernel.org/r/20210407043927.10623-8-xiang@kernel.org Acked-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/zmap.c | 79 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 73 insertions(+), 6 deletions(-) diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 4f504708c558..ee4345a86dca 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -10,8 +10,10 @@ int z_erofs_fill_inode(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); + struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); - if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) { + if (!erofs_sb_has_big_pcluster(sbi) && + vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY) { vi->z_advise = 0; vi->z_algorithmtype[0] = 0; vi->z_algorithmtype[1] = 0; @@ -48,7 +50,8 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) goto out_unlock; - DBG_BUGON(vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY); + DBG_BUGON(!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && + vi->datalayout == EROFS_INODE_FLAT_COMPRESSION_LEGACY); pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + vi->xattr_isize, 8); @@ -95,7 +98,7 @@ struct z_erofs_maprecorder { u8 type; u16 clusterofs; u16 delta[2]; - erofs_blk_t pblk; + erofs_blk_t pblk, compressedlcs; }; static int z_erofs_reload_indexes(struct z_erofs_maprecorder *m, @@ -158,6 +161,15 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m, case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: m->clusterofs = 1 << vi->z_logical_clusterbits; m->delta[0] = le16_to_cpu(di->di_u.delta[0]); + if (m->delta[0] & Z_EROFS_VLE_DI_D0_CBLKCNT) { + if (!(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + m->compressedlcs = m->delta[0] & + ~Z_EROFS_VLE_DI_D0_CBLKCNT; + m->delta[0] = 1; + } m->delta[1] = le16_to_cpu(di->di_u.delta[1]); break; case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: @@ -365,6 +377,58 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, return 0; } +static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, + unsigned int initial_lcn) +{ + struct erofs_inode *const vi = EROFS_I(m->inode); + struct erofs_map_blocks *const map = m->map; + const unsigned int lclusterbits = vi->z_logical_clusterbits; + 
unsigned long lcn; + int err; + + DBG_BUGON(m->type != Z_EROFS_VLE_CLUSTER_TYPE_PLAIN && + m->type != Z_EROFS_VLE_CLUSTER_TYPE_HEAD); + if (!(map->m_flags & EROFS_MAP_ZIPPED) || + !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1)) { + map->m_plen = 1 << lclusterbits; + return 0; + } + + lcn = m->lcn + 1; + if (m->compressedlcs) + goto out; + if (lcn == initial_lcn) + goto err_bonus_cblkcnt; + + err = z_erofs_load_cluster_from_disk(m, lcn); + if (err) + return err; + + switch (m->type) { + case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: + if (m->delta[0] != 1) + goto err_bonus_cblkcnt; + if (m->compressedlcs) + break; + /* fallthrough */ + default: + erofs_err(m->inode->i_sb, + "cannot found CBLKCNT @ lcn %lu of nid %llu", + lcn, vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } +out: + map->m_plen = m->compressedlcs << lclusterbits; + return 0; +err_bonus_cblkcnt: + erofs_err(m->inode->i_sb, + "bogus CBLKCNT @ lcn %lu of nid %llu", + lcn, vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; +} + int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, int flags) @@ -376,6 +440,7 @@ int z_erofs_map_blocks_iter(struct inode *inode, }; int err = 0; unsigned int lclusterbits, endoff; + unsigned long initial_lcn; unsigned long long ofs, end; trace_z_erofs_map_blocks_iter_enter(inode, map, flags); @@ -394,10 +459,10 @@ int z_erofs_map_blocks_iter(struct inode *inode, lclusterbits = vi->z_logical_clusterbits; ofs = map->m_la; - m.lcn = ofs >> lclusterbits; + initial_lcn = ofs >> lclusterbits; endoff = ofs & ((1 << lclusterbits) - 1); - err = z_erofs_load_cluster_from_disk(&m, m.lcn); + err = z_erofs_load_cluster_from_disk(&m, initial_lcn); if (err) goto unmap_out; @@ -441,10 +506,12 @@ int z_erofs_map_blocks_iter(struct inode *inode, } map->m_llen = end - map->m_la; - map->m_plen = 1 << lclusterbits; map->m_pa = blknr_to_addr(m.pblk); map->m_flags |= EROFS_MAP_MAPPED; + err = z_erofs_get_extent_compressedlen(&m, initial_lcn); + if (err) + goto out; unmap_out: if (m.kaddr) kunmap_atomic(m.kaddr); From d4d480d7cc61524145ba34586ec04f8d0974dd59 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 7 Apr 2021 12:39:25 +0800 Subject: [PATCH 31/34] erofs: support parsing big pcluster compact indexes Different from non-compact indexes, several lclusters are packed as the compact form at once and a unique base blkaddr is stored for each pack, so each lcluster index would take less space on average (e.g. 2 bytes for COMPACT_2B.) btw, that is also why the BIG_PCLUSTER switch should be consistent for compact head0/1. Prior to big pcluster, the size of all pclusters was 1 lcluster. Therefore, when a new HEAD lcluster was scanned, blkaddr would be bumped by 1 lcluster. However, that way doesn't work anymore for big pcluster since we actually don't know the compressed size of pclusters in advance (before reading the CBLKCNT lcluster). So, instead, let blkaddr of each pack be the first pcluster blkaddr with a valid CBLKCNT; in detail, 1) if CBLKCNT starts at the pack, this first valid pcluster is itself, e.g. _____________________________________________________________ |_CBLKCNT0_|_NONHEAD_| .. |_HEAD_|_CBLKCNT1_| ... |_HEAD_| ... ^ = blkaddr base ^ += CBLKCNT0 ^ += CBLKCNT1 2) if CBLKCNT doesn't start at the pack, the first valid pcluster is the next pcluster, e.g. _________________________________________________________ | NONHEAD_| .. |_HEAD_|_CBLKCNT0_| ... |_HEAD_|_HEAD_| ... ^ = blkaddr base ^ += CBLKCNT0 ^ += 1 When a CBLKCNT is found, blkaddr will be increased by CBLKCNT lclusters, or, if a new HEAD is found immediately, blkaddr will be bumped by 1 instead (see the picture above.) Also note that if CBLKCNT is the end of the pack, instead of storing delta1 (distance of the next HEAD lcluster) as normal NONHEADs do, it still uses the compressed block count (delta0) since delta1 can be calculated indirectly but the block count can't. Adjust the decoding logic to fit big pcluster compact indexes as well.
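[ Editor's note: a worked example of the CBLKCNT encoding decoded above, assuming 4KiB lclusters (lclusterbits = 12) and the Z_EROFS_VLE_DI_D0_CBLKCNT definition (1 << 11) from PATCH 28; example_decode_cblkcnt() is illustrative, not part of the patch: ]

static unsigned int example_decode_cblkcnt(unsigned int delta0)
{
	/* a pcluster compressed into 3 blocks stores, in its 1st NONHEAD
	 * lcluster, delta0 = Z_EROFS_VLE_DI_D0_CBLKCNT | 3 = 0x803;
	 * clearing the marker bit recovers the block count (0x803 -> 3),
	 * and m_plen then becomes 3 << 12 = 12KiB.
	 */
	return delta0 & ~Z_EROFS_VLE_DI_D0_CBLKCNT;
}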
Link: https://lore.kernel.org/r/20210407043927.10623-9-xiang@kernel.org Acked-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/zmap.c | 70 ++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 61 insertions(+), 9 deletions(-) diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index ee4345a86dca..9bf2c9170989 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -76,6 +76,22 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) } vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7); + if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && + vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | + Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { + erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu", + vi->nid); + err = -EFSCORRUPTED; + goto unmap_done; + } + if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION && + !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^ + !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { + erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu", + vi->nid); + err = -EFSCORRUPTED; + goto unmap_done; + } /* paired with smp_mb() at the beginning of the function */ smp_mb(); set_bit(EROFS_I_Z_INITED_BIT, &vi->flags); @@ -206,6 +222,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, unsigned int vcnt, base, lo, encodebits, nblk; int i; u8 *in, type; + bool big_pcluster; if (1 << amortizedshift == 4) vcnt = 2; @@ -214,6 +231,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, else return -EOPNOTSUPP; + big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1; encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt; base = round_down(eofs, vcnt << amortizedshift); in = m->kaddr + base; @@ -225,7 +243,15 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m, m->type = type; if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { m->clusterofs = 1 << lclusterbits; - if (i + 1 != vcnt) { + if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) { + if (!big_pcluster) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + m->compressedlcs = lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT; + m->delta[0] = 1; + return 0; + } else if (i + 1 != (int)vcnt) { m->delta[0] = lo; return 0; } @@ -238,22 +264,48 @@ ... in, encodebits * (i - 1), &type); if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) lo = 0; + else if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) + lo = 1; m->delta[0] = lo + 1; return 0; } m->clusterofs = lo; m->delta[0] = 0; /* figout out blkaddr (pblk) for HEAD lclusters */ - nblk = 1; - while (i > 0) { - --i; - lo = decode_compactedbits(lclusterbits, lomask, - in, encodebits * i, &type); - if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) - i -= lo; + if (!big_pcluster) { + nblk = 1; + while (i > 0) { + --i; + lo = decode_compactedbits(lclusterbits, lomask, + in, encodebits * i, &type); + if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) + i -= lo; - if (i >= 0) + if (i >= 0) + ++nblk; + } + } else { + nblk = 0; + while (i > 0) { + --i; + lo = decode_compactedbits(lclusterbits, lomask, + in, encodebits * i, &type); + if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) { + if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) { + --i; + nblk += lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT; + continue; + } + /* bigpcluster shouldn't have plain d0 == 1 */ + if (lo <= 1) { + DBG_BUGON(1); + return -EFSCORRUPTED; + } + i -= lo - 2; + continue; + } ++nblk; + } } in += (vcnt << amortizedshift) - sizeof(__le32); m->pblk = le32_to_cpu(*(__le32 *)in) + nblk; From 3337f637087fce04bcc5ba28f66abdefe21142fa Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Wed, 7 Apr 2021 12:39:26 +0800 Subject: [PATCH 32/34] erofs: support decompressing big pclusters for the lz4 backend Prior to big pcluster, there was only one compressed page, so it'd be easy to map it. However, when big pcluster is enabled, more work needs to be done to handle multiple compressed pages. In detail, - (maptype 0) if there is only one compressed page + no need to copy inplace I/O, just map it directly as we did before; - (maptype 1) if there are more compressed pages + no need to copy inplace I/O, vmap such compressed pages instead; - (maptype 2) if inplace I/O needs to be copied, then use per-CPU buffers for decompression. Another thing is how to detect whether inplace decompression is feasible (it's still quite easy for non-big pclusters): apart from the inplace margin calculation, the inplace I/O page reuse order also needs to be considered for each compressed page. Currently, if the compressed page is the xth page, it shouldn't be reused as [0 ... nrpages_out - nrpages_in + x], otherwise a full copy will be triggered. Although there are some extra optimization ideas for this, I'd like to make big pcluster work correctly first and obviously it can be further optimized later since it has nothing to do with the on-disk format at all.
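[ Editor's note: a condensed sketch of the maptype selection described above; no_copy is a pseudo-condition standing for "inplace I/O pages won't be overwritten", and the real checks live in z_erofs_handle_inplace_io() in the diff below: ]

	if (nrpages_in == 1 && no_copy)
		maptype = 0;	/* kmap the single compressed page directly */
	else if (no_copy)
		maptype = 1;	/* vmap all compressed pages */
	else
		maptype = 2;	/* copy compressed data to a per-CPU buffer */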
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index ee4345a86dca..9bf2c9170989 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -76,6 +76,22 @@ static int z_erofs_fill_inode_lazy(struct inode *inode)
 	}
 	vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7);
 
+	if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
+	    vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
+			    Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
+		erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu",
+			  vi->nid);
+		err = -EFSCORRUPTED;
+		goto unmap_done;
+	}
+	if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION &&
+	    !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^
+	    !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
+		erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu",
+			  vi->nid);
+		err = -EFSCORRUPTED;
+		goto unmap_done;
+	}
 	/* paired with smp_mb() at the beginning of the function */
 	smp_mb();
 	set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
@@ -206,6 +222,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
 	unsigned int vcnt, base, lo, encodebits, nblk;
 	int i;
 	u8 *in, type;
+	bool big_pcluster;
 
 	if (1 << amortizedshift == 4)
 		vcnt = 2;
@@ -214,6 +231,7 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
 	else
 		return -EOPNOTSUPP;
 
+	big_pcluster = vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1;
 	encodebits = ((vcnt << amortizedshift) - sizeof(__le32)) * 8 / vcnt;
 	base = round_down(eofs, vcnt << amortizedshift);
 	in = m->kaddr + base;
@@ -225,7 +243,15 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
 	m->type = type;
 	if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) {
 		m->clusterofs = 1 << lclusterbits;
-		if (i + 1 != vcnt) {
+		if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) {
+			if (!big_pcluster) {
+				DBG_BUGON(1);
+				return -EFSCORRUPTED;
+			}
+			m->compressedlcs = lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT;
+			m->delta[0] = 1;
+			return 0;
+		} else if (i + 1 != (int)vcnt) {
 			m->delta[0] = lo;
 			return 0;
 		}
@@ -238,22 +264,48 @@ static int unpack_compacted_index(struct z_erofs_maprecorder *m,
 					  in, encodebits * (i - 1), &type);
 		if (type != Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
 			lo = 0;
+		else if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT)
+			lo = 1;
 		m->delta[0] = lo + 1;
 		return 0;
 	}
 	m->clusterofs = lo;
 	m->delta[0] = 0;
 	/* figure out blkaddr (pblk) for HEAD lclusters */
-	nblk = 1;
-	while (i > 0) {
-		--i;
-		lo = decode_compactedbits(lclusterbits, lomask,
-					  in, encodebits * i, &type);
-		if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
-			i -= lo;
+	if (!big_pcluster) {
+		nblk = 1;
+		while (i > 0) {
+			--i;
+			lo = decode_compactedbits(lclusterbits, lomask,
+						  in, encodebits * i, &type);
+			if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD)
+				i -= lo;
 
-		if (i >= 0)
+			if (i >= 0)
+				++nblk;
+		}
+	} else {
+		nblk = 0;
+		while (i > 0) {
+			--i;
+			lo = decode_compactedbits(lclusterbits, lomask,
+						  in, encodebits * i, &type);
+			if (type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD) {
+				if (lo & Z_EROFS_VLE_DI_D0_CBLKCNT) {
+					--i;
+					nblk += lo & ~Z_EROFS_VLE_DI_D0_CBLKCNT;
+					continue;
+				}
+				/* bigpcluster shouldn't have plain d0 == 1 */
+				if (lo <= 1) {
+					DBG_BUGON(1);
+					return -EFSCORRUPTED;
+				}
+				i -= lo - 2;
+				continue;
+			}
 			++nblk;
+		}
 	}
 	in += (vcnt << amortizedshift) - sizeof(__le32);
 	m->pblk = le32_to_cpu(*(__le32 *)in) + nblk;
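
For reference, the bit extraction that decode_compactedbits() performs
can be sketched in userspace roughly as follows (assumptions:
little-endian host, plain memcpy instead of the kernel's unaligned
helpers, and the 2-bit type field sitting directly above the lo bits;
this is a simplification of the kernel helper, not a verbatim copy):

	#include <stdint.h>
	#include <string.h>

	/*
	 * fetch the (lobits + 2)-bit field starting at bit `pos` of `in`:
	 * low `lobits` bits = lo (delta/clusterofs), top 2 bits = type
	 */
	static unsigned int decode_field(unsigned int lobits, const uint8_t *in,
					 unsigned int pos, unsigned int *type)
	{
		uint32_t v;

		memcpy(&v, in + pos / 8, sizeof(v));	/* little-endian host assumed */
		v >>= pos & 7;
		*type = (v >> lobits) & 3;
		return v & ((1U << lobits) - 1);
	}
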
From 3337f637087fce04bcc5ba28f66abdefe21142fa Mon Sep 17 00:00:00 2001
From: Gao Xiang
Date: Wed, 7 Apr 2021 12:39:26 +0800
Subject: [PATCH 32/34] erofs: support decompress big pcluster for lz4 backend

Prior to big pcluster, there was only one compressed page, so it'd be
easy to map it. However, when big pcluster is enabled, more work needs
to be done to handle multiple compressed pages. In detail,

 - (maptype 0) if there is only one compressed page + no need to copy
   inplace I/O, just map it directly as before;

 - (maptype 1) if there are more compressed pages + no need to copy
   inplace I/O, vmap such compressed pages instead;

 - (maptype 2) if inplace I/O needs to be copied, use per-CPU buffers
   for decompression then.

Another thing is how to detect whether inplace decompression is
feasible or not (it's still quite easy for non big pclusters): apart
from the inplace margin calculation, the inplace I/O page reusing order
also needs to be considered for each compressed page. Currently, if the
compressed page is the xth input page, it shouldn't be reused as any of
the output pages [0 ... nrpages_out - nrpages_in + x]; otherwise a full
copy will be triggered.

Although there are some extra optimization ideas for this, I'd like to
make big pcluster work correctly first; obviously it can be further
optimized later since it has nothing to do with the on-disk format at
all.

Link: https://lore.kernel.org/r/20210407043927.10623-10-xiang@kernel.org
Acked-by: Chao Yu
Signed-off-by: Gao Xiang
---
 fs/erofs/decompressor.c | 220 ++++++++++++++++++++++------------------
 fs/erofs/internal.h     |  15 +++
 2 files changed, 139 insertions(+), 96 deletions(-)
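
The page-reuse rule quoted above translates into a short overlap test;
a minimal standalone sketch (with `struct page` left opaque) of the
check that decides between direct/vmapped use and the copy fallback:

	#include <stdbool.h>

	struct page;	/* stands in for the kernel's struct page */

	/*
	 * the i-th input page must not appear among output slots
	 * [0, nrpages_out - nrpages_in + i); otherwise it would be
	 * overwritten before it is consumed and a copy is needed
	 */
	static bool inplace_io_is_safe(struct page **in, unsigned int nrpages_in,
				       struct page **out, unsigned int nrpages_out)
	{
		unsigned int i, j;

		for (i = 0; i < nrpages_in; ++i)
			for (j = 0; j < nrpages_out - nrpages_in + i; ++j)
				if (out[j] == in[i])
					return false;
		return true;
	}
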
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 562e05d7edc2..ad3f31380e6b 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -119,44 +119,85 @@ static int z_erofs_lz4_prepare_destpages(struct z_erofs_decompress_req *rq,
 	return kaddr ? 1 : 0;
 }
 
-static void *generic_copy_inplace_data(struct z_erofs_decompress_req *rq,
-				       u8 *src, unsigned int pageofs_in)
+static void *z_erofs_handle_inplace_io(struct z_erofs_decompress_req *rq,
+			void *inpage, unsigned int *inputmargin, int *maptype,
+			bool support_0padding)
 {
-	/*
-	 * if in-place decompression is ongoing, those decompressed
-	 * pages should be copied in order to avoid being overlapped.
-	 */
-	struct page **in = rq->in;
-	u8 *const tmp = erofs_get_pcpubuf(1);
-	u8 *tmpp = tmp;
-	unsigned int inlen = rq->inputsize - pageofs_in;
-	unsigned int count = min_t(uint, inlen, PAGE_SIZE - pageofs_in);
+	unsigned int nrpages_in, nrpages_out;
+	unsigned int ofull, oend, inputsize, total, i, j;
+	struct page **in;
+	void *src, *tmp;
 
-	while (tmpp < tmp + inlen) {
-		if (!src)
-			src = kmap_atomic(*in);
-		memcpy(tmpp, src + pageofs_in, count);
-		kunmap_atomic(src);
-		src = NULL;
-		tmpp += count;
-		pageofs_in = 0;
-		count = PAGE_SIZE;
-		++in;
+	inputsize = rq->inputsize;
+	nrpages_in = PAGE_ALIGN(inputsize) >> PAGE_SHIFT;
+	oend = rq->pageofs_out + rq->outputsize;
+	ofull = PAGE_ALIGN(oend);
+	nrpages_out = ofull >> PAGE_SHIFT;
+
+	if (rq->inplace_io) {
+		if (rq->partial_decoding || !support_0padding ||
+		    ofull - oend < LZ4_DECOMPRESS_INPLACE_MARGIN(inputsize))
+			goto docopy;
+
+		for (i = 0; i < nrpages_in; ++i) {
+			DBG_BUGON(rq->in[i] == NULL);
+			for (j = 0; j < nrpages_out - nrpages_in + i; ++j)
+				if (rq->out[j] == rq->in[i])
+					goto docopy;
+		}
 	}
-	return tmp;
+
+	if (nrpages_in <= 1) {
+		*maptype = 0;
+		return inpage;
+	}
+	kunmap_atomic(inpage);
+	might_sleep();
+	src = erofs_vm_map_ram(rq->in, nrpages_in);
+	if (!src)
+		return ERR_PTR(-ENOMEM);
+	*maptype = 1;
+	return src;
+
+docopy:
+	/* Or copy compressed data which can be overlapped to per-CPU buffer */
+	in = rq->in;
+	src = erofs_get_pcpubuf(nrpages_in);
+	if (!src) {
+		DBG_BUGON(1);
+		kunmap_atomic(inpage);
+		return ERR_PTR(-EFAULT);
+	}
+
+	tmp = src;
+	total = rq->inputsize;
+	while (total) {
+		unsigned int page_copycnt =
+			min_t(unsigned int, total, PAGE_SIZE - *inputmargin);
+
+		if (!inpage)
+			inpage = kmap_atomic(*in);
+		memcpy(tmp, inpage + *inputmargin, page_copycnt);
+		kunmap_atomic(inpage);
+		inpage = NULL;
+		tmp += page_copycnt;
+		total -= page_copycnt;
+		++in;
+		*inputmargin = 0;
+	}
+	*maptype = 2;
+	return src;
 }
 
 static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
 {
-	unsigned int inputmargin, inlen;
-	u8 *src;
-	bool copied, support_0padding;
-	int ret;
+	unsigned int inputmargin;
+	u8 *headpage, *src;
+	bool support_0padding;
+	int ret, maptype;
 
-	if (rq->inputsize > PAGE_SIZE)
-		return -EOPNOTSUPP;
-
-	src = kmap_atomic(*rq->in);
+	DBG_BUGON(*rq->in == NULL);
+	headpage = kmap_atomic(*rq->in);
 	inputmargin = 0;
 	support_0padding = false;
@@ -164,49 +205,36 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
 	if (erofs_sb_has_lz4_0padding(EROFS_SB(rq->sb))) {
 		support_0padding = true;
 
-		while (!src[inputmargin & ~PAGE_MASK])
+		while (!headpage[inputmargin & ~PAGE_MASK])
 			if (!(++inputmargin & ~PAGE_MASK))
 				break;
 
 		if (inputmargin >= rq->inputsize) {
-			kunmap_atomic(src);
+			kunmap_atomic(headpage);
 			return -EIO;
 		}
 	}
 
-	copied = false;
-	inlen = rq->inputsize - inputmargin;
-	if (rq->inplace_io) {
-		const uint oend = (rq->pageofs_out +
-				   rq->outputsize) & ~PAGE_MASK;
-		const uint nr = PAGE_ALIGN(rq->pageofs_out +
-					   rq->outputsize) >> PAGE_SHIFT;
-
-		if (rq->partial_decoding || !support_0padding ||
-		    rq->out[nr - 1] != rq->in[0] ||
-		    rq->inputsize - oend <
-		    LZ4_DECOMPRESS_INPLACE_MARGIN(inlen)) {
-			src = generic_copy_inplace_data(rq, src, inputmargin);
-			inputmargin = 0;
-			copied = true;
-		}
-	}
+	rq->inputsize -= inputmargin;
+	src = z_erofs_handle_inplace_io(rq, headpage, &inputmargin, &maptype,
+					support_0padding);
+	if (IS_ERR(src))
+		return PTR_ERR(src);
 
 	/* legacy format could compress extra data in a pcluster. */
 	if (rq->partial_decoding || !support_0padding)
 		ret = LZ4_decompress_safe_partial(src + inputmargin, out,
-						  inlen, rq->outputsize,
-						  rq->outputsize);
+				rq->inputsize, rq->outputsize, rq->outputsize);
 	else
 		ret = LZ4_decompress_safe(src + inputmargin, out,
-					  inlen, rq->outputsize);
+					  rq->inputsize, rq->outputsize);
 
 	if (ret != rq->outputsize) {
 		erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]",
-			  ret, inlen, inputmargin, rq->outputsize);
+			  ret, rq->inputsize, inputmargin, rq->outputsize);
 
 		print_hex_dump(KERN_DEBUG, "[ in]: ", DUMP_PREFIX_OFFSET,
-			       16, 1, src + inputmargin, inlen, true);
+			       16, 1, src + inputmargin, rq->inputsize, true);
 		print_hex_dump(KERN_DEBUG, "[out]: ", DUMP_PREFIX_OFFSET,
 			       16, 1, out, rq->outputsize, true);
 
@@ -215,10 +243,16 @@ static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, u8 *out)
 		ret = -EIO;
 	}
 
-	if (copied)
-		erofs_put_pcpubuf(src);
-	else
+	if (maptype == 0) {
 		kunmap_atomic(src);
+	} else if (maptype == 1) {
+		vm_unmap_ram(src, PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT);
+	} else if (maptype == 2) {
+		erofs_put_pcpubuf(src);
+	} else {
+		DBG_BUGON(1);
+		return -EFAULT;
+	}
 
 	return ret;
 }
@@ -268,57 +302,51 @@ static int z_erofs_decompress_generic(struct z_erofs_decompress_req *rq,
 	const struct z_erofs_decompressor *alg = decompressors + rq->alg;
 	unsigned int dst_maptype;
 	void *dst;
-	int ret, i;
+	int ret;
 
-	if (nrpages_out == 1 && !rq->inplace_io) {
-		DBG_BUGON(!*rq->out);
-		dst = kmap_atomic(*rq->out);
-		dst_maptype = 0;
-		goto dstmap_out;
-	}
-
-	/*
-	 * For the case of small output size (especially much less
-	 * than PAGE_SIZE), memcpy the decompressed data rather than
-	 * compressed data is preferred.
-	 */
-	if (rq->outputsize <= PAGE_SIZE * 7 / 8) {
-		dst = erofs_get_pcpubuf(1);
-		if (IS_ERR(dst))
-			return PTR_ERR(dst);
-
-		rq->inplace_io = false;
-		ret = alg->decompress(rq, dst);
-		if (!ret)
-			copy_from_pcpubuf(rq->out, dst, rq->pageofs_out,
-					  rq->outputsize);
-
-		erofs_put_pcpubuf(dst);
-		return ret;
+	/* two optimized fast paths only for non bigpcluster cases yet */
+	if (rq->inputsize <= PAGE_SIZE) {
+		if (nrpages_out == 1 && !rq->inplace_io) {
+			DBG_BUGON(!*rq->out);
+			dst = kmap_atomic(*rq->out);
+			dst_maptype = 0;
+			goto dstmap_out;
+		}
+
+		/*
+		 * For the case of small output size (especially much less
+		 * than PAGE_SIZE), memcpy the decompressed data rather than
+		 * compressed data is preferred.
+		 */
+		if (rq->outputsize <= PAGE_SIZE * 7 / 8) {
+			dst = erofs_get_pcpubuf(1);
+			if (IS_ERR(dst))
+				return PTR_ERR(dst);
+
+			rq->inplace_io = false;
+			ret = alg->decompress(rq, dst);
+			if (!ret)
+				copy_from_pcpubuf(rq->out, dst, rq->pageofs_out,
+						  rq->outputsize);
+
+			erofs_put_pcpubuf(dst);
+			return ret;
+		}
 	}
 
+	/* general decoding path which can be used for all cases */
 	ret = alg->prepare_destpages(rq, pagepool);
-	if (ret < 0) {
+	if (ret < 0)
 		return ret;
-	} else if (ret) {
+	if (ret) {
 		dst = page_address(*rq->out);
 		dst_maptype = 1;
 		goto dstmap_out;
 	}
 
-	i = 0;
-	while (1) {
-		dst = vm_map_ram(rq->out, nrpages_out, -1, PAGE_KERNEL);
-
-		/* retry two more times (totally 3 times) */
-		if (dst || ++i >= 3)
-			break;
-		vm_unmap_aliases();
-	}
-
+	dst = erofs_vm_map_ram(rq->out, nrpages_out);
 	if (!dst)
 		return -ENOMEM;
-
 	dst_maptype = 2;
 dstmap_out:
diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h
index d510f8ff8f60..c8f904bbe985 100644
--- a/fs/erofs/internal.h
+++ b/fs/erofs/internal.h
@@ -396,6 +396,21 @@ int erofs_namei(struct inode *dir, struct qstr *name,
 /* dir.c */
 extern const struct file_operations erofs_dir_fops;
 
+static inline void *erofs_vm_map_ram(struct page **pages, unsigned int count)
+{
+	int retried = 0;
+
+	while (1) {
+		void *p = vm_map_ram(pages, count, -1, PAGE_KERNEL);
+
+		/* retry two more times (totally 3 times) */
+		if (p || ++retried >= 3)
+			return p;
+		vm_unmap_aliases();
+	}
+	return NULL;
+}
+
 /* pcpubuf.c */
 void *erofs_get_pcpubuf(unsigned int requiredpages);
 void erofs_put_pcpubuf(void *ptr);
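
One design note on the erofs_vm_map_ram() helper added above:
vm_map_ram() can fail transiently while lazily-freed vmap areas are
still pending, so flushing them with vm_unmap_aliases() and retrying a
bounded number of times is cheap insurance. A usage sketch (error
handling trimmed; illustrative only):

	/* map `count` compressed pages contiguously, use them, then unmap */
	void *va = erofs_vm_map_ram(pages, count);

	if (!va)
		return -ENOMEM;
	/* ... read compressed data through `va` ... */
	vm_unmap_ram(va, count);
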
From 50ea9ae0027c3a7b22ebe602c7a9c50c66a0eaeb Mon Sep 17 00:00:00 2001
From: Gao Xiang
Date: Wed, 7 Apr 2021 12:39:27 +0800
Subject: [PATCH 33/34] erofs: enable big pcluster feature

Enable COMPR_CFGS and BIG_PCLUSTER since the implementations are all
settled properly.

Link: https://lore.kernel.org/r/20210407043927.10623-11-xiang@kernel.org
Acked-by: Chao Yu
Signed-off-by: Gao Xiang
---
 fs/erofs/erofs_fs.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h
index f9ac2060af9b..32f79b14ef50 100644
--- a/fs/erofs/erofs_fs.h
+++ b/fs/erofs/erofs_fs.h
@@ -20,7 +20,10 @@
 #define EROFS_FEATURE_INCOMPAT_LZ4_0PADDING	0x00000001
 #define EROFS_FEATURE_INCOMPAT_COMPR_CFGS	0x00000002
 #define EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER	0x00000002
-#define EROFS_ALL_FEATURE_INCOMPAT		EROFS_FEATURE_INCOMPAT_LZ4_0PADDING
+#define EROFS_ALL_FEATURE_INCOMPAT		\
+	(EROFS_FEATURE_INCOMPAT_LZ4_0PADDING | \
+	 EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \
+	 EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER)
 
 #define EROFS_SB_EXTSLOT_SIZE	16
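
Since COMPR_CFGS and BIG_PCLUSTER intentionally share the 0x00000002
bit above, the usual incompat gating still works unchanged; a
standalone sketch of that generic pattern (not copied from the erofs
mount path):

	#define INCOMPAT_LZ4_0PADDING	0x00000001U
	#define INCOMPAT_COMPR_CFGS	0x00000002U
	#define INCOMPAT_BIG_PCLUSTER	0x00000002U	/* shares the bit */
	#define ALL_INCOMPAT		(INCOMPAT_LZ4_0PADDING | \
					 INCOMPAT_COMPR_CFGS | \
					 INCOMPAT_BIG_PCLUSTER)

	/*
	 * refuse to mount when the image requires incompat features this
	 * kernel does not implement
	 */
	static int check_incompat(unsigned int feature_incompat)
	{
		return (feature_incompat & ~ALL_INCOMPAT) ? -1 : 0;
	}
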
From f68c34ca41a19fb33fc6bfaee1b1b383264ad01c Mon Sep 17 00:00:00 2001
From: Gao Xiang
Date: Mon, 10 May 2021 14:47:15 +0800
Subject: [PATCH 34/34] erofs: fix 1 lcluster-sized pcluster for big pcluster

If the 1st NONHEAD lcluster of a pcluster is a HEAD or PLAIN type
rather than the CBLKCNT type, its pclustersize _must_ be 1 lcluster
(since its uncompressed size is < 2 lclusters), as illustrated below:

       HEAD         HEAD / PLAIN     lcluster type
        ____________ ____________
       |_:__________|_________:__|   file data (uncompressed)
        .            .
       .____________.
       |____________|                pcluster data (compressed)

Such an on-disk case was explained before [1] but was not handled
properly in the runtime implementation. It can be observed by manually
generating a 1 lcluster-sized pcluster with 2 lclusters (thus CBLKCNT
doesn't exist.) Let's fix it now.

[1] https://lore.kernel.org/r/20210407043927.10623-1-xiang@kernel.org
Link: https://lore.kernel.org/r/20210510064715.29123-1-xiang@kernel.org
Fixes: cec6e93beadf ("erofs: support parsing big pcluster compress indexes")
Reviewed-by: Chao Yu
Signed-off-by: Gao Xiang
---
 fs/erofs/zmap.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 9bf2c9170989..a7da7ed4bda7 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -449,14 +449,31 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m,
 	lcn = m->lcn + 1;
 	if (m->compressedlcs)
 		goto out;
-	if (lcn == initial_lcn)
-		goto err_bonus_cblkcnt;
 
 	err = z_erofs_load_cluster_from_disk(m, lcn);
 	if (err)
 		return err;
 
+	/*
+	 * If the 1st NONHEAD lcluster has already been handled initially w/o
+	 * valid compressedlcs, which means at least it mustn't be CBLKCNT, or
+	 * an internal implementation error is detected.
+	 *
+	 * The following code can also handle it properly anyway, but let's
+	 * BUG_ON in the debugging mode only for developers to notice that.
+	 */
+	DBG_BUGON(lcn == initial_lcn &&
+		  m->type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD);
+
 	switch (m->type) {
+	case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN:
+	case Z_EROFS_VLE_CLUSTER_TYPE_HEAD:
+		/*
+		 * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type
+		 * rather than CBLKCNT, it's a 1 lcluster-sized pcluster.
+		 */
+		m->compressedlcs = 1;
+		break;
 	case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD:
 		if (m->delta[0] != 1)
 			goto err_bonus_cblkcnt;
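
To summarize the dispatch that the (truncated) hunk above completes, a
condensed standalone sketch (lcluster types reduced to an enum; return
values are lcluster counts, with -1 meaning a corrupted image):

	enum lc_type { PLAIN, HEAD, NONHEAD };

	/*
	 * compressed size, in lclusters, of the pcluster whose HEAD is
	 * immediately followed by the lcluster described here
	 */
	static int pcluster_lclusters(enum lc_type next, unsigned int delta0,
				      unsigned int cblkcnt)
	{
		switch (next) {
		case PLAIN:
		case HEAD:
			return 1;	/* no CBLKCNT: 1 lcluster-sized pcluster */
		case NONHEAD:
			if (delta0 == 1 && cblkcnt)
				return cblkcnt;	/* valid CBLKCNT lcluster */
			return -1;	/* bogus CBLKCNT */
		}
		return -1;
	}
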