[PATCH 2/3] lib: zlib_inflate: improve decompression performance

From: zengzhaoxiu
Date: Mon Nov 09 2020 - 14:28:18 EST


From: Zhaoxiu Zeng <zhaoxiu.zeng@xxxxxxxxx>

This patch:
1. Cleans up the code and reduces the number of branches.
2. Uses copy_from_back() to copy matched bytes from earlier in the output buffer.

Tested on 5.8.18-300.fc33.x86_64:
the performance of zlib_inflate() improves by about 7%.
With CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS disabled in copy_from_back.h,
the improvement is about 5%.

Signed-off-by: Zhaoxiu Zeng <zhaoxiu.zeng@xxxxxxxxx>
---
lib/zlib_inflate/inffast.c | 122 ++++++-------------------------------
1 file changed, 17 insertions(+), 105 deletions(-)

diff --git a/lib/zlib_inflate/inffast.c b/lib/zlib_inflate/inffast.c
index ed1f3df27260..c27e45fc5335 100644
--- a/lib/zlib_inflate/inffast.c
+++ b/lib/zlib_inflate/inffast.c
@@ -4,29 +4,13 @@
*/

#include <linux/zutil.h>
+#include <asm/copy_from_back.h>
#include "inftrees.h"
#include "inflate.h"
#include "inffast.h"

#ifndef ASMINF

-union uu {
- unsigned short us;
- unsigned char b[2];
-};
-
-/* Endian independed version */
-static inline unsigned short
-get_unaligned16(const unsigned short *p)
-{
- union uu mm;
- unsigned char *b = (unsigned char *)p;
-
- mm.b[0] = b[0];
- mm.b[1] = b[1];
- return mm.us;
-}
-
/*
Decode literal, length, and distance codes and write out the resulting
literal and match bytes until either not enough input or output is
@@ -184,104 +168,32 @@ void inflate_fast(z_streamp strm, unsigned start)
state->mode = BAD;
break;
}
- from = window;
- if (write == 0) { /* very common case */
- from += wsize - op;
- if (op < len) { /* some from window */
- len -= op;
- do {
- *out++ = *from++;
- } while (--op);
- from = out - dist; /* rest from output */
- }
- }
- else if (write < op) { /* wrap around window */
- from += wsize + write - op;
- op -= write;
- if (op < len) { /* some from end of window */
- len -= op;
- do {
- *out++ = *from++;
- } while (--op);
- from = window;
- if (write < len) { /* some from start of window */
- op = write;
+ from = window + write - op;
+ if (write < op) { /* very common case */
+ from += wsize;
+ if (write) { /* wrap around window */
+ op -= write;
+ if (op < len) { /* some from end of window */
len -= op;
do {
*out++ = *from++;
} while (--op);
- from = out - dist; /* rest from output */
+ from = window; /* some from start of window */
+ op = write;
}
}
}
- else { /* contiguous in window */
- from += write - op;
- if (op < len) { /* some from window */
- len -= op;
- do {
- *out++ = *from++;
- } while (--op);
- from = out - dist; /* rest from output */
- }
- }
- while (len > 2) {
- *out++ = *from++;
- *out++ = *from++;
- *out++ = *from++;
- len -= 3;
- }
- if (len) {
- *out++ = *from++;
- if (len > 1)
+ if (op < len) { /* some from window */
+ len -= op; /* rest from output */
+ do {
*out++ = *from++;
+ } while (--op);
+ } else {
+ dist = out - from;
}
}
- else {
- unsigned short *sout;
- unsigned long loops;
-
- from = out - dist; /* copy direct from output */
- /* minimum length is three */
- /* Align out addr */
- if (!((long)(out - 1) & 1)) {
- *out++ = *from++;
- len--;
- }
- sout = (unsigned short *)(out);
- if (dist > 2) {
- unsigned short *sfrom;
-
- sfrom = (unsigned short *)(from);
- loops = len >> 1;
- do
-#ifdef CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS
- *sout++ = *sfrom++;
-#else
- *sout++ = get_unaligned16(sfrom++);
-#endif
- while (--loops);
- out = (unsigned char *)sout;
- from = (unsigned char *)sfrom;
- } else { /* dist == 1 or dist == 2 */
- unsigned short pat16;
-
- pat16 = *(sout-1);
- if (dist == 1) {
- union uu mm;
- /* copy one char pattern to both bytes */
- mm.us = pat16;
- mm.b[0] = mm.b[1];
- pat16 = mm.us;
- }
- loops = len >> 1;
- do
- *sout++ = pat16;
- while (--loops);
- out = (unsigned char *)sout;
- }
- if (len & 1)
- *out++ = *from++;
- }
+ /* copy direct from output */
+ out = copy_from_back(out, dist, len);
}
else if ((op & 64) == 0) { /* 2nd level distance code */
this = dcode[this.val + (hold & ((1U << op) - 1))];
--
2.28.0