**Next message:**David Miller: "Re: [PATCH 0/5 v3] Fix NVMe driver support on Power with 32-bit DMA"**Previous message:**Viresh Kumar: "[PATCH 3/3] PM / OPP: Remove 'operating-points-names' binding"**In reply to:**Alexey Brodkin: "Re: [PATCH] __div64_32: implement division by multiplication for 32-bit arches"**Next in thread:**Vineet Gupta: "Re: [PATCH] __div64_32: implement division by multiplication for 32-bit arches"**Messages sorted by:**[ date ] [ thread ] [ subject ] [ author ]

On Wed, 28 Oct 2015, Nicolas Pitre wrote:

>

>

>

>

>

>

>

>

>

>

>

>

>

>

Well... not as close at least on ARM. Maybe 2x to 3x more costly than

the one with assembly. Still better than 100x or so without this

optimization.

>

>

>

>

>

>

>

>

>

>

>

>

>

>

OK... I was intrigued, so I adapted my ARM code to the generic case,

including the overflow avoidance optimizations. Please have look and

tell me how this works for you.

If this patch is accepted upstream, then it could be possible to

abstract only the actual multiplication part with some architecture

specific assembly.

diff --git a/include/asm-generic/div64.h b/include/asm-generic/div64.h

index 8f4e319334..43c3b21dca 100644

--- a/include/asm-generic/div64.h

+++ b/include/asm-generic/div64.h

@@ -32,6 +32,149 @@

#elif BITS_PER_LONG == 32

+/* macroized fls implementation to ensure proper constant propagation */

+#define __div64_fls(bits) \

+({ \

+ unsigned int __left = (bits), __nr = 0; \

+ if (__left & 0xffff0000) __nr += 16, __left >>= 16; \

+ if (__left & 0x0000ff00) __nr += 8, __left >>= 8; \

+ if (__left & 0x000000f0) __nr += 4, __left >>= 4; \

+ if (__left & 0x0000000c) __nr += 2, __left >>= 2; \

+ if (__left & 0x00000002) __nr += 1; \

+ __nr; \

+})

+

+/*

+ * If the divisor happens to be constant, we determine the appropriate

+ * inverse at compile time to turn the division into a few inline

+ * multiplications which ought to be much faster. And yet only if compiling

+ * with a sufficiently recent gcc version to perform proper 64-bit constant

+ * propagation.

+ *

+ * (It is unfortunate that gcc doesn't perform all this internally.)

+ */

+#define __div64_const32(n, ___b) \

+({ \

+ /* \

+ * Multiplication by reciprocal of b: n / b = n * (p / b) / p \

+ * \

+ * We rely on the fact that most of this code gets optimized \

+ * away at compile time due to constant propagation and only \

+ * a few multiplication instructions should remain. \

+ * Hence this monstrous macro (static inline doesn't always \

+ * do the trick for some reason). \

+ */ \

+ uint64_t ___res, ___x, ___t, ___m, ___n = (n); \

+ uint32_t ___c, ___p, ___m_lo, ___m_hi, ___n_lo, ___n_hi; \

+ \

+ /* determine number of bits to represent b */ \

+ ___p = 1 << __div64_fls(___b); \

+ \

+ /* compute m = ((p << 64) + b - 1) / b */ \

+ ___m = (~0ULL / ___b) * ___p; \

+ ___m += (((~0ULL % ___b + 1) * ___p) + ___b - 1) / ___b; \

+ \

+ /* dividend that produces one less than the highest result */ \

+ ___x = ~0ULL / ___b * ___b - 1; \

+ \

+ /* test our m with res = m * x / (p << 64) */ \

+ ___res = ((___m & 0xffffffff) * (___x & 0xffffffff)) >> 32; \

+ ___t = ___res += (___m & 0xffffffff) * (___x >> 32); \

+ ___res += (___x & 0xffffffff) * (___m >> 32); \

+ ___t = (___res < ___t) ? (1ULL << 32) : 0; \

+ ___res = (___res >> 32) + ___t; \

+ ___res += (___m >> 32) * (___x >> 32); \

+ ___res /= ___p; \

+ \

+ /* Now sanitize and optimize what we've got. */ \

+ if (~0ULL % (___b / (___b & -___b)) == 0) { \

+ /* those cases can be simplified with: */ \

+ ___n /= (___b & -___b); \

+ ___m = ~0ULL / (___b / (___b & -___b)); \

+ ___p = 1; \

+ ___c = 1; \

+ } else if (___res != ___x / ___b) { \

+ /* \

+ * We can't get away without a correction to compensate \

+ * for bit truncation errors. To avoid it we'd need an \

+ * additional bit to represent m which would overflow \

+ * our 64-bit variable. \

+ * \

+ * Instead we do m = p / b and n / b = (n * m + m) / p. \

+ */ \

+ ___c = 1; \

+ /* Compute m = (p << 64) / b */ \

+ ___m = (~0ULL / ___b) * ___p; \

+ ___m += ((~0ULL % ___b + 1) * ___p) / ___b; \

+ } else { \

+ /* \

+ * Reduce m / p, and try to clear bit 31 of m when \

+ * possible, otherwise that'll need extra overflow \

+ * handling later. \

+ */ \

+ unsigned int ___bits = -(___m & -___m); \

+ ___bits |= ___m >> 32; \

+ ___bits = (~___bits) << 1; \

+ /* \

+ * If ___bits == 0 then setting bit 31 is unavoidable. \

+ * Simply apply the maximum possible reduction in that \

+ * case. Otherwise the MSB of ___bits indicates the \

+ * best reduction we should apply. \

+ */ \

+ if (!___bits) { \

+ ___p /= (___m & -___m); \

+ ___m /= (___m & -___m); \

+ } else { \

+ ___p >>= __div64_fls(___bits); \

+ ___m >>= __div64_fls(___bits); \

+ } \

+ /* No correction needed. */ \

+ ___c = 0; \

+ } \

+ \

+ /* \

+ * Now we have a combination of 2 conditions: \

+ * \

+ * 1) whether or not we need a correction (___c), and \

+ * \

+ * 2) whether or not there might be an overflow in the cross \

+ * product determined by (___m & ((1 << 63) | (1 << 31))). \

+ * \

+ * Select the best way to do the m * n / (p << 64) operation. \

+ * From here on there will be actual runtime code generated. \

+ */ \

+ \

+ ___m_lo = ___m; \

+ ___m_hi = ___m >> 32; \

+ ___n_lo = ___n; \

+ ___n_hi = ___n >> 32; \

+ \

+ if (!___c) { \

+ ___res = ((uint64_t)___m_lo * ___n_lo) >> 32; \

+ } else if (!(___m & ((1ULL << 63) | (1ULL << 31)))) { \

+ ___res = (___m + (uint64_t)___m_lo * ___n_lo) >> 32; \

+ } else { \

+ ___res = ___m + (uint64_t)___m_lo * ___n_lo; \

+ ___t = (___res < ___m) ? (1ULL << 32) : 0; \

+ ___res = (___res >> 32) + ___t; \

+ } \

+ \

+ if (!(___m & ((1ULL << 63) | (1ULL << 31)))) { \

+ ___res += (uint64_t)___m_lo * ___n_hi; \

+ ___res += (uint64_t)___m_hi * ___n_lo; \

+ ___res >>= 32; \

+ } else { \

+ ___t = ___res += (uint64_t)___m_lo * ___n_hi; \

+ ___res += (uint64_t)___m_hi * ___n_lo; \

+ ___t = (___res < ___t) ? (1ULL << 32) : 0; \

+ ___res = (___res >> 32) + ___t; \

+ } \

+ \

+ ___res += (uint64_t)___m_hi * ___n_hi; \

+ \

+ ___res / ___p; \

+})

+

extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor);

/* The unnecessary pointer compare is there

@@ -41,7 +184,20 @@ extern uint32_t __div64_32(uint64_t *dividend, uint32_t divisor);

uint32_t __base = (base); \

uint32_t __rem; \

(void)(((typeof((n)) *)0) == ((uint64_t *)0)); \

- if (likely(((n) >> 32) == 0)) { \

+ if (__builtin_constant_p(__base) && \

+ (__base & (__base - 1)) == 0) { \

+ /* constant power of 2: gcc is fine */ \

+ __rem = (n) & (__base - 1); \

+ (n) /= __base; \

+ } else if ((__GNUC__ >= 4) && \

+ __builtin_constant_p(__base) && \

+ __base != 0) { \

+ uint32_t __res_lo, __n_lo = (n); \

+ (n) = __div64_const32(n, __base); \

+ /* the remainder can be computed with 32-bit regs */ \

+ __res_lo = (n); \

+ __rem = __n_lo - __res_lo * __base; \

+ } else if (likely(((n) >> 32) == 0)) { \

__rem = (uint32_t)(n) % __base; \

(n) = (uint32_t)(n) / __base; \

} else \

--

To unsubscribe from this list: send the line "unsubscribe linux-kernel" in

the body of a message to majordomo@xxxxxxxxxxxxxxx

More majordomo info at http://vger.kernel.org/majordomo-info.html

Please read the FAQ at http://www.tux.org/lkml/

**Next message:**David Miller: "Re: [PATCH 0/5 v3] Fix NVMe driver support on Power with 32-bit DMA"**Previous message:**Viresh Kumar: "[PATCH 3/3] PM / OPP: Remove 'operating-points-names' binding"**In reply to:**Alexey Brodkin: "Re: [PATCH] __div64_32: implement division by multiplication for 32-bit arches"**Next in thread:**Vineet Gupta: "Re: [PATCH] __div64_32: implement division by multiplication for 32-bit arches"**Messages sorted by:**[ date ] [ thread ] [ subject ] [ author ]