Re: [patch 00/2] improve .text size on gcc 4.0 and newer compilers

From: Ingo Molnar
Date: Fri Dec 30 2005 - 04:40:11 EST



* Andi Kleen <ak@xxxxxxx> wrote:

> There are important exceptions like:
>
> - Code that really wants to do compile time constant resolution
> (like the x86 copy_*_user) and even throws linker errors when wrong.
> - Anything in an include file (otherwise it gets duplicated for
> every #include, which can actually increase text size a lot)
> - There is some code which absolutely needs inline in the x86-64
> vsyscall code.
>
> But arguably they should be force_inline.
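
( for the record, the "throws linker errors when wrong" trick works
  roughly like this - a stripped-down sketch of the pattern, not the
  actual copy_*_user code, and the helper name below is made up: )

	/* deliberately never defined anywhere: */
	extern void __copy_size_is_too_large(void);

	static inline __attribute__((always_inline))
	void copy_small(void *to, const void *from, unsigned long n)
	{
		/*
		 * if n is a compile-time constant and out of range, gcc
		 * must prove the call below dead and drop it; if it cannot
		 * (wrong size, or the function was not really inlined),
		 * the undefined reference survives to the final link and
		 * the build fails there.
		 */
		if (__builtin_constant_p(n) && n > 8)
			__copy_size_is_too_large();
		__builtin_memcpy(to, from, n);
	}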

FYI, i picked up a couple of those in the 3rd patch that i sent
yesterday (see below too). That patch marks a handful of functions
__always_inline. This reduced text size by another 2-3% - not bad for
such a small patch:

asm-i386/apic.h | 6 +++---
asm-i386/bitops.h | 2 +-
asm-i386/current.h | 2 +-
asm-i386/string.h | 8 ++++----
linux/buffer_head.h | 10 +++++-----
linux/byteorder/swab.h | 18 +++++++++---------
linux/mm.h | 2 +-
linux/slab.h | 2 +-
8 files changed, 25 insertions(+), 25 deletions(-)

> I'm not quite sure I buy Ingo's original argument either. If he's only
> looking at text size then, with the above fixed, he would ideally
> prefer not to inline anything (because, apart from the exceptions
> above, .text nearly always shrinks when not inlining). But that's not
> necessarily best for performance.

well, i think the numbers speak for themselves. Here are my latest
results:

----
The effect of the patches on x86, using a generic .config is:

text data bss dec hex filename
3286166 869852 387260 4543278 45532e vmlinux-orig
3194123 955168 387260 4536551 4538e7 vmlinux-inline
3119495 884960 387748 4392203 43050b vmlinux-inline+units
3051709 869380 387748 4308837 41bf65 vmlinux-inline+units+fixes
3049357 868928 387748 4306033 41b471 vmlinux-inline+units+fixes+capable

i.e. a 7.2% code-size reduction. Using a tiny .config gives:

text data bss dec hex filename
437271 77646 32192 547109 85925 vmlinux-orig
452694 77646 32192 562532 89564 vmlinux-inline
431891 77422 32128 541441 84301 vmlinux-inline+units
414803 77422 32128 524353 80041 vmlinux-inline+units+fixes
414020 77422 32128 523570 7fd32 vmlinux-inline+units+fixes+capable

or a 5.3% reduction.

i've also done test-builds with CONFIG_CC_OPTIMIZE_FOR_SIZE disabled:

text data bss dec hex filename
4080998 870384 387260 5338642 517612 vmlinux-orig
4084421 872024 387260 5343705 5189d9 vmlinux-inline
4010957 834048 387748 5232753 4fd871 vmlinux-inline+units
4010039 833112 387748 5230899 4fd133 vmlinux-inline+units+fixes
4007617 833120 387748 5228485 4fc7c5 vmlinux-inline+units+fixes+capable

or a 1.8% code size reduction.

Ingo

--------
Subject: mark a handful of inline functions as 'must inline'

this patch marks a number of functions as 'must inline', so that they
get inlined even when optimizing for size. This saves another 2-3% of
text size when CONFIG_CC_OPTIMIZE_FOR_SIZE is enabled.
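
( for reference, on gcc __always_inline expands to roughly the
  following, which forces inlining even when the -Os inlining
  heuristics would otherwise skip the function: )

	#define __always_inline		inline __attribute__((always_inline))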

Signed-off-by: Ingo Molnar <mingo@xxxxxxx>

----

include/asm-i386/apic.h | 6 +++---
include/asm-i386/bitops.h | 2 +-
include/asm-i386/current.h | 2 +-
include/asm-i386/string.h | 8 ++++----
include/linux/buffer_head.h | 10 +++++-----
include/linux/byteorder/swab.h | 18 +++++++++---------
include/linux/mm.h | 2 +-
include/linux/slab.h | 2 +-
8 files changed, 25 insertions(+), 25 deletions(-)

Index: linux-gcc.q/include/asm-i386/apic.h
===================================================================
--- linux-gcc.q.orig/include/asm-i386/apic.h
+++ linux-gcc.q/include/asm-i386/apic.h
@@ -49,17 +49,17 @@ static inline void lapic_enable(void)
* Basic functions accessing APICs.
*/

-static __inline void apic_write(unsigned long reg, unsigned long v)
+static __always_inline void apic_write(unsigned long reg, unsigned long v)
{
*((volatile unsigned long *)(APIC_BASE+reg)) = v;
}

-static __inline void apic_write_atomic(unsigned long reg, unsigned long v)
+static __always_inline void apic_write_atomic(unsigned long reg, unsigned long v)
{
xchg((volatile unsigned long *)(APIC_BASE+reg), v);
}

-static __inline unsigned long apic_read(unsigned long reg)
+static __always_inline unsigned long apic_read(unsigned long reg)
{
return *((volatile unsigned long *)(APIC_BASE+reg));
}
Index: linux-gcc.q/include/asm-i386/bitops.h
===================================================================
--- linux-gcc.q.orig/include/asm-i386/bitops.h
+++ linux-gcc.q/include/asm-i386/bitops.h
@@ -247,7 +247,7 @@ static inline int test_and_change_bit(in
static int test_bit(int nr, const volatile void * addr);
#endif

-static inline int constant_test_bit(int nr, const volatile unsigned long *addr)
+static __always_inline int constant_test_bit(int nr, const volatile unsigned long *addr)
{
return ((1UL << (nr & 31)) & (addr[nr >> 5])) != 0;
}
Index: linux-gcc.q/include/asm-i386/current.h
===================================================================
--- linux-gcc.q.orig/include/asm-i386/current.h
+++ linux-gcc.q/include/asm-i386/current.h
@@ -5,7 +5,7 @@

struct task_struct;

-static inline struct task_struct * get_current(void)
+static __always_inline struct task_struct * get_current(void)
{
return current_thread_info()->task;
}
Index: linux-gcc.q/include/asm-i386/string.h
===================================================================
--- linux-gcc.q.orig/include/asm-i386/string.h
+++ linux-gcc.q/include/asm-i386/string.h
@@ -201,7 +201,7 @@ __asm__ __volatile__(
return __res;
}

-static inline void * __memcpy(void * to, const void * from, size_t n)
+static __always_inline void * __memcpy(void * to, const void * from, size_t n)
{
int d0, d1, d2;
__asm__ __volatile__(
@@ -223,7 +223,7 @@ return (to);
* This looks ugly, but the compiler can optimize it totally,
* as the count is constant.
*/
-static inline void * __constant_memcpy(void * to, const void * from, size_t n)
+static __always_inline void * __constant_memcpy(void * to, const void * from, size_t n)
{
long esi, edi;
if (!n) return to;
@@ -367,7 +367,7 @@ return s;
* things 32 bits at a time even when we don't know the size of the
* area at compile-time..
*/
-static inline void * __constant_c_memset(void * s, unsigned long c, size_t count)
+static __always_inline void * __constant_c_memset(void * s, unsigned long c, size_t count)
{
int d0, d1;
__asm__ __volatile__(
@@ -416,7 +416,7 @@ extern char *strstr(const char *cs, cons
* This looks horribly ugly, but the compiler can optimize it totally,
* as we by now know that both pattern and count is constant..
*/
-static inline void * __constant_c_and_count_memset(void * s, unsigned long pattern, size_t count)
+static __always_inline void * __constant_c_and_count_memset(void * s, unsigned long pattern, size_t count)
{
switch (count) {
case 0:
Index: linux-gcc.q/include/linux/buffer_head.h
===================================================================
--- linux-gcc.q.orig/include/linux/buffer_head.h
+++ linux-gcc.q/include/linux/buffer_head.h
@@ -72,15 +72,15 @@ struct buffer_head {
* and buffer_foo() functions.
*/
#define BUFFER_FNS(bit, name) \
-static inline void set_buffer_##name(struct buffer_head *bh) \
+static __always_inline void set_buffer_##name(struct buffer_head *bh) \
{ \
set_bit(BH_##bit, &(bh)->b_state); \
} \
-static inline void clear_buffer_##name(struct buffer_head *bh) \
+static __always_inline void clear_buffer_##name(struct buffer_head *bh) \
{ \
clear_bit(BH_##bit, &(bh)->b_state); \
} \
-static inline int buffer_##name(const struct buffer_head *bh) \
+static __always_inline int buffer_##name(const struct buffer_head *bh) \
{ \
return test_bit(BH_##bit, &(bh)->b_state); \
}
@@ -89,11 +89,11 @@ static inline int buffer_##name(const st
* test_set_buffer_foo() and test_clear_buffer_foo()
*/
#define TAS_BUFFER_FNS(bit, name) \
-static inline int test_set_buffer_##name(struct buffer_head *bh) \
+static __always_inline int test_set_buffer_##name(struct buffer_head *bh)\
{ \
return test_and_set_bit(BH_##bit, &(bh)->b_state); \
} \
-static inline int test_clear_buffer_##name(struct buffer_head *bh) \
+static __always_inline int test_clear_buffer_##name(struct buffer_head *bh)\
{ \
return test_and_clear_bit(BH_##bit, &(bh)->b_state); \
} \
Index: linux-gcc.q/include/linux/byteorder/swab.h
===================================================================
--- linux-gcc.q.orig/include/linux/byteorder/swab.h
+++ linux-gcc.q/include/linux/byteorder/swab.h
@@ -130,34 +130,34 @@
#endif /* OPTIMIZE */


-static __inline__ __attribute_const__ __u16 __fswab16(__u16 x)
+static __always_inline __attribute_const__ __u16 __fswab16(__u16 x)
{
return __arch__swab16(x);
}
-static __inline__ __u16 __swab16p(const __u16 *x)
+static __always_inline __u16 __swab16p(const __u16 *x)
{
return __arch__swab16p(x);
}
-static __inline__ void __swab16s(__u16 *addr)
+static __always_inline void __swab16s(__u16 *addr)
{
__arch__swab16s(addr);
}

-static __inline__ __attribute_const__ __u32 __fswab32(__u32 x)
+static __always_inline __attribute_const__ __u32 __fswab32(__u32 x)
{
return __arch__swab32(x);
}
-static __inline__ __u32 __swab32p(const __u32 *x)
+static __always_inline __u32 __swab32p(const __u32 *x)
{
return __arch__swab32p(x);
}
-static __inline__ void __swab32s(__u32 *addr)
+static __always_inline void __swab32s(__u32 *addr)
{
__arch__swab32s(addr);
}

#ifdef __BYTEORDER_HAS_U64__
-static __inline__ __attribute_const__ __u64 __fswab64(__u64 x)
+static __always_inline __attribute_const__ __u64 __fswab64(__u64 x)
{
# ifdef __SWAB_64_THRU_32__
__u32 h = x >> 32;
@@ -167,11 +167,11 @@ static __inline__ __attribute_const__ __
return __arch__swab64(x);
# endif
}
-static __inline__ __u64 __swab64p(const __u64 *x)
+static __always_inline __u64 __swab64p(const __u64 *x)
{
return __arch__swab64p(x);
}
-static __inline__ void __swab64s(__u64 *addr)
+static __always_inline void __swab64s(__u64 *addr)
{
__arch__swab64s(addr);
}
Index: linux-gcc.q/include/linux/mm.h
===================================================================
--- linux-gcc.q.orig/include/linux/mm.h
+++ linux-gcc.q/include/linux/mm.h
@@ -507,7 +507,7 @@ static inline void set_page_links(struct
extern struct page *mem_map;
#endif

-static inline void *lowmem_page_address(struct page *page)
+static __always_inline void *lowmem_page_address(struct page *page)
{
return __va(page_to_pfn(page) << PAGE_SHIFT);
}
Index: linux-gcc.q/include/linux/slab.h
===================================================================
--- linux-gcc.q.orig/include/linux/slab.h
+++ linux-gcc.q/include/linux/slab.h
@@ -76,7 +76,7 @@ struct cache_sizes {
extern struct cache_sizes malloc_sizes[];
extern void *__kmalloc(size_t, gfp_t);

-static inline void *kmalloc(size_t size, gfp_t flags)
+static __always_inline void *kmalloc(size_t size, gfp_t flags)
{
if (__builtin_constant_p(size)) {
int i = 0;