[PATCH] enforce function inlining for hot functions

From: Hagen Paul Pfeifer
Date: Thu Apr 23 2015 - 17:40:30 EST


GCC's inlining heuristics are sometimes puzzling, and GCC seems to have
particular trouble with inline assembler constructs. An allyesconfig build
shows a rather long list of functions where GCC's inlining decisions are
questionable (not inlined). Furthermore, because the functions are declared
with static linkage, each function is copied n times - and n can be rather
high:

atomic_inc: 544 duplicates
rcu_read_unlock: 453 duplicates
rcu_read_lock: 383 duplicates
get_dma_ops: 271 duplicates
arch_local_irq_restore: 258 duplicates
atomic_dec: 215 duplicates
kzalloc: 185 duplicates
cpumask_check: 157 duplicates
test_and_set_bit: 156 duplicates
cpumask_next: 146 duplicates
list_del: 131 duplicates
kref_get: 126 duplicates

This patch enforces inlining for the hottest functions, where "hottest" is a
subjectively biased list of functions for which a) inlining is a performance
must, and/or b) the number of callers is quite high, and/or c) the function
call overhead is too high for a two- or three-instruction inline assembler
function (usually all three). E.g.:

static inline void kref_get(struct kref *kref)
{
WARN_ON_ONCE(atomic_inc_return(&kref->refcount) < 2);
}

For the majority of inline functions GCC is able to make proper inlining
decisions. Let's pray that the inlining heuristics in future versions of GCC
are improved further, so that we can even revert this patch - but until then
this patch should remain useful for a while.

Cc: Ingo Molnar <mingo@xxxxxxxxxx>
Cc: "H. Peter Anvin" <hpa@xxxxxxxxx>
Cc: "David S. Miller" <davem@xxxxxxxxxxxxx>
Cc: "Paul E. McKenney" <paulmck@xxxxxxxxxxxxxxxxxx>
Cc: x86@xxxxxxxxxx
Link: http://lkml.kernel.org/r/1429565231-4609-1-git-send-email-hagen@xxxxxxxx
Signed-off-by: Hagen Paul Pfeifer <hagen@xxxxxxxx>
---

net_generic() is a borderline candidate, probably too heavy for inlining.
Possibly a candidate for removing the static linkage and inline attributes?!
I have not changed that yet - but I can do so too.

arch/x86/include/asm/bitops.h | 4 ++--
arch/x86/include/asm/dma-mapping.h | 4 ++--
arch/x86/include/asm/irqflags.h | 8 ++++----
arch/x86/include/asm/paravirt.h | 10 +++++-----
arch/x86/include/asm/uaccess.h | 4 ++--
drivers/net/wireless/rtlwifi/wifi.h | 2 +-
include/linux/buffer_head.h | 2 +-
include/linux/clk.h | 2 +-
include/linux/completion.h | 2 +-
include/linux/cpumask.h | 6 +++---
include/linux/interrupt.h | 2 +-
include/linux/kref.h | 2 +-
include/linux/list.h | 4 ++--
include/linux/netdevice.h | 6 +++---
include/linux/rcupdate.h | 6 +++---
include/linux/skbuff.h | 4 ++--
include/linux/slab.h | 2 +-
include/linux/workqueue.h | 10 +++++-----
include/net/netlink.h | 2 +-
include/net/netns/generic.h | 2 +-
include/net/sock.h | 2 +-
21 files changed, 43 insertions(+), 43 deletions(-)

diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index cfe3b95..39ccf6c 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -201,7 +201,7 @@ static inline void change_bit(long nr, volatile unsigned long *addr)
* This operation is atomic and cannot be reordered.
* It also implies a memory barrier.
*/
-static inline int test_and_set_bit(long nr, volatile unsigned long *addr)
+static __always_inline int test_and_set_bit(long nr, volatile unsigned long *addr)
{
GEN_BINARY_RMWcc(LOCK_PREFIX "bts", *addr, "Ir", nr, "%0", "c");
}
@@ -247,7 +247,7 @@ static inline int __test_and_set_bit(long nr, volatile unsigned long *addr)
* This operation is atomic and cannot be reordered.
* It also implies a memory barrier.
*/
-static inline int test_and_clear_bit(long nr, volatile unsigned long *addr)
+static __always_inline int test_and_clear_bit(long nr, volatile unsigned long *addr)
{
GEN_BINARY_RMWcc(LOCK_PREFIX "btr", *addr, "Ir", nr, "%0", "c");
}
diff --git a/arch/x86/include/asm/dma-mapping.h b/arch/x86/include/asm/dma-mapping.h
index 808dae6..55097bc 100644
--- a/arch/x86/include/asm/dma-mapping.h
+++ b/arch/x86/include/asm/dma-mapping.h
@@ -29,7 +29,7 @@ extern int panic_on_overflow;

extern struct dma_map_ops *dma_ops;

-static inline struct dma_map_ops *get_dma_ops(struct device *dev)
+static __always_inline struct dma_map_ops *get_dma_ops(struct device *dev)
{
#ifndef CONFIG_X86_DEV_DMA_OPS
return dma_ops;
@@ -44,7 +44,7 @@ static inline struct dma_map_ops *get_dma_ops(struct device *dev)
#include <asm-generic/dma-mapping-common.h>

/* Make sure we keep the same behaviour */
-static inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
+static __always_inline int dma_mapping_error(struct device *dev, dma_addr_t dma_addr)
{
struct dma_map_ops *ops = get_dma_ops(dev);
debug_dma_mapping_error(dev, dma_addr);
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index b77f5ed..c3842bc 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -62,22 +62,22 @@ static inline void native_halt(void)
#ifndef __ASSEMBLY__
#include <linux/types.h>

-static inline notrace unsigned long arch_local_save_flags(void)
+static __always_inline notrace unsigned long arch_local_save_flags(void)
{
return native_save_fl();
}

-static inline notrace void arch_local_irq_restore(unsigned long flags)
+static __always_inline notrace void arch_local_irq_restore(unsigned long flags)
{
native_restore_fl(flags);
}

-static inline notrace void arch_local_irq_disable(void)
+static __always_inline notrace void arch_local_irq_disable(void)
{
native_irq_disable();
}

-static inline notrace void arch_local_irq_enable(void)
+static __always_inline notrace void arch_local_irq_enable(void)
{
native_irq_enable();
}
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 8957810..2d139e1 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -799,27 +799,27 @@ static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock,
#define __PV_IS_CALLEE_SAVE(func) \
((struct paravirt_callee_save) { func })

-static inline notrace unsigned long arch_local_save_flags(void)
+static __always_inline notrace unsigned long arch_local_save_flags(void)
{
return PVOP_CALLEE0(unsigned long, pv_irq_ops.save_fl);
}

-static inline notrace void arch_local_irq_restore(unsigned long f)
+static __always_inline notrace void arch_local_irq_restore(unsigned long f)
{
PVOP_VCALLEE1(pv_irq_ops.restore_fl, f);
}

-static inline notrace void arch_local_irq_disable(void)
+static __always_inline notrace void arch_local_irq_disable(void)
{
PVOP_VCALLEE0(pv_irq_ops.irq_disable);
}

-static inline notrace void arch_local_irq_enable(void)
+static __always_inline notrace void arch_local_irq_enable(void)
{
PVOP_VCALLEE0(pv_irq_ops.irq_enable);
}

-static inline notrace unsigned long arch_local_irq_save(void)
+static __always_inline notrace unsigned long arch_local_irq_save(void)
{
unsigned long f;

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index ace9dec..a34dff0 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -684,7 +684,7 @@ __copy_from_user_overflow(int size, unsigned long count)

#endif

-static inline unsigned long __must_check
+static __always_inline unsigned long __must_check
copy_from_user(void *to, const void __user *from, unsigned long n)
{
int sz = __compiletime_object_size(to);
@@ -719,7 +719,7 @@ copy_from_user(void *to, const void __user *from, unsigned long n)
return n;
}

-static inline unsigned long __must_check
+static __always_inline unsigned long __must_check
copy_to_user(void __user *to, const void *from, unsigned long n)
{
int sz = __compiletime_object_size(from);
diff --git a/drivers/net/wireless/rtlwifi/wifi.h b/drivers/net/wireless/rtlwifi/wifi.h
index 5157291..96d568c 100644
--- a/drivers/net/wireless/rtlwifi/wifi.h
+++ b/drivers/net/wireless/rtlwifi/wifi.h
@@ -2900,7 +2900,7 @@ static inline u32 rtl_read_dword(struct rtl_priv *rtlpriv, u32 addr)
return rtlpriv->io.read32_sync(rtlpriv, addr);
}

-static inline void rtl_write_byte(struct rtl_priv *rtlpriv, u32 addr, u8 val8)
+static __always_inline void rtl_write_byte(struct rtl_priv *rtlpriv, u32 addr, u8 val8)
{
rtlpriv->io.write8_async(rtlpriv, addr, val8);

diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h
index 73b4522..d3942a7 100644
--- a/include/linux/buffer_head.h
+++ b/include/linux/buffer_head.h
@@ -281,7 +281,7 @@ static inline void put_bh(struct buffer_head *bh)
atomic_dec(&bh->b_count);
}

-static inline void brelse(struct buffer_head *bh)
+static __always_inline void brelse(struct buffer_head *bh)
{
if (bh)
__brelse(bh);
diff --git a/include/linux/clk.h b/include/linux/clk.h
index 68c16a6..7fb8314 100644
--- a/include/linux/clk.h
+++ b/include/linux/clk.h
@@ -450,7 +450,7 @@ static inline struct clk *clk_get_parent(struct clk *clk)
#endif

/* clk_prepare_enable helps cases using clk_enable in non-atomic context. */
-static inline int clk_prepare_enable(struct clk *clk)
+static __always_inline int clk_prepare_enable(struct clk *clk)
{
int ret;

diff --git a/include/linux/completion.h b/include/linux/completion.h
index 5d5aaae..06163a6 100644
--- a/include/linux/completion.h
+++ b/include/linux/completion.h
@@ -70,7 +70,7 @@ struct completion {
* This inline function will initialize a dynamically created completion
* structure.
*/
-static inline void init_completion(struct completion *x)
+static __always_inline void init_completion(struct completion *x)
{
x->done = 0;
init_waitqueue_head(&x->wait);
diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index 27e285b..912dede 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -111,7 +111,7 @@ extern const struct cpumask *const cpu_active_mask;
#endif

/* verify cpu argument to cpumask_* operators */
-static inline unsigned int cpumask_check(unsigned int cpu)
+static __always_inline unsigned int cpumask_check(unsigned int cpu)
{
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
WARN_ON_ONCE(cpu >= nr_cpumask_bits);
@@ -183,7 +183,7 @@ static inline unsigned int cpumask_first(const struct cpumask *srcp)
*
* Returns >= nr_cpu_ids if no further cpus set.
*/
-static inline unsigned int cpumask_next(int n, const struct cpumask *srcp)
+static __always_inline unsigned int cpumask_next(int n, const struct cpumask *srcp)
{
/* -1 is a legal arg here. */
if (n != -1)
@@ -473,7 +473,7 @@ static inline bool cpumask_full(const struct cpumask *srcp)
* cpumask_weight - Count of bits in *srcp
* @srcp: the cpumask to count bits (< nr_cpu_ids) in.
*/
-static inline unsigned int cpumask_weight(const struct cpumask *srcp)
+static __always_inline unsigned int cpumask_weight(const struct cpumask *srcp)
{
return bitmap_weight(cpumask_bits(srcp), nr_cpumask_bits);
}
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 950ae45..ef2279a 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -527,7 +527,7 @@ static inline void tasklet_unlock_wait(struct tasklet_struct *t)

extern void __tasklet_schedule(struct tasklet_struct *t);

-static inline void tasklet_schedule(struct tasklet_struct *t)
+static __always_inline void tasklet_schedule(struct tasklet_struct *t)
{
if (!test_and_set_bit(TASKLET_STATE_SCHED, &t->state))
__tasklet_schedule(t);
diff --git a/include/linux/kref.h b/include/linux/kref.h
index 484604d..0a91a21 100644
--- a/include/linux/kref.h
+++ b/include/linux/kref.h
@@ -38,7 +38,7 @@ static inline void kref_init(struct kref *kref)
* kref_get - increment refcount for object.
* @kref: object.
*/
-static inline void kref_get(struct kref *kref)
+static __always_inline void kref_get(struct kref *kref)
{
/* If refcount was 0 before incrementing then we have a race
* condition when this kref is freeing by some other thread right now.
diff --git a/include/linux/list.h b/include/linux/list.h
index feb773c..85d3a1e 100644
--- a/include/linux/list.h
+++ b/include/linux/list.h
@@ -102,7 +102,7 @@ static inline void __list_del_entry(struct list_head *entry)
__list_del(entry->prev, entry->next);
}

-static inline void list_del(struct list_head *entry)
+static __always_inline void list_del(struct list_head *entry)
{
__list_del(entry->prev, entry->next);
entry->next = LIST_POISON1;
@@ -140,7 +140,7 @@ static inline void list_replace_init(struct list_head *old,
* list_del_init - deletes entry from list and reinitialize it.
* @entry: the element to delete from the list.
*/
-static inline void list_del_init(struct list_head *entry)
+static __always_inline void list_del_init(struct list_head *entry)
{
__list_del_entry(entry);
INIT_LIST_HEAD(entry);
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index bcbde79..87d83b4 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -2547,7 +2547,7 @@ static inline void netif_tx_wake_all_queues(struct net_device *dev)
}
}

-static inline void netif_tx_stop_queue(struct netdev_queue *dev_queue)
+static __always_inline void netif_tx_stop_queue(struct netdev_queue *dev_queue)
{
if (WARN_ON(!dev_queue)) {
pr_info("netif_stop_queue() cannot be called before register_netdev()\n");
@@ -2563,12 +2563,12 @@ static inline void netif_tx_stop_queue(struct netdev_queue *dev_queue)
* Stop upper layers calling the device hard_start_xmit routine.
* Used for flow control when transmit resources are unavailable.
*/
-static inline void netif_stop_queue(struct net_device *dev)
+static __always_inline void netif_stop_queue(struct net_device *dev)
{
netif_tx_stop_queue(netdev_get_tx_queue(dev, 0));
}

-static inline void netif_tx_stop_all_queues(struct net_device *dev)
+static __always_inline void netif_tx_stop_all_queues(struct net_device *dev)
{
unsigned int i;

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 573a5af..1acd8ef 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -446,7 +446,7 @@ static inline bool rcu_lockdep_current_cpu_online(void)

#ifdef CONFIG_DEBUG_LOCK_ALLOC

-static inline void rcu_lock_acquire(struct lockdep_map *map)
+static __always_inline void rcu_lock_acquire(struct lockdep_map *map)
{
lock_acquire(map, 0, 0, 2, 0, NULL, _THIS_IP_);
}
@@ -907,7 +907,7 @@ static inline void rcu_preempt_sleep_check(void)
* read-side critical sections may be preempted and they may also block, but
* only when acquiring spinlocks that are subject to priority inheritance.
*/
-static inline void rcu_read_lock(void)
+static __always_inline void rcu_read_lock(void)
{
__rcu_read_lock();
__acquire(RCU);
@@ -961,7 +961,7 @@ static inline void rcu_read_lock(void)
*
* See rcu_read_lock() for more information.
*/
-static inline void rcu_read_unlock(void)
+static __always_inline void rcu_read_unlock(void)
{
rcu_lockdep_assert(rcu_is_watching(),
"rcu_read_unlock() used illegally while idle");
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 06793b5..0f98aa5 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -1328,7 +1328,7 @@ static inline void __skb_queue_head_init(struct sk_buff_head *list)
* network layer or drivers should need annotation to consolidate the
* main types of usage into 3 classes.
*/
-static inline void skb_queue_head_init(struct sk_buff_head *list)
+static __always_inline void skb_queue_head_init(struct sk_buff_head *list)
{
spin_lock_init(&list->lock);
__skb_queue_head_init(list);
@@ -1722,7 +1722,7 @@ static inline unsigned char *pskb_pull(struct sk_buff *skb, unsigned int len)
return unlikely(len > skb->len) ? NULL : __pskb_pull(skb, len);
}

-static inline int pskb_may_pull(struct sk_buff *skb, unsigned int len)
+static __always_inline int pskb_may_pull(struct sk_buff *skb, unsigned int len)
{
if (likely(len <= skb_headlen(skb)))
return 1;
diff --git a/include/linux/slab.h b/include/linux/slab.h
index ffd24c8..06af7bc 100644
--- a/include/linux/slab.h
+++ b/include/linux/slab.h
@@ -578,7 +578,7 @@ static inline void *kmem_cache_zalloc(struct kmem_cache *k, gfp_t flags)
* @size: how many bytes of memory are required.
* @flags: the type of memory to allocate (see kmalloc).
*/
-static inline void *kzalloc(size_t size, gfp_t flags)
+static __always_inline void *kzalloc(size_t size, gfp_t flags)
{
return kmalloc(size, flags | __GFP_ZERO);
}
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index deee212..9f902c9 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -466,8 +466,8 @@ extern void show_workqueue_state(void);
* We queue the work to the CPU on which it was submitted, but if the CPU dies
* it can be processed by another CPU.
*/
-static inline bool queue_work(struct workqueue_struct *wq,
- struct work_struct *work)
+static __always_inline bool queue_work(struct workqueue_struct *wq,
+ struct work_struct *work)
{
return queue_work_on(WORK_CPU_UNBOUND, wq, work);
}
@@ -525,7 +525,7 @@ static inline bool schedule_work_on(int cpu, struct work_struct *work)
* queued and leaves it in the same position on the kernel-global
* workqueue otherwise.
*/
-static inline bool schedule_work(struct work_struct *work)
+static __always_inline bool schedule_work(struct work_struct *work)
{
return queue_work(system_wq, work);
}
@@ -553,8 +553,8 @@ static inline bool schedule_delayed_work_on(int cpu, struct delayed_work *dwork,
* After waiting for a given time this puts a job in the kernel-global
* workqueue.
*/
-static inline bool schedule_delayed_work(struct delayed_work *dwork,
- unsigned long delay)
+static __always_inline bool schedule_delayed_work(struct delayed_work *dwork,
+ unsigned long delay)
{
return queue_delayed_work(system_wq, dwork, delay);
}
diff --git a/include/net/netlink.h b/include/net/netlink.h
index 2a5dbcc..9c5ea29 100644
--- a/include/net/netlink.h
+++ b/include/net/netlink.h
@@ -517,7 +517,7 @@ static inline void *nlmsg_get_pos(struct sk_buff *skb)
*
* Trims the message to the provided mark.
*/
-static inline void nlmsg_trim(struct sk_buff *skb, const void *mark)
+static __always_inline void nlmsg_trim(struct sk_buff *skb, const void *mark)
{
if (mark) {
WARN_ON((unsigned char *) mark < skb->data);
diff --git a/include/net/netns/generic.h b/include/net/netns/generic.h
index 70e1585..b529fa3 100644
--- a/include/net/netns/generic.h
+++ b/include/net/netns/generic.h
@@ -31,7 +31,7 @@ struct net_generic {
void *ptr[0];
};

-static inline void *net_generic(const struct net *net, int id)
+static __always_inline void *net_generic(const struct net *net, int id)
{
struct net_generic *ng;
void *ptr;
diff --git a/include/net/sock.h b/include/net/sock.h
index 3a4898e..92e12fb 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -1619,7 +1619,7 @@ void sock_init_data(struct socket *sock, struct sock *sk);
*/

/* Ungrab socket and destroy it, if it was the last reference. */
-static inline void sock_put(struct sock *sk)
+static __always_inline void sock_put(struct sock *sk)
{
if (atomic_dec_and_test(&sk->sk_refcnt))
sk_free(sk);
--
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/