kmalloc optimization

From: David Wragg (dpw@doc.ic.ac.uk)
Date: Sat Aug 26 2000 - 18:33:45 EST


Hi,

On Intel's P6 processors BSR, the instruction that finds the most
significant set bit in a word, is cheap (a couple of cycles). It can be
used in kmalloc() to pick the slab cache to allocate from, which is
quicker than the linear search through the cache_sizes array that is
currently used. There's a proof-of-concept patch below.
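Roughly, the size-to-cache mapping works like this (a stand-alone,
user-space sketch rather than kernel code; bsr32() and size_to_index()
are names made up for this illustration, while the patch does the same
kind of thing with ffzr() and the cache_sizes array):

/* Illustration only: map an allocation size to the index of the
 * smallest power-of-two general cache that can hold it.  The smallest
 * cache is 1 << MIN_SHIFT bytes (32, as in the 4K-page case). */
#include <stdio.h>

#define MIN_SHIFT 5

/* BSR: index of the most significant set bit (undefined for 0). */
static inline unsigned int bsr32(unsigned int word)
{
        __asm__("bsrl %1,%0" : "=r" (word) : "r" (word));
        return word;
}

static int size_to_index(unsigned int size)
{
        /* For any size in (2^k, 2^(k+1)], bsr32(size - 1) is k, so the
         * 2^(k+1)-byte cache sits at index k + 1 - MIN_SHIFT; the OR
         * clamps requests smaller than the smallest cache. */
        size = (size - 1) | ((1u << MIN_SHIFT) - 1);
        return bsr32(size) + 1 - MIN_SHIFT;
}

int main(void)
{
        unsigned int sizes[] = { 1, 32, 33, 64, 100, 4096, 131072 };
        int i;

        for (i = 0; i < 7; i++)
                printf("size %6u -> cache %6u\n", sizes[i],
                       1u << (size_to_index(sizes[i]) + MIN_SHIFT));
        return 0;
}

On a P6 that index calculation should compile down to a subtract, an
or, a bsrl and an add, with no data-dependent branch to mispredict.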

However, on the P5 and earlier Intel processors BSR is very slow, and
other architectures might not have such an instruction at all. A more
general optimization is still possible, though: replacing the linear
search with a binary search (see the sketch below).
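For the arch-independent case it could look something like this (again
just a rough, untested user-space sketch; the table and
find_cache_index() are made up for the illustration, the real thing
would walk the cache_sizes[] array):

/* Illustration only: binary search of the sorted table of general
 * cache sizes instead of the current linear scan. */
#include <stdio.h>

static const unsigned long cache_size_table[] = {
        32, 64, 128, 256, 512, 1024, 2048, 4096,
        8192, 16384, 32768, 65536, 131072
};
#define NR_SIZES (sizeof(cache_size_table) / sizeof(cache_size_table[0]))

/* Index of the smallest cache >= size, or -1 if the request is too big. */
static int find_cache_index(unsigned long size)
{
        int lo = 0, hi = NR_SIZES - 1;

        if (size > cache_size_table[hi])
                return -1;
        while (lo < hi) {
                int mid = (lo + hi) / 2;

                if (cache_size_table[mid] < size)
                        lo = mid + 1;   /* too small: search the upper half */
                else
                        hi = mid;       /* fits: a smaller cache might too */
        }
        return lo;
}

int main(void)
{
        unsigned long tests[] = { 1, 33, 64, 5000, 131072, 131073 };
        int i;

        for (i = 0; i < 6; i++)
                printf("size %6lu -> index %d\n", tests[i],
                       find_cache_index(tests[i]));
        return 0;
}

With thirteen sizes that is at most four comparisons per call instead
of up to thirteen.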

A proper patch including both optimizations would clutter up the
kmalloc() definition a bit, and although kmalloc() is important I doubt
the improvement would ever be measurable under real workloads, so I'm
looking for feedback on whether people think this is a good/bad idea.
I'd also like to know whether BSR is similarly cheap on the Athlon (and
on the P4; the Willamette information I have doesn't mention BSR,
though since the branch misprediction penalty there is horrible, this
optimization is almost certainly still valid).

David Wragg

diff -rua linux-2.4.0test7/include/asm-i386/bitops.h linux-2.4.0test7.mod/include/asm-i386/bitops.h
--- linux-2.4.0test7/include/asm-i386/bitops.h Sat Aug 26 00:18:14 2000
+++ linux-2.4.0test7.mod/include/asm-i386/bitops.h Sun Aug 27 00:07:15 2000
@@ -190,6 +190,17 @@
         return word;
 }
 
+/*
+ * ffzr = Find First Zero in word reverse, i.e. from the MSB down.
+ */
+extern __inline__ unsigned long ffzr(unsigned long word)
+{
+        __asm__("bsrl %1,%0"
+                :"=r" (word)
+                :"r" (~word));
+        return word;
+}
+
 #ifdef __KERNEL__
 
 /*
diff -rua linux-2.4.0test7/mm/slab.c linux-2.4.0test7.mod/mm/slab.c
--- linux-2.4.0test7/mm/slab.c Fri Aug 25 23:48:58 2000
+++ linux-2.4.0test7.mod/mm/slab.c Sun Aug 27 00:07:21 2000
@@ -324,24 +324,16 @@
         kmem_cache_t *cs_dmacachep;
 } cache_sizes_t;
 
-static cache_sizes_t cache_sizes[] = {
 #if PAGE_SIZE == 4096
-        { 32, NULL, NULL},
+#define CACHE_SIZE_SHIFT_MIN 5
+#else
+#define CACHE_SIZE_SHIFT_MIN 6
 #endif
-        { 64, NULL, NULL},
-        { 128, NULL, NULL},
-        { 256, NULL, NULL},
-        { 512, NULL, NULL},
-        { 1024, NULL, NULL},
-        { 2048, NULL, NULL},
-        { 4096, NULL, NULL},
-        { 8192, NULL, NULL},
-        { 16384, NULL, NULL},
-        { 32768, NULL, NULL},
-        { 65536, NULL, NULL},
-        {131072, NULL, NULL},
-        { 0, NULL, NULL}
-};
+
+#define CACHE_SIZE_SHIFT_MAX 17
+
+static cache_sizes_t cache_sizes[BITS_PER_LONG - CACHE_SIZE_SHIFT_MIN];
+
 
 /* internal cache of cache description objs */
 static kmem_cache_t cache_cache = {
@@ -424,7 +416,7 @@
  */
 void __init kmem_cache_sizes_init(void)
 {
-        cache_sizes_t *sizes = cache_sizes;
+        int i;
         char name[20];
         /*
          * Fragmentation resistance on low memory - only use bigger
@@ -432,13 +424,17 @@
          */
         if (num_physpages > (32 << 20) >> PAGE_SHIFT)
                 slab_break_gfp_order = BREAK_GFP_ORDER_HI;
-        do {
+
+        for (i = 0; i <= CACHE_SIZE_SHIFT_MAX - CACHE_SIZE_SHIFT_MIN; i++) {
+                cache_sizes_t *sizes = cache_sizes + i;
+
                 /* For performance, all the general caches are L1 aligned.
                  * This should be particularly beneficial on SMP boxes, as it
                  * eliminates "false sharing".
                  * Note for systems short on memory removing the alignment will
                  * allow tighter packing of the smaller caches. */
-                sprintf(name,"size-%Zd",sizes->cs_size);
+                sizes->cs_size = 1 << (i + CACHE_SIZE_SHIFT_MIN);
+                sprintf(name,"size-%Zd", sizes->cs_size);
                 if (!(sizes->cs_cachep =
                         kmem_cache_create(name, sizes->cs_size,
                                         0, SLAB_HWCACHE_ALIGN, NULL, NULL))) {
@@ -455,8 +451,7 @@
                               SLAB_CACHE_DMA|SLAB_HWCACHE_ALIGN, NULL, NULL);
                 if (!sizes->cs_dmacachep)
                         BUG();
-                sizes++;
-        } while (sizes->cs_size);
+        }
 }
 
 int __init kmem_cpucache_init(void)
@@ -1520,14 +1515,15 @@
  */
 void * kmalloc (size_t size, int flags)
 {
-        cache_sizes_t *csizep = cache_sizes;
+        cache_sizes_t *csizep;
+
+        size = (size - 1) | ((1 << CACHE_SIZE_SHIFT_MIN) - 1);
+        csizep = cache_sizes + ffzr(~size) + 1 - CACHE_SIZE_SHIFT_MIN;
 
-        for (; csizep->cs_size; csizep++) {
-                if (size > csizep->cs_size)
-                        continue;
+        if (csizep->cs_size)
                 return __kmem_cache_alloc(flags & GFP_DMA ?
                          csizep->cs_dmacachep : csizep->cs_cachep, flags);
-        }
+
         BUG(); // too big size
         return NULL;
 }
@@ -1577,17 +1573,15 @@
 
 kmem_cache_t * kmem_find_general_cachep (size_t size, int gfpflags)
 {
-        cache_sizes_t *csizep = cache_sizes;
-
         /* This function could be moved to the header file, and
          * made inline so consumers can quickly determine what
          * cache pointer they require.
          */
-        for ( ; csizep->cs_size; csizep++) {
-                if (size > csizep->cs_size)
-                        continue;
-                break;
-        }
+        cache_sizes_t *csizep;
+
+        size = (size - 1) | ((1 << CACHE_SIZE_SHIFT_MIN) - 1);
+        csizep = cache_sizes + ffzr(~size) + 1 - CACHE_SIZE_SHIFT_MIN;
+
         return (gfpflags & GFP_DMA) ? csizep->cs_dmacachep : csizep->cs_cachep;
 }
 
