Patch to improve Pentium pipelining in csum_partial_copy()

Tom May (ftom@netcom.com)
Fri, 13 Sep 1996 17:47:30 -0700


A while back someone mentioned that the code in csum_partial_copy() in
arch/i386/lib/checksum.c doesn't use the pipelines optimally on the
Pentium (it looks like it was derived from csum_partial_copy_fromuser()
which does pipeline ok but which is hindered by the segment overrides).

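They were correct, but since then nobody has stepped up to make the
patch. So here it is. This patch also deletes an unnecessary `test'
instruction in each of the checksum functions (it was only there to
clear the carry flag, which the preceding `shrl $2' already does
because the low two bits have just been masked off), and standardizes
on `$0x1c' instead of `$28' as an AND mask.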

In out-of-kernel testing, this patch has no effect on performance on
my 486/66, and speeds up my Pentium system by 0% to 25% depending on
luck with the cache.

I have been running with this patch for at least a month with SLIP and
Ethernet connections.

Tom.

--- linux/arch/i386/lib/checksum.c.0 Wed Sep 27 00:57:18 1995
+++ linux/arch/i386/lib/checksum.c Fri Sep 13 08:56:14 1996
@@ -70,8 +70,7 @@
2: movl %%edx, %%ecx
andl $0x1c, %%edx
je 4f
- shrl $2, %%edx
- testl %%esi, %%esi
+ shrl $2, %%edx # This clears CF
3: adcl (%%esi), %%eax
lea 4(%%esi), %%esi
dec %%edx
@@ -159,10 +158,9 @@
jne 1b
adcl $0, %%eax
2: movl %%edx, %%ecx
- andl $28, %%edx
+ andl $0x1c, %%edx
je 4f
- shrl $2, %%edx
- testl %%esi, %%esi
+ shrl $2, %%edx # This clears CF
3: movl %%fs:(%%esi), %%ebx
adcl %%ebx, %%eax
movl %%ebx, (%%edi)
@@ -212,52 +210,48 @@
addw %%bx, %%ax
adcl $0, %%eax
2:
- movl %%ecx, %%edx
+ pushl %%ecx
shrl $5, %%ecx
jz 2f
testl %%esi, %%esi
1: movl (%%esi), %%ebx
+ movl 4(%%esi), %%edx
adcl %%ebx, %%eax
movl %%ebx, (%%edi)
-
- movl 4(%%esi), %%ebx
- adcl %%ebx, %%eax
- movl %%ebx, 4(%%edi)
+ adcl %%edx, %%eax
+ movl %%edx, 4(%%edi)

movl 8(%%esi), %%ebx
+ movl 12(%%esi), %%edx
adcl %%ebx, %%eax
movl %%ebx, 8(%%edi)
-
- movl 12(%%esi), %%ebx
- adcl %%ebx, %%eax
- movl %%ebx, 12(%%edi)
+ adcl %%edx, %%eax
+ movl %%edx, 12(%%edi)

movl 16(%%esi), %%ebx
+ movl 20(%%esi), %%edx
adcl %%ebx, %%eax
movl %%ebx, 16(%%edi)
-
- movl 20(%%esi), %%ebx
- adcl %%ebx, %%eax
- movl %%ebx, 20(%%edi)
+ adcl %%edx, %%eax
+ movl %%edx, 20(%%edi)

movl 24(%%esi), %%ebx
+ movl 28(%%esi), %%edx
adcl %%ebx, %%eax
movl %%ebx, 24(%%edi)
-
- movl 28(%%esi), %%ebx
- adcl %%ebx, %%eax
- movl %%ebx, 28(%%edi)
+ adcl %%edx, %%eax
+ movl %%edx, 28(%%edi)

lea 32(%%esi), %%esi
lea 32(%%edi), %%edi
dec %%ecx
jne 1b
adcl $0, %%eax
-2: movl %%edx, %%ecx
- andl $28, %%edx
+2: popl %%edx
+ movl %%edx, %%ecx
+ andl $0x1c, %%edx
je 4f
- shrl $2, %%edx
- testl %%esi, %%esi
+ shrl $2, %%edx # This clears CF
3: movl (%%esi), %%ebx
adcl %%ebx, %%eax
movl %%ebx, (%%edi)

That's all.