[PATCH] SSE memcpy in C

From: Borislav Petkov
Date: Thu Aug 11 2011 - 12:43:08 EST


Signed-off-by: Borislav Petkov <borislav.petkov@xxxxxxx>
---
arch/x86/include/asm/string_64.h | 14 ++++-
arch/x86/lib/Makefile | 2 +-
arch/x86/lib/sse_memcpy_64.c | 133 ++++++++++++++++++++++++++++++++++++++
3 files changed, 146 insertions(+), 3 deletions(-)
create mode 100644 arch/x86/lib/sse_memcpy_64.c

diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h
index 19e2c46..7bd51bb 100644
--- a/arch/x86/include/asm/string_64.h
+++ b/arch/x86/include/asm/string_64.h
@@ -28,10 +28,20 @@ static __always_inline void *__inline_memcpy(void *to, const void *from, size_t

#define __HAVE_ARCH_MEMCPY 1
#ifndef CONFIG_KMEMCHECK
+extern void *__memcpy(void *to, const void *from, size_t len);
+extern void *__sse_memcpy(void *to, const void *from, size_t len);
#if (__GNUC__ == 4 && __GNUC_MINOR__ >= 3) || __GNUC__ > 4
-extern void *memcpy(void *to, const void *from, size_t len);
+#define memcpy(dst, src, len) \
+({ \
+ size_t __len = (len); \
+ void *__ret; \
+ if (__len >= 512) \
+ __ret = __sse_memcpy((dst), (src), __len); \
+ else \
+ __ret = __memcpy((dst), (src), __len); \
+ __ret; \
+})
#else
-extern void *__memcpy(void *to, const void *from, size_t len);
#define memcpy(dst, src, len) \
({ \
size_t __len = (len); \
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index f2479f1..5f90709 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -36,7 +36,7 @@ ifneq ($(CONFIG_X86_CMPXCHG64),y)
endif
lib-$(CONFIG_X86_USE_3DNOW) += mmx_32.o
else
- obj-y += iomap_copy_64.o
+ obj-y += iomap_copy_64.o sse_memcpy_64.o
lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o
lib-y += thunk_64.o clear_page_64.o copy_page_64.o
lib-y += memmove_64.o memset_64.o
diff --git a/arch/x86/lib/sse_memcpy_64.c b/arch/x86/lib/sse_memcpy_64.c
new file mode 100644
index 0000000..b53fc31
--- /dev/null
+++ b/arch/x86/lib/sse_memcpy_64.c
@@ -0,0 +1,133 @@
+#include <linux/module.h>
+
+#include <asm/i387.h>
+#include <asm/string_64.h>
+
+/*
+ * Copy @len bytes from @from to @to in 256-byte blocks staged through
+ * xmm0-xmm15 and return the original @to.  Falls back to __memcpy() when
+ * in_interrupt() is true, while system_state is not SYSTEM_RUNNING, and
+ * when @len is below one block: the block loops would have nothing to do
+ * and the head fixup below must never copy more than @len bytes.
+ */
+void *__sse_memcpy(void *to, const void *from, size_t len)
+{
+	unsigned long src = (unsigned long)from;
+	unsigned long dst = (unsigned long)to;
+	void *p = to;
+	size_t i;
+
+	/* bail out before kernel_fpu_begin() in every fallback case */
+	if (len < 256 || in_interrupt() || system_state != SYSTEM_RUNNING)
+		return __memcpy(to, from, len);
+
+	kernel_fpu_begin();
+
+	/* movaps is usable only if src and dst share the offset within 16 bytes */
+	if ((src ^ dst) & 0xf)
+		goto unaligned;
+
+	if (src & 0xf) {
+		u8 chunk = 0x10 - (src & 0xf);
+
+		/* copy the head up to the next 16-byte boundary (chunk < len here) */
+		__memcpy(to, from, chunk);
+		len -= chunk;
+		to += chunk;
+		from += chunk;
+	}
+
+	/* aligned copy: one 256-byte block per iteration */
+	for (i = 0; i < (len & ~0xff); i += 256) {
+		asm volatile(
+		"movaps 0x0(%0), %%xmm0\n\t"
+		"movaps 0x10(%0), %%xmm1\n\t"
+		"movaps 0x20(%0), %%xmm2\n\t"
+		"movaps 0x30(%0), %%xmm3\n\t"
+		"movaps 0x40(%0), %%xmm4\n\t"
+		"movaps 0x50(%0), %%xmm5\n\t"
+		"movaps 0x60(%0), %%xmm6\n\t"
+		"movaps 0x70(%0), %%xmm7\n\t"
+		"movaps 0x80(%0), %%xmm8\n\t"
+		"movaps 0x90(%0), %%xmm9\n\t"
+		"movaps 0xa0(%0), %%xmm10\n\t"
+		"movaps 0xb0(%0), %%xmm11\n\t"
+		"movaps 0xc0(%0), %%xmm12\n\t"
+		"movaps 0xd0(%0), %%xmm13\n\t"
+		"movaps 0xe0(%0), %%xmm14\n\t"
+		"movaps 0xf0(%0), %%xmm15\n\t"
+		"movaps %%xmm0, 0x0(%1)\n\t"
+		"movaps %%xmm1, 0x10(%1)\n\t"
+		"movaps %%xmm2, 0x20(%1)\n\t"
+		"movaps %%xmm3, 0x30(%1)\n\t"
+		"movaps %%xmm4, 0x40(%1)\n\t"
+		"movaps %%xmm5, 0x50(%1)\n\t"
+		"movaps %%xmm6, 0x60(%1)\n\t"
+		"movaps %%xmm7, 0x70(%1)\n\t"
+		"movaps %%xmm8, 0x80(%1)\n\t"
+		"movaps %%xmm9, 0x90(%1)\n\t"
+		"movaps %%xmm10, 0xa0(%1)\n\t"
+		"movaps %%xmm11, 0xb0(%1)\n\t"
+		"movaps %%xmm12, 0xc0(%1)\n\t"
+		"movaps %%xmm13, 0xd0(%1)\n\t"
+		"movaps %%xmm14, 0xe0(%1)\n\t"
+		"movaps %%xmm15, 0xf0(%1)\n\t"
+		: : "r" (from), "r" (to) : "memory");
+
+		from += 256;
+		to += 256;
+	}
+
+	goto trailer;
+
+unaligned:
+	/* same block loop with movups: src and dst offsets within 16 bytes differ */
+	for (i = 0; i < (len & ~0xff); i += 256) {
+		asm volatile(
+		"movups 0x0(%0), %%xmm0\n\t"
+		"movups 0x10(%0), %%xmm1\n\t"
+		"movups 0x20(%0), %%xmm2\n\t"
+		"movups 0x30(%0), %%xmm3\n\t"
+		"movups 0x40(%0), %%xmm4\n\t"
+		"movups 0x50(%0), %%xmm5\n\t"
+		"movups 0x60(%0), %%xmm6\n\t"
+		"movups 0x70(%0), %%xmm7\n\t"
+		"movups 0x80(%0), %%xmm8\n\t"
+		"movups 0x90(%0), %%xmm9\n\t"
+		"movups 0xa0(%0), %%xmm10\n\t"
+		"movups 0xb0(%0), %%xmm11\n\t"
+		"movups 0xc0(%0), %%xmm12\n\t"
+		"movups 0xd0(%0), %%xmm13\n\t"
+		"movups 0xe0(%0), %%xmm14\n\t"
+		"movups 0xf0(%0), %%xmm15\n\t"
+		"movups %%xmm0, 0x0(%1)\n\t"
+		"movups %%xmm1, 0x10(%1)\n\t"
+		"movups %%xmm2, 0x20(%1)\n\t"
+		"movups %%xmm3, 0x30(%1)\n\t"
+		"movups %%xmm4, 0x40(%1)\n\t"
+		"movups %%xmm5, 0x50(%1)\n\t"
+		"movups %%xmm6, 0x60(%1)\n\t"
+		"movups %%xmm7, 0x70(%1)\n\t"
+		"movups %%xmm8, 0x80(%1)\n\t"
+		"movups %%xmm9, 0x90(%1)\n\t"
+		"movups %%xmm10, 0xa0(%1)\n\t"
+		"movups %%xmm11, 0xb0(%1)\n\t"
+		"movups %%xmm12, 0xc0(%1)\n\t"
+		"movups %%xmm13, 0xd0(%1)\n\t"
+		"movups %%xmm14, 0xe0(%1)\n\t"
+		"movups %%xmm15, 0xf0(%1)\n\t"
+		: : "r" (from), "r" (to) : "memory");
+
+		from += 256;
+		to += 256;
+	}
+
+trailer:
+	/* remainder after the last full block, always fewer than 256 bytes */
+	__memcpy(to, from, len & 0xff);
+
+	kernel_fpu_end();
+
+	return p;
+}
+EXPORT_SYMBOL_GPL(__sse_memcpy);
--
1.7.6.134.gcf13f6


--
Regards/Gruss,
Boris.

--fdj2RfSjLxBAspz7
Content-Type: text/plain; charset=utf-8
Content-Disposition: attachment; filename="kernel_build.sizes"

Bytes Count
===== =====
0 5447
1 3850
2 16255
3 11113
4 68870
5 4256
6 30433
7 19188
8 50490
9 5999
10 78275
11 5628
12 6870
13 7371
14 4742
15 4911
16 143835
17 14096
18 1573
19 13603
20 424321
21 741
22 584
23 450
24 472
25 685
26 367
27 365
28 333
29 301
30 300
31 269
32 489
33 272
34 266
35 220
36 239
37 209
38 249
39 235
40 207
41 181
42 150
43 98
44 194
45 66
46 62
47 52
48 67226
49 138
50 171
51 26
52 20
53 12
54 15
55 4
56 13
57 8
58 6
59 6
60 115
61 10
62 5
63 12
64 67353
65 6
66 2363
67 9
68 11
69 6
70 5
71 6
72 10
73 4
74 9
75 8
76 4
77 6
78 3
79 4
80 3
81 4
82 4
83 4
84 4
85 8
86 6
87 2
88 3
89 2
90 2
91 1
92 9
93 1
94 2
96 2
97 2
98 3
100 2
102 1
104 1
105 1
106 1
107 2
109 1
110 1
111 1
112 1
113 2
115 2
117 1
118 1
119 1
120 14
127 1
128 1
130 1
131 2
134 2
137 1
144 100092
149 1
151 1
153 1
158 1
185 1
217 4
224 3
225 3
227 3
244 1
254 5
255 13
256 21708
512 21746
848 12907
1920 36536
2048 21708

--fdj2RfSjLxBAspz7--
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majordomo@xxxxxxxxxxxxxxx
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/