diff options
author | Ondřej Bílka <neleai@seznam.cz> | 2013-10-08 15:46:48 +0200 |
---|---|---|
committer | Ondřej Bílka <neleai@seznam.cz> | 2013-10-08 15:46:48 +0200 |
commit | e7044ea76bd95f8adc0eab0b2bdcab7f51055b48 (patch) | |
tree | 262f397226e64df368b266a681622e7e25c30e5a /sysdeps/x86_64 | |
parent | 41500766f71fd072b6b6a9e4603fb7f85bddcfe2 (diff) | |
download | glibc-e7044ea76bd95f8adc0eab0b2bdcab7f51055b48.tar.gz glibc-e7044ea76bd95f8adc0eab0b2bdcab7f51055b48.tar.xz glibc-e7044ea76bd95f8adc0eab0b2bdcab7f51055b48.zip |
Use p2align instead of ALIGN
Diffstat (limited to 'sysdeps/x86_64')
-rw-r--r-- | sysdeps/x86_64/memset.S | 10 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/memcmp-sse4.S | 84 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/memcmp-ssse3.S | 126 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S | 10 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 86 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/memcpy-ssse3.S | 254 | ||||
-rw-r--r-- | sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S | 9 | ||||
-rw-r--r-- | sysdeps/x86_64/strchr.S | 15 | ||||
-rw-r--r-- | sysdeps/x86_64/strrchr.S | 17 |
9 files changed, 288 insertions, 323 deletions
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S index 6c69f4b442..9b1de89d98 100644 --- a/sysdeps/x86_64/memset.S +++ b/sysdeps/x86_64/memset.S @@ -19,10 +19,6 @@ #include <sysdep.h> -#ifndef ALIGN -# define ALIGN(n) .p2align n -#endif - .text #if !defined NOT_IN_libc ENTRY(__bzero) @@ -71,12 +67,12 @@ L(entry_from_bzero): L(return): rep ret - ALIGN (4) + .p2align 4 L(between_32_64_bytes): movdqu %xmm8, 16(%rdi) movdqu %xmm8, -32(%rdi,%rdx) ret - ALIGN (4) + .p2align 4 L(loop_start): leaq 64(%rdi), %rcx movdqu %xmm8, (%rdi) @@ -92,7 +88,7 @@ L(loop_start): andq $-64, %rdx cmpq %rdx, %rcx je L(return) - ALIGN (4) + .p2align 4 L(loop): movdqa %xmm8, (%rcx) movdqa %xmm8, 16(%rcx) diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S index 1ed4200f4c..d7b147e5ce 100644 --- a/sysdeps/x86_64/multiarch/memcmp-sse4.S +++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S @@ -25,10 +25,6 @@ # define MEMCMP __memcmp_sse4_1 # endif -# ifndef ALIGN -# define ALIGN(n) .p2align n -# endif - # define JMPTBL(I, B) (I - B) # define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ @@ -60,7 +56,7 @@ ENTRY (MEMCMP) BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) # ifndef USE_AS_WMEMCMP - ALIGN (4) + .p2align 4 L(firstbyte): movzbl (%rdi), %eax movzbl (%rsi), %ecx @@ -68,7 +64,7 @@ L(firstbyte): ret # endif - ALIGN (4) + .p2align 4 L(79bytesormore): movdqu (%rsi), %xmm1 movdqu (%rdi), %xmm2 @@ -316,7 +312,7 @@ L(less32bytesin256): add %rdx, %rdi BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(512bytesormore): # ifdef DATA_CACHE_SIZE_HALF mov $DATA_CACHE_SIZE_HALF, %R8_LP @@ -329,7 +325,7 @@ L(512bytesormore): cmp %r8, %rdx ja L(L2_L3_cache_unaglined) sub $64, %rdx - ALIGN (4) + .p2align 4 L(64bytesormore_loop): movdqu (%rdi), %xmm2 pxor (%rsi), %xmm2 @@ -361,7 +357,7 @@ L(64bytesormore_loop): L(L2_L3_cache_unaglined): sub $64, %rdx - ALIGN (4) + .p2align 4 L(L2_L3_unaligned_128bytes_loop): prefetchnta 0x1c0(%rdi) 
prefetchnta 0x1c0(%rsi) @@ -396,7 +392,7 @@ L(L2_L3_unaligned_128bytes_loop): /* * This case is for machines which are sensitive for unaligned instructions. */ - ALIGN (4) + .p2align 4 L(2aligned): cmp $128, %rdx ja L(128bytesormorein2aligned) @@ -444,7 +440,7 @@ L(less32bytesin64in2alinged): add %rdx, %rdi BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(128bytesormorein2aligned): cmp $512, %rdx ja L(512bytesormorein2aligned) @@ -519,7 +515,7 @@ L(less32bytesin128in2aligned): add %rdx, %rdi BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(256bytesormorein2aligned): sub $256, %rdx @@ -632,7 +628,7 @@ L(less32bytesin256in2alinged): add %rdx, %rdi BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(512bytesormorein2aligned): # ifdef DATA_CACHE_SIZE_HALF mov $DATA_CACHE_SIZE_HALF, %R8_LP @@ -646,7 +642,7 @@ L(512bytesormorein2aligned): ja L(L2_L3_cache_aglined) sub $64, %rdx - ALIGN (4) + .p2align 4 L(64bytesormore_loopin2aligned): movdqa (%rdi), %xmm2 pxor (%rsi), %xmm2 @@ -678,7 +674,7 @@ L(64bytesormore_loopin2aligned): L(L2_L3_cache_aglined): sub $64, %rdx - ALIGN (4) + .p2align 4 L(L2_L3_aligned_128bytes_loop): prefetchnta 0x1c0(%rdi) prefetchnta 0x1c0(%rsi) @@ -711,7 +707,7 @@ L(L2_L3_aligned_128bytes_loop): BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(64bytesormore_loop_end): add $16, %rdi add $16, %rsi @@ -806,7 +802,7 @@ L(8bytes): xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(12bytes): mov -12(%rdi), %rax mov -12(%rsi), %rcx @@ -827,7 +823,7 @@ L(0bytes): # ifndef USE_AS_WMEMCMP /* unreal case for wmemcmp */ - ALIGN (4) + .p2align 4 L(65bytes): movdqu -65(%rdi), %xmm1 movdqu -65(%rsi), %xmm2 @@ -864,7 +860,7 @@ L(9bytes): sub %edx, %eax ret - ALIGN (4) + .p2align 4 L(13bytes): mov -13(%rdi), %rax mov -13(%rsi), %rcx @@ -877,7 +873,7 @@ L(13bytes): xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(5bytes): mov -5(%rdi), %eax mov -5(%rsi), %ecx @@ 
-888,7 +884,7 @@ L(5bytes): sub %edx, %eax ret - ALIGN (4) + .p2align 4 L(66bytes): movdqu -66(%rdi), %xmm1 movdqu -66(%rsi), %xmm2 @@ -929,7 +925,7 @@ L(10bytes): sub %ecx, %eax ret - ALIGN (4) + .p2align 4 L(14bytes): mov -14(%rdi), %rax mov -14(%rsi), %rcx @@ -942,7 +938,7 @@ L(14bytes): xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(6bytes): mov -6(%rdi), %eax mov -6(%rsi), %ecx @@ -958,7 +954,7 @@ L(2bytes): sub %ecx, %eax ret - ALIGN (4) + .p2align 4 L(67bytes): movdqu -67(%rdi), %xmm2 movdqu -67(%rsi), %xmm1 @@ -997,7 +993,7 @@ L(11bytes): xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(15bytes): mov -15(%rdi), %rax mov -15(%rsi), %rcx @@ -1010,7 +1006,7 @@ L(15bytes): xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(7bytes): mov -7(%rdi), %eax mov -7(%rsi), %ecx @@ -1023,7 +1019,7 @@ L(7bytes): xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(3bytes): movzwl -3(%rdi), %eax movzwl -3(%rsi), %ecx @@ -1036,7 +1032,7 @@ L(1bytes): ret # endif - ALIGN (4) + .p2align 4 L(68bytes): movdqu -68(%rdi), %xmm2 movdqu -68(%rsi), %xmm1 @@ -1079,7 +1075,7 @@ L(20bytes): # ifndef USE_AS_WMEMCMP /* unreal cases for wmemcmp */ - ALIGN (4) + .p2align 4 L(69bytes): movdqu -69(%rsi), %xmm1 movdqu -69(%rdi), %xmm2 @@ -1115,7 +1111,7 @@ L(21bytes): xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(70bytes): movdqu -70(%rsi), %xmm1 movdqu -70(%rdi), %xmm2 @@ -1151,7 +1147,7 @@ L(22bytes): xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(71bytes): movdqu -71(%rsi), %xmm1 movdqu -71(%rdi), %xmm2 @@ -1188,7 +1184,7 @@ L(23bytes): ret # endif - ALIGN (4) + .p2align 4 L(72bytes): movdqu -72(%rsi), %xmm1 movdqu -72(%rdi), %xmm2 @@ -1227,7 +1223,7 @@ L(24bytes): # ifndef USE_AS_WMEMCMP /* unreal cases for wmemcmp */ - ALIGN (4) + .p2align 4 L(73bytes): movdqu -73(%rsi), %xmm1 movdqu -73(%rdi), %xmm2 @@ -1265,7 +1261,7 @@ L(25bytes): sub %ecx, %eax ret - ALIGN (4) + .p2align 4 L(74bytes): movdqu -74(%rsi), %xmm1 movdqu -74(%rdi), %xmm2 @@ -1302,7 +1298,7 @@ L(26bytes): movzwl -2(%rsi), %ecx jmp 
L(diffin2bytes) - ALIGN (4) + .p2align 4 L(75bytes): movdqu -75(%rsi), %xmm1 movdqu -75(%rdi), %xmm2 @@ -1342,7 +1338,7 @@ L(27bytes): xor %eax, %eax ret # endif - ALIGN (4) + .p2align 4 L(76bytes): movdqu -76(%rsi), %xmm1 movdqu -76(%rdi), %xmm2 @@ -1388,7 +1384,7 @@ L(28bytes): # ifndef USE_AS_WMEMCMP /* unreal cases for wmemcmp */ - ALIGN (4) + .p2align 4 L(77bytes): movdqu -77(%rsi), %xmm1 movdqu -77(%rdi), %xmm2 @@ -1430,7 +1426,7 @@ L(29bytes): xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(78bytes): movdqu -78(%rsi), %xmm1 movdqu -78(%rdi), %xmm2 @@ -1470,7 +1466,7 @@ L(30bytes): xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(79bytes): movdqu -79(%rsi), %xmm1 movdqu -79(%rdi), %xmm2 @@ -1510,7 +1506,7 @@ L(31bytes): xor %eax, %eax ret # endif - ALIGN (4) + .p2align 4 L(64bytes): movdqu -64(%rdi), %xmm2 movdqu -64(%rsi), %xmm1 @@ -1548,7 +1544,7 @@ L(32bytes): /* * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block. */ - ALIGN (3) + .p2align 3 L(less16bytes): movsbq %dl, %rdx mov (%rsi, %rdx), %rcx @@ -1585,7 +1581,7 @@ L(diffin2bytes): sub %ecx, %eax ret - ALIGN (4) + .p2align 4 L(end): and $0xff, %eax and $0xff, %ecx @@ -1599,7 +1595,7 @@ L(end): neg %eax ret - ALIGN (4) + .p2align 4 L(nequal_bigger): ret @@ -1611,7 +1607,7 @@ L(unreal_case): END (MEMCMP) .section .rodata.sse4.1,"a",@progbits - ALIGN (3) + .p2align 3 # ifndef USE_AS_WMEMCMP L(table_64bytes): .int JMPTBL (L(0bytes), L(table_64bytes)) diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S index e319df926e..e04f918dff 100644 --- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S +++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S @@ -25,10 +25,6 @@ # define MEMCMP __memcmp_ssse3 # endif -# ifndef ALIGN -# define ALIGN(n) .p2align n -# endif - /* Warning! wmemcmp has to use SIGNED comparison for elements. memcmp has to use UNSIGNED comparison for elemnts. 
@@ -50,7 +46,7 @@ ENTRY (MEMCMP) add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 /* ECX >= 32. */ L(48bytesormore): movdqu (%rdi), %xmm3 @@ -90,7 +86,7 @@ L(48bytesormore): je L(shr_6) jmp L(shr_7) - ALIGN (2) + .p2align 2 L(next_unaligned_table): cmp $8, %edx je L(shr_8) @@ -117,7 +113,7 @@ L(next_unaligned_table): jmp L(shr_12) # endif - ALIGN (4) + .p2align 4 L(shr_0): cmp $80, %rcx lea -48(%rcx), %rcx @@ -137,7 +133,7 @@ L(shr_0): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_0_gobble): movdqa (%rsi), %xmm0 xor %eax, %eax @@ -180,7 +176,7 @@ L(next): # ifndef USE_AS_WMEMCMP - ALIGN (4) + .p2align 4 L(shr_1): cmp $80, %rcx lea -48(%rcx), %rcx @@ -207,7 +203,7 @@ L(shr_1): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_1_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -258,7 +254,7 @@ L(shr_1_gobble_next): jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_2): cmp $80, %rcx lea -48(%rcx), %rcx @@ -285,7 +281,7 @@ L(shr_2): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_2_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -335,7 +331,7 @@ L(shr_2_gobble_next): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_3): cmp $80, %rcx lea -48(%rcx), %rcx @@ -362,7 +358,7 @@ L(shr_3): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_3_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -414,7 +410,7 @@ L(shr_3_gobble_next): # endif - ALIGN (4) + .p2align 4 L(shr_4): cmp $80, %rcx lea -48(%rcx), %rcx @@ -441,7 +437,7 @@ L(shr_4): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_4_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -493,7 +489,7 @@ L(shr_4_gobble_next): # ifndef USE_AS_WMEMCMP - ALIGN (4) + .p2align 4 L(shr_5): cmp $80, %rcx lea -48(%rcx), %rcx @@ -520,7 +516,7 @@ L(shr_5): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_5_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -570,7 +566,7 @@ L(shr_5_gobble_next): add %rcx, %rdi jmp 
L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_6): cmp $80, %rcx lea -48(%rcx), %rcx @@ -597,7 +593,7 @@ L(shr_6): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_6_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -647,7 +643,7 @@ L(shr_6_gobble_next): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_7): cmp $80, %rcx lea -48(%rcx), %rcx @@ -674,7 +670,7 @@ L(shr_7): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_7_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -726,7 +722,7 @@ L(shr_7_gobble_next): # endif - ALIGN (4) + .p2align 4 L(shr_8): cmp $80, %rcx lea -48(%rcx), %rcx @@ -753,7 +749,7 @@ L(shr_8): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_8_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -805,7 +801,7 @@ L(shr_8_gobble_next): # ifndef USE_AS_WMEMCMP - ALIGN (4) + .p2align 4 L(shr_9): cmp $80, %rcx lea -48(%rcx), %rcx @@ -832,7 +828,7 @@ L(shr_9): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_9_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -882,7 +878,7 @@ L(shr_9_gobble_next): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_10): cmp $80, %rcx lea -48(%rcx), %rcx @@ -909,7 +905,7 @@ L(shr_10): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_10_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -959,7 +955,7 @@ L(shr_10_gobble_next): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_11): cmp $80, %rcx lea -48(%rcx), %rcx @@ -986,7 +982,7 @@ L(shr_11): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_11_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -1038,7 +1034,7 @@ L(shr_11_gobble_next): # endif - ALIGN (4) + .p2align 4 L(shr_12): cmp $80, %rcx lea -48(%rcx), %rcx @@ -1065,7 +1061,7 @@ L(shr_12): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_12_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -1117,7 +1113,7 @@ L(shr_12_gobble_next): # ifndef USE_AS_WMEMCMP - ALIGN (4) + .p2align 
4 L(shr_13): cmp $80, %rcx lea -48(%rcx), %rcx @@ -1144,7 +1140,7 @@ L(shr_13): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_13_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -1194,7 +1190,7 @@ L(shr_13_gobble_next): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_14): cmp $80, %rcx lea -48(%rcx), %rcx @@ -1221,7 +1217,7 @@ L(shr_14): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_14_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -1271,7 +1267,7 @@ L(shr_14_gobble_next): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_15): cmp $80, %rcx lea -48(%rcx), %rcx @@ -1298,7 +1294,7 @@ L(shr_15): add %rcx, %rdi jmp L(less48bytes) - ALIGN (4) + .p2align 4 L(shr_15_gobble): sub $32, %rcx movdqa 16(%rsi), %xmm0 @@ -1348,7 +1344,7 @@ L(shr_15_gobble_next): add %rcx, %rdi jmp L(less48bytes) # endif - ALIGN (4) + .p2align 4 L(exit): pmovmskb %xmm1, %r8d sub $0xffff, %r8d @@ -1389,56 +1385,56 @@ L(less16bytes): sub %edx, %eax ret - ALIGN (4) + .p2align 4 L(Byte16): movzbl -16(%rdi), %eax movzbl -16(%rsi), %edx sub %edx, %eax ret - ALIGN (4) + .p2align 4 L(Byte17): movzbl -15(%rdi), %eax movzbl -15(%rsi), %edx sub %edx, %eax ret - ALIGN (4) + .p2align 4 L(Byte18): movzbl -14(%rdi), %eax movzbl -14(%rsi), %edx sub %edx, %eax ret - ALIGN (4) + .p2align 4 L(Byte19): movzbl -13(%rdi), %eax movzbl -13(%rsi), %edx sub %edx, %eax ret - ALIGN (4) + .p2align 4 L(Byte20): movzbl -12(%rdi), %eax movzbl -12(%rsi), %edx sub %edx, %eax ret - ALIGN (4) + .p2align 4 L(Byte21): movzbl -11(%rdi), %eax movzbl -11(%rsi), %edx sub %edx, %eax ret - ALIGN (4) + .p2align 4 L(Byte22): movzbl -10(%rdi), %eax movzbl -10(%rsi), %edx sub %edx, %eax ret - ALIGN (4) + .p2align 4 L(next_24_bytes): lea 8(%rdi), %rdi lea 8(%rsi), %rsi @@ -1479,14 +1475,14 @@ L(next_24_bytes): jne L(find_diff) ret - ALIGN (4) + .p2align 4 L(second_double_word): mov -12(%rdi), %eax cmp -12(%rsi), %eax jne L(find_diff) ret - ALIGN (4) + .p2align 4 
L(next_two_double_words): and $15, %dh jz L(fourth_double_word) @@ -1495,7 +1491,7 @@ L(next_two_double_words): jne L(find_diff) ret - ALIGN (4) + .p2align 4 L(fourth_double_word): mov -4(%rdi), %eax cmp -4(%rsi), %eax @@ -1503,7 +1499,7 @@ L(fourth_double_word): ret # endif - ALIGN (4) + .p2align 4 L(less48bytes): cmp $8, %ecx jae L(more8bytes) @@ -1527,7 +1523,7 @@ L(less48bytes): jmp L(4bytes) # endif - ALIGN (4) + .p2align 4 L(more8bytes): cmp $16, %ecx jae L(more16bytes) @@ -1551,7 +1547,7 @@ L(more8bytes): jmp L(12bytes) # endif - ALIGN (4) + .p2align 4 L(more16bytes): cmp $24, %ecx jae L(more24bytes) @@ -1575,7 +1571,7 @@ L(more16bytes): jmp L(20bytes) # endif - ALIGN (4) + .p2align 4 L(more24bytes): cmp $32, %ecx jae L(more32bytes) @@ -1599,7 +1595,7 @@ L(more24bytes): jmp L(28bytes) # endif - ALIGN (4) + .p2align 4 L(more32bytes): cmp $40, %ecx jae L(more40bytes) @@ -1623,7 +1619,7 @@ L(more32bytes): jmp L(36bytes) # endif - ALIGN (4) + .p2align 4 L(more40bytes): cmp $40, %ecx je L(40bytes) @@ -1642,7 +1638,7 @@ L(more40bytes): je L(46bytes) jmp L(47bytes) - ALIGN (4) + .p2align 4 L(44bytes): movl -44(%rdi), %eax movl -44(%rsi), %ecx @@ -1702,7 +1698,7 @@ L(0bytes): xor %eax, %eax ret # else - ALIGN (4) + .p2align 4 L(44bytes): movl -44(%rdi), %eax cmp -44(%rsi), %eax @@ -1753,7 +1749,7 @@ L(0bytes): # endif # ifndef USE_AS_WMEMCMP - ALIGN (4) + .p2align 4 L(45bytes): movl -45(%rdi), %eax movl -45(%rsi), %ecx @@ -1816,7 +1812,7 @@ L(1bytes): xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(46bytes): movl -46(%rdi), %eax movl -46(%rsi), %ecx @@ -1882,7 +1878,7 @@ L(2bytes): xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(47bytes): movl -47(%rdi), %eax movl -47(%rsi), %ecx @@ -1951,7 +1947,7 @@ L(3bytes): xor %eax, %eax ret - ALIGN (4) + .p2align 4 L(find_diff): cmpb %cl, %al jne L(set) @@ -1973,19 +1969,19 @@ L(set): # else /* for wmemcmp */ - ALIGN (4) + .p2align 4 L(find_diff): mov $1, %eax jg L(find_diff_bigger) neg %eax ret - ALIGN (4) + .p2align 4 
L(find_diff_bigger): ret # endif - ALIGN (4) + .p2align 4 L(equal): xor %eax, %eax ret diff --git a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S index efdfea238f..df6578ebc9 100644 --- a/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S +++ b/sysdeps/x86_64/multiarch/memcpy-sse2-unaligned.S @@ -20,10 +20,6 @@ #include "asm-syntax.h" -#ifndef ALIGN -# define ALIGN(n) .p2align n -#endif - ENTRY(__memcpy_sse2_unaligned) movq %rsi, %rax @@ -44,7 +40,7 @@ L(return): movq %rdi, %rax ret .p2align 4,,10 - ALIGN(4) + .p2align 4 .L31: movdqu 16(%rsi), %xmm8 cmpq $64, %rdx @@ -77,7 +73,7 @@ L(return): leaq 32(%r10), %r8 leaq 48(%r10), %rax .p2align 4,,10 - ALIGN(4) + .p2align 4 L(loop): movdqu (%rcx,%r10), %xmm8 movdqa %xmm8, (%rcx) @@ -151,7 +147,7 @@ L(less_16): .L3: leaq -1(%rdx), %rax .p2align 4,,10 - ALIGN(4) + .p2align 4 .L11: movzbl (%rsi,%rax), %edx movb %dl, (%rdi,%rax) diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S index fc9fcef27d..0eb7d9b758 100644 --- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S +++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S @@ -31,10 +31,6 @@ # define MEMCPY_CHK __memcpy_chk_ssse3_back #endif -#ifndef ALIGN -# define ALIGN(n) .p2align n -#endif - #define JMPTBL(I, B) I - B /* Branch to an entry in a jump table. 
TABLE is a jump table with @@ -87,7 +83,7 @@ L(bk_write): BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) #endif - ALIGN (4) + .p2align 4 L(144bytesormore): #ifndef USE_AS_MEMMOVE @@ -119,7 +115,7 @@ L(144bytesormore): jmp *%r9 ud2 - ALIGN (4) + .p2align 4 L(copy_backward): #ifdef DATA_CACHE_SIZE mov $DATA_CACHE_SIZE, %RCX_LP @@ -149,7 +145,7 @@ L(copy_backward): jmp *%r9 ud2 - ALIGN (4) + .p2align 4 L(shl_0): mov %rdx, %r9 @@ -162,7 +158,7 @@ L(shl_0): #endif jae L(gobble_mem_fwd) sub $0x80, %rdx - ALIGN (4) + .p2align 4 L(shl_0_loop): movdqa (%rsi), %xmm1 movdqa %xmm1, (%rdi) @@ -190,7 +186,7 @@ L(shl_0_loop): add %rdx, %rdi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_0_bwd): sub $0x80, %rdx L(copy_backward_loop): @@ -221,7 +217,7 @@ L(copy_backward_loop): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_1): sub $0x80, %rdx movaps -0x01(%rsi), %xmm1 @@ -258,7 +254,7 @@ L(shl_1): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_1_bwd): movaps -0x01(%rsi), %xmm1 @@ -304,7 +300,7 @@ L(shl_1_bwd): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_2): sub $0x80, %rdx movaps -0x02(%rsi), %xmm1 @@ -341,7 +337,7 @@ L(shl_2): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_2_bwd): movaps -0x02(%rsi), %xmm1 @@ -387,7 +383,7 @@ L(shl_2_bwd): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_3): sub $0x80, %rdx movaps -0x03(%rsi), %xmm1 @@ -424,7 +420,7 @@ L(shl_3): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_3_bwd): movaps -0x03(%rsi), %xmm1 @@ -470,7 +466,7 @@ L(shl_3_bwd): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_4): sub $0x80, %rdx 
movaps -0x04(%rsi), %xmm1 @@ -507,7 +503,7 @@ L(shl_4): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_4_bwd): movaps -0x04(%rsi), %xmm1 @@ -553,7 +549,7 @@ L(shl_4_bwd): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_5): sub $0x80, %rdx movaps -0x05(%rsi), %xmm1 @@ -590,7 +586,7 @@ L(shl_5): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_5_bwd): movaps -0x05(%rsi), %xmm1 @@ -636,7 +632,7 @@ L(shl_5_bwd): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_6): sub $0x80, %rdx movaps -0x06(%rsi), %xmm1 @@ -673,7 +669,7 @@ L(shl_6): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_6_bwd): movaps -0x06(%rsi), %xmm1 @@ -719,7 +715,7 @@ L(shl_6_bwd): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_7): sub $0x80, %rdx movaps -0x07(%rsi), %xmm1 @@ -756,7 +752,7 @@ L(shl_7): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_7_bwd): movaps -0x07(%rsi), %xmm1 @@ -802,7 +798,7 @@ L(shl_7_bwd): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_8): sub $0x80, %rdx movaps -0x08(%rsi), %xmm1 @@ -839,7 +835,7 @@ L(shl_8): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_8_bwd): movaps -0x08(%rsi), %xmm1 @@ -886,7 +882,7 @@ L(shl_8_end_bwd): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_9): sub $0x80, %rdx movaps -0x09(%rsi), %xmm1 @@ -923,7 +919,7 @@ L(shl_9): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_9_bwd): movaps -0x09(%rsi), %xmm1 @@ -969,7 +965,7 @@ L(shl_9_bwd): sub %rdx, %rsi 
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_10): sub $0x80, %rdx movaps -0x0a(%rsi), %xmm1 @@ -1006,7 +1002,7 @@ L(shl_10): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_10_bwd): movaps -0x0a(%rsi), %xmm1 @@ -1052,7 +1048,7 @@ L(shl_10_bwd): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_11): sub $0x80, %rdx movaps -0x0b(%rsi), %xmm1 @@ -1089,7 +1085,7 @@ L(shl_11): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_11_bwd): movaps -0x0b(%rsi), %xmm1 @@ -1135,7 +1131,7 @@ L(shl_11_bwd): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_12): sub $0x80, %rdx movdqa -0x0c(%rsi), %xmm1 @@ -1173,7 +1169,7 @@ L(shl_12): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_12_bwd): movaps -0x0c(%rsi), %xmm1 @@ -1219,7 +1215,7 @@ L(shl_12_bwd): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_13): sub $0x80, %rdx movaps -0x0d(%rsi), %xmm1 @@ -1256,7 +1252,7 @@ L(shl_13): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_13_bwd): movaps -0x0d(%rsi), %xmm1 @@ -1302,7 +1298,7 @@ L(shl_13_bwd): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_14): sub $0x80, %rdx movaps -0x0e(%rsi), %xmm1 @@ -1339,7 +1335,7 @@ L(shl_14): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_14_bwd): movaps -0x0e(%rsi), %xmm1 @@ -1385,7 +1381,7 @@ L(shl_14_bwd): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_15): sub $0x80, %rdx movaps -0x0f(%rsi), %xmm1 @@ -1422,7 +1418,7 @@ L(shl_15): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY 
(L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_15_bwd): movaps -0x0f(%rsi), %xmm1 @@ -1468,7 +1464,7 @@ L(shl_15_bwd): sub %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(gobble_mem_fwd): movdqu (%rsi), %xmm1 movdqu %xmm0, (%r8) @@ -1570,7 +1566,7 @@ L(gobble_mem_fwd_end): add %rdx, %rdi BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - ALIGN (4) + .p2align 4 L(gobble_mem_bwd): add %rdx, %rsi add %rdx, %rdi @@ -2833,7 +2829,7 @@ L(bwd_write_1bytes): END (MEMCPY) .section .rodata.ssse3,"a",@progbits - ALIGN (3) + .p2align 3 L(table_144_bytes_bwd): .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd)) .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd)) @@ -2980,7 +2976,7 @@ L(table_144_bytes_bwd): .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd)) .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd)) - ALIGN (3) + .p2align 3 L(table_144_bytes_fwd): .int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd)) .int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd)) @@ -3127,7 +3123,7 @@ L(table_144_bytes_fwd): .int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd)) .int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd)) - ALIGN (3) + .p2align 3 L(shl_table_fwd): .int JMPTBL (L(shl_0), L(shl_table_fwd)) .int JMPTBL (L(shl_1), L(shl_table_fwd)) @@ -3146,7 +3142,7 @@ L(shl_table_fwd): .int JMPTBL (L(shl_14), L(shl_table_fwd)) .int JMPTBL (L(shl_15), L(shl_table_fwd)) - ALIGN (3) + .p2align 3 L(shl_table_bwd): .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S index 9642ceecd9..0cedab2447 100644 --- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S +++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S @@ -31,10 +31,6 @@ # define MEMCPY_CHK __memcpy_chk_ssse3 #endif -#ifndef ALIGN -# define ALIGN(n) .p2align n -#endif - #define JMPTBL(I, B) I - B /* 
Branch to an entry in a jump table. TABLE is a jump table with @@ -80,7 +76,7 @@ L(copy_forward): jmp *%r9 ud2 - ALIGN (4) + .p2align 4 L(80bytesormore): #ifndef USE_AS_MEMMOVE cmp %dil, %sil @@ -113,7 +109,7 @@ L(80bytesormore): #endif BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4) - ALIGN (4) + .p2align 4 L(copy_backward): movdqu -16(%rsi, %rdx), %xmm0 add %rdx, %rsi @@ -144,7 +140,7 @@ L(copy_backward): #endif BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4) - ALIGN (4) + .p2align 4 L(shl_0): sub $16, %rdx movdqa (%rsi), %xmm1 @@ -172,7 +168,7 @@ L(shl_0_less_64bytes): add %rdx, %rdi BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_0_gobble): #ifdef DATA_CACHE_SIZE_HALF cmp $DATA_CACHE_SIZE_HALF, %RDX_LP @@ -228,7 +224,7 @@ L(shl_0_cache_less_64bytes): add %rdx, %rdi BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_0_gobble_mem_loop): prefetcht0 0x1c0(%rsi) prefetcht0 0x280(%rsi) @@ -287,7 +283,7 @@ L(shl_0_mem_less_32bytes): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_0_bwd): sub $16, %rdx movdqa -0x10(%rsi), %xmm1 @@ -313,7 +309,7 @@ L(shl_0_bwd): L(shl_0_less_64bytes_bwd): BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_0_gobble_bwd): #ifdef DATA_CACHE_SIZE_HALF cmp $DATA_CACHE_SIZE_HALF, %RDX_LP @@ -367,7 +363,7 @@ L(shl_0_gobble_bwd_loop): L(shl_0_gobble_bwd_less_64bytes): BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_0_gobble_mem_bwd_loop): prefetcht0 -0x1c0(%rsi) prefetcht0 -0x280(%rsi) @@ -423,7 +419,7 @@ L(shl_0_mem_bwd_less_64bytes): L(shl_0_mem_bwd_less_32bytes): BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_1): lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9 cmp %rcx, %rdx @@ -466,7 +462,7 @@ L(shl_1_end): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 
L(shl_1_bwd): lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9 cmp %rcx, %rdx @@ -508,7 +504,7 @@ L(shl_1_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_2): lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9 cmp %rcx, %rdx @@ -551,7 +547,7 @@ L(shl_2_end): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_2_bwd): lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9 cmp %rcx, %rdx @@ -593,7 +589,7 @@ L(shl_2_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_3): lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9 cmp %rcx, %rdx @@ -636,7 +632,7 @@ L(shl_3_end): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_3_bwd): lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9 cmp %rcx, %rdx @@ -678,7 +674,7 @@ L(shl_3_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_4): lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9 cmp %rcx, %rdx @@ -721,7 +717,7 @@ L(shl_4_end): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_4_bwd): lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9 cmp %rcx, %rdx @@ -763,7 +759,7 @@ L(shl_4_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_5): lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9 cmp %rcx, %rdx @@ -806,7 +802,7 @@ L(shl_5_end): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_5_bwd): lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9 cmp %rcx, %rdx @@ -848,7 +844,7 @@ L(shl_5_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_6): lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9 cmp %rcx, %rdx @@ -891,7 +887,7 @@ L(shl_6_end): add %rdx, %rsi 
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_6_bwd): lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9 cmp %rcx, %rdx @@ -933,7 +929,7 @@ L(shl_6_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_7): lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9 cmp %rcx, %rdx @@ -976,7 +972,7 @@ L(shl_7_end): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_7_bwd): lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9 cmp %rcx, %rdx @@ -1018,7 +1014,7 @@ L(shl_7_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_8): lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9 cmp %rcx, %rdx @@ -1051,7 +1047,7 @@ L(shl_8_loop_L1): movaps %xmm5, -0x10(%rdi) jmp *%r9 ud2 - ALIGN (4) + .p2align 4 L(shl_8_end): lea 64(%rdx), %rdx movaps %xmm4, -0x20(%rdi) @@ -1061,7 +1057,7 @@ L(shl_8_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_8_bwd): lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9 cmp %rcx, %rdx @@ -1103,7 +1099,7 @@ L(shl_8_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_9): lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9 cmp %rcx, %rdx @@ -1146,7 +1142,7 @@ L(shl_9_end): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_9_bwd): lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9 cmp %rcx, %rdx @@ -1188,7 +1184,7 @@ L(shl_9_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_10): lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9 cmp %rcx, %rdx @@ -1231,7 +1227,7 @@ L(shl_10_end): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_10_bwd): lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9 cmp %rcx, %rdx 
@@ -1273,7 +1269,7 @@ L(shl_10_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_11): lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9 cmp %rcx, %rdx @@ -1316,7 +1312,7 @@ L(shl_11_end): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_11_bwd): lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9 cmp %rcx, %rdx @@ -1358,7 +1354,7 @@ L(shl_11_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_12): lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9 cmp %rcx, %rdx @@ -1401,7 +1397,7 @@ L(shl_12_end): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_12_bwd): lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9 cmp %rcx, %rdx @@ -1443,7 +1439,7 @@ L(shl_12_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_13): lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9 cmp %rcx, %rdx @@ -1486,7 +1482,7 @@ L(shl_13_end): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_13_bwd): lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9 cmp %rcx, %rdx @@ -1528,7 +1524,7 @@ L(shl_13_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_14): lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9 cmp %rcx, %rdx @@ -1571,7 +1567,7 @@ L(shl_14_end): add %rdx, %rsi BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_14_bwd): lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9 cmp %rcx, %rdx @@ -1613,7 +1609,7 @@ L(shl_14_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_15): lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9 cmp %rcx, %rdx @@ -1656,7 +1652,7 @@ L(shl_15_end): add %rdx, %rsi 
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(shl_15_bwd): lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9 cmp %rcx, %rdx @@ -1698,7 +1694,7 @@ L(shl_15_bwd_end): movdqu %xmm0, (%r8) BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - ALIGN (4) + .p2align 4 L(write_72bytes): movdqu -72(%rsi), %xmm0 movdqu -56(%rsi), %xmm1 @@ -1716,7 +1712,7 @@ L(write_72bytes): mov %rcx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_64bytes): movdqu -64(%rsi), %xmm0 mov -48(%rsi), %rcx @@ -1734,7 +1730,7 @@ L(write_64bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_56bytes): movdqu -56(%rsi), %xmm0 mov -40(%rsi), %r8 @@ -1750,7 +1746,7 @@ L(write_56bytes): mov %rcx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_48bytes): mov -48(%rsi), %rcx mov -40(%rsi), %r8 @@ -1766,7 +1762,7 @@ L(write_48bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_40bytes): mov -40(%rsi), %r8 mov -32(%rsi), %r9 @@ -1780,7 +1776,7 @@ L(write_40bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_32bytes): mov -32(%rsi), %r9 mov -24(%rsi), %r10 @@ -1792,7 +1788,7 @@ L(write_32bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_24bytes): mov -24(%rsi), %r10 mov -16(%rsi), %r11 @@ -1802,7 +1798,7 @@ L(write_24bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_16bytes): mov -16(%rsi), %r11 mov -8(%rsi), %rdx @@ -1810,14 +1806,14 @@ L(write_16bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_8bytes): mov -8(%rsi), %rdx mov %rdx, -8(%rdi) L(write_0bytes): ret - ALIGN (4) + .p2align 4 L(write_73bytes): movdqu -73(%rsi), %xmm0 movdqu -57(%rsi), %xmm1 @@ -1837,7 +1833,7 @@ L(write_73bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_65bytes): movdqu -65(%rsi), %xmm0 movdqu -49(%rsi), %xmm1 @@ -1855,7 +1851,7 @@ L(write_65bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_57bytes): movdqu -57(%rsi), %xmm0 mov -41(%rsi), %r8 @@ -1873,7 +1869,7 @@ L(write_57bytes): mov 
%edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_49bytes): movdqu -49(%rsi), %xmm0 mov -33(%rsi), %r9 @@ -1889,7 +1885,7 @@ L(write_49bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_41bytes): mov -41(%rsi), %r8 mov -33(%rsi), %r9 @@ -1905,7 +1901,7 @@ L(write_41bytes): mov %dl, -1(%rdi) ret - ALIGN (4) + .p2align 4 L(write_33bytes): mov -33(%rsi), %r9 mov -25(%rsi), %r10 @@ -1919,7 +1915,7 @@ L(write_33bytes): mov %dl, -1(%rdi) ret - ALIGN (4) + .p2align 4 L(write_25bytes): mov -25(%rsi), %r10 mov -17(%rsi), %r11 @@ -1931,7 +1927,7 @@ L(write_25bytes): mov %dl, -1(%rdi) ret - ALIGN (4) + .p2align 4 L(write_17bytes): mov -17(%rsi), %r11 mov -9(%rsi), %rcx @@ -1941,7 +1937,7 @@ L(write_17bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_9bytes): mov -9(%rsi), %rcx mov -4(%rsi), %edx @@ -1949,13 +1945,13 @@ L(write_9bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_1bytes): mov -1(%rsi), %dl mov %dl, -1(%rdi) ret - ALIGN (4) + .p2align 4 L(write_74bytes): movdqu -74(%rsi), %xmm0 movdqu -58(%rsi), %xmm1 @@ -1975,7 +1971,7 @@ L(write_74bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_66bytes): movdqu -66(%rsi), %xmm0 movdqu -50(%rsi), %xmm1 @@ -1995,7 +1991,7 @@ L(write_66bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_58bytes): movdqu -58(%rsi), %xmm1 mov -42(%rsi), %r8 @@ -2013,7 +2009,7 @@ L(write_58bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_50bytes): movdqu -50(%rsi), %xmm0 mov -34(%rsi), %r9 @@ -2029,7 +2025,7 @@ L(write_50bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_42bytes): mov -42(%rsi), %r8 mov -34(%rsi), %r9 @@ -2045,7 +2041,7 @@ L(write_42bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_34bytes): mov -34(%rsi), %r9 mov -26(%rsi), %r10 @@ -2059,7 +2055,7 @@ L(write_34bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_26bytes): mov -26(%rsi), %r10 mov -18(%rsi), %r11 @@ -2071,7 +2067,7 @@ L(write_26bytes): mov %edx, 
-4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_18bytes): mov -18(%rsi), %r11 mov -10(%rsi), %rcx @@ -2081,7 +2077,7 @@ L(write_18bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_10bytes): mov -10(%rsi), %rcx mov -4(%rsi), %edx @@ -2089,13 +2085,13 @@ L(write_10bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_2bytes): mov -2(%rsi), %dx mov %dx, -2(%rdi) ret - ALIGN (4) + .p2align 4 L(write_75bytes): movdqu -75(%rsi), %xmm0 movdqu -59(%rsi), %xmm1 @@ -2115,7 +2111,7 @@ L(write_75bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_67bytes): movdqu -67(%rsi), %xmm0 movdqu -59(%rsi), %xmm1 @@ -2135,7 +2131,7 @@ L(write_67bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_59bytes): movdqu -59(%rsi), %xmm0 mov -43(%rsi), %r8 @@ -2153,7 +2149,7 @@ L(write_59bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_51bytes): movdqu -51(%rsi), %xmm0 mov -35(%rsi), %r9 @@ -2169,7 +2165,7 @@ L(write_51bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_43bytes): mov -43(%rsi), %r8 mov -35(%rsi), %r9 @@ -2185,7 +2181,7 @@ L(write_43bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_35bytes): mov -35(%rsi), %r9 mov -27(%rsi), %r10 @@ -2199,7 +2195,7 @@ L(write_35bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_27bytes): mov -27(%rsi), %r10 mov -19(%rsi), %r11 @@ -2211,7 +2207,7 @@ L(write_27bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_19bytes): mov -19(%rsi), %r11 mov -11(%rsi), %rcx @@ -2221,7 +2217,7 @@ L(write_19bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_11bytes): mov -11(%rsi), %rcx mov -4(%rsi), %edx @@ -2229,7 +2225,7 @@ L(write_11bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_3bytes): mov -3(%rsi), %dx mov -2(%rsi), %cx @@ -2237,7 +2233,7 @@ L(write_3bytes): mov %cx, -2(%rdi) ret - ALIGN (4) + .p2align 4 L(write_76bytes): movdqu -76(%rsi), %xmm0 movdqu -60(%rsi), %xmm1 @@ -2257,7 +2253,7 @@ L(write_76bytes): mov %edx, 
-4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_68bytes): movdqu -68(%rsi), %xmm0 movdqu -52(%rsi), %xmm1 @@ -2275,7 +2271,7 @@ L(write_68bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_60bytes): movdqu -60(%rsi), %xmm0 mov -44(%rsi), %r8 @@ -2293,7 +2289,7 @@ L(write_60bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_52bytes): movdqu -52(%rsi), %xmm0 mov -36(%rsi), %r9 @@ -2309,7 +2305,7 @@ L(write_52bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_44bytes): mov -44(%rsi), %r8 mov -36(%rsi), %r9 @@ -2325,7 +2321,7 @@ L(write_44bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_36bytes): mov -36(%rsi), %r9 mov -28(%rsi), %r10 @@ -2339,7 +2335,7 @@ L(write_36bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_28bytes): mov -28(%rsi), %r10 mov -20(%rsi), %r11 @@ -2351,7 +2347,7 @@ L(write_28bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_20bytes): mov -20(%rsi), %r11 mov -12(%rsi), %rcx @@ -2361,7 +2357,7 @@ L(write_20bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_12bytes): mov -12(%rsi), %rcx mov -4(%rsi), %edx @@ -2369,13 +2365,13 @@ L(write_12bytes): mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_4bytes): mov -4(%rsi), %edx mov %edx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_77bytes): movdqu -77(%rsi), %xmm0 movdqu -61(%rsi), %xmm1 @@ -2395,7 +2391,7 @@ L(write_77bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_69bytes): movdqu -69(%rsi), %xmm0 movdqu -53(%rsi), %xmm1 @@ -2413,7 +2409,7 @@ L(write_69bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_61bytes): movdqu -61(%rsi), %xmm0 mov -45(%rsi), %r8 @@ -2431,7 +2427,7 @@ L(write_61bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_53bytes): movdqu -53(%rsi), %xmm0 mov -45(%rsi), %r8 @@ -2448,7 +2444,7 @@ L(write_53bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_45bytes): mov -45(%rsi), %r8 mov -37(%rsi), %r9 @@ -2464,7 +2460,7 @@ 
L(write_45bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_37bytes): mov -37(%rsi), %r9 mov -29(%rsi), %r10 @@ -2478,7 +2474,7 @@ L(write_37bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_29bytes): mov -29(%rsi), %r10 mov -21(%rsi), %r11 @@ -2490,7 +2486,7 @@ L(write_29bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_21bytes): mov -21(%rsi), %r11 mov -13(%rsi), %rcx @@ -2500,7 +2496,7 @@ L(write_21bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_13bytes): mov -13(%rsi), %rcx mov -8(%rsi), %rdx @@ -2508,7 +2504,7 @@ L(write_13bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_5bytes): mov -5(%rsi), %edx mov -4(%rsi), %ecx @@ -2516,7 +2512,7 @@ L(write_5bytes): mov %ecx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_78bytes): movdqu -78(%rsi), %xmm0 movdqu -62(%rsi), %xmm1 @@ -2536,7 +2532,7 @@ L(write_78bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_70bytes): movdqu -70(%rsi), %xmm0 movdqu -54(%rsi), %xmm1 @@ -2554,7 +2550,7 @@ L(write_70bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_62bytes): movdqu -62(%rsi), %xmm0 mov -46(%rsi), %r8 @@ -2572,7 +2568,7 @@ L(write_62bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_54bytes): movdqu -54(%rsi), %xmm0 mov -38(%rsi), %r9 @@ -2588,7 +2584,7 @@ L(write_54bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_46bytes): mov -46(%rsi), %r8 mov -38(%rsi), %r9 @@ -2604,7 +2600,7 @@ L(write_46bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_38bytes): mov -38(%rsi), %r9 mov -30(%rsi), %r10 @@ -2618,7 +2614,7 @@ L(write_38bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_30bytes): mov -30(%rsi), %r10 mov -22(%rsi), %r11 @@ -2630,7 +2626,7 @@ L(write_30bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_22bytes): mov -22(%rsi), %r11 mov -14(%rsi), %rcx @@ -2640,7 +2636,7 @@ L(write_22bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_14bytes): mov 
-14(%rsi), %rcx mov -8(%rsi), %rdx @@ -2648,7 +2644,7 @@ L(write_14bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_6bytes): mov -6(%rsi), %edx mov -4(%rsi), %ecx @@ -2656,7 +2652,7 @@ L(write_6bytes): mov %ecx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(write_79bytes): movdqu -79(%rsi), %xmm0 movdqu -63(%rsi), %xmm1 @@ -2676,7 +2672,7 @@ L(write_79bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_71bytes): movdqu -71(%rsi), %xmm0 movdqu -55(%rsi), %xmm1 @@ -2694,7 +2690,7 @@ L(write_71bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_63bytes): movdqu -63(%rsi), %xmm0 mov -47(%rsi), %r8 @@ -2712,7 +2708,7 @@ L(write_63bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_55bytes): movdqu -55(%rsi), %xmm0 mov -39(%rsi), %r9 @@ -2728,7 +2724,7 @@ L(write_55bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_47bytes): mov -47(%rsi), %r8 mov -39(%rsi), %r9 @@ -2744,7 +2740,7 @@ L(write_47bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_39bytes): mov -39(%rsi), %r9 mov -31(%rsi), %r10 @@ -2758,7 +2754,7 @@ L(write_39bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_31bytes): mov -31(%rsi), %r10 mov -23(%rsi), %r11 @@ -2770,7 +2766,7 @@ L(write_31bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_23bytes): mov -23(%rsi), %r11 mov -15(%rsi), %rcx @@ -2780,7 +2776,7 @@ L(write_23bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_15bytes): mov -15(%rsi), %rcx mov -8(%rsi), %rdx @@ -2788,7 +2784,7 @@ L(write_15bytes): mov %rdx, -8(%rdi) ret - ALIGN (4) + .p2align 4 L(write_7bytes): mov -7(%rsi), %edx mov -4(%rsi), %ecx @@ -2796,7 +2792,7 @@ L(write_7bytes): mov %ecx, -4(%rdi) ret - ALIGN (4) + .p2align 4 L(large_page_fwd): movdqu (%rsi), %xmm1 lea 16(%rsi), %rsi @@ -2859,7 +2855,7 @@ L(large_page_less_64bytes): BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) #ifdef USE_AS_MEMMOVE - ALIGN (4) + .p2align 4 L(ll_cache_copy_fwd_start): prefetcht0 
0x1c0(%rsi) prefetcht0 0x200(%rsi) @@ -2906,7 +2902,7 @@ L(large_page_ll_less_fwd_64bytes): BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) #endif - ALIGN (4) + .p2align 4 L(large_page_bwd): movdqu -0x10(%rsi), %xmm1 lea -16(%rsi), %rsi @@ -2966,7 +2962,7 @@ L(large_page_less_bwd_64bytes): BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) #ifdef USE_AS_MEMMOVE - ALIGN (4) + .p2align 4 L(ll_cache_copy_bwd_start): prefetcht0 -0x1c0(%rsi) prefetcht0 -0x200(%rsi) @@ -3014,7 +3010,7 @@ L(large_page_ll_less_bwd_64bytes): END (MEMCPY) .section .rodata.ssse3,"a",@progbits - ALIGN (3) + .p2align 3 L(table_less_80bytes): .int JMPTBL (L(write_0bytes), L(table_less_80bytes)) .int JMPTBL (L(write_1bytes), L(table_less_80bytes)) @@ -3097,7 +3093,7 @@ L(table_less_80bytes): .int JMPTBL (L(write_78bytes), L(table_less_80bytes)) .int JMPTBL (L(write_79bytes), L(table_less_80bytes)) - ALIGN (3) + .p2align 3 L(shl_table): .int JMPTBL (L(shl_0), L(shl_table)) .int JMPTBL (L(shl_1), L(shl_table)) @@ -3116,7 +3112,7 @@ L(shl_table): .int JMPTBL (L(shl_14), L(shl_table)) .int JMPTBL (L(shl_15), L(shl_table)) - ALIGN (3) + .p2align 3 L(shl_table_bwd): .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) diff --git a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S index eed8432973..4a8e57a243 100644 --- a/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S +++ b/sysdeps/x86_64/multiarch/strcmp-sse2-unaligned.S @@ -17,7 +17,6 @@ <http://www.gnu.org/licenses/>. 
*/ #include "sysdep.h" -#define ALIGN(x) .p2align x ENTRY ( __strcmp_sse2_unaligned) movl %edi, %eax @@ -43,7 +42,7 @@ L(return): subl %edx, %eax ret - ALIGN (4) + .p2align 4 L(next_48_bytes): movdqu 16(%rdi), %xmm6 movdqu 16(%rsi), %xmm3 @@ -85,7 +84,7 @@ L(main_loop_header): movq %rcx, %rsi jmp L(loop_start) - ALIGN (4) + .p2align 4 L(loop): addq $64, %rax addq $64, %rdx @@ -141,7 +140,7 @@ L(back_to_loop): subl %edx, %eax ret - ALIGN (4) + .p2align 4 L(loop_cross_page): xor %r10, %r10 movq %rdx, %r9 @@ -191,7 +190,7 @@ L(loop_cross_page): subl %edx, %eax ret - ALIGN (4) + .p2align 4 L(cross_page_loop): cmpb %cl, %al jne L(different) diff --git a/sysdeps/x86_64/strchr.S b/sysdeps/x86_64/strchr.S index 1900b37e63..7440500a67 100644 --- a/sysdeps/x86_64/strchr.S +++ b/sysdeps/x86_64/strchr.S @@ -19,11 +19,6 @@ #include <sysdep.h> -# ifndef ALIGN -# define ALIGN(n) .p2align n -# endif - - .text ENTRY (strchr) movd %esi, %xmm1 @@ -54,7 +49,7 @@ ENTRY (strchr) #endif ret - ALIGN(3) + .p2align 3 L(next_48_bytes): movdqu 16(%rdi), %xmm0 movdqa %xmm0, %xmm4 @@ -83,10 +78,10 @@ ENTRY (strchr) L(loop_start): /* We use this alignment to force loop be aligned to 8 but not 16 bytes. This gives better sheduling on AMD processors. 
*/ - ALIGN(4) + .p2align 4 pxor %xmm6, %xmm6 andq $-64, %rdi - ALIGN(3) + .p2align 3 L(loop64): addq $64, %rdi movdqa (%rdi), %xmm5 @@ -129,7 +124,7 @@ L(loop64): orq %rcx, %rax salq $48, %rdx orq %rdx, %rax - ALIGN(3) + .p2align 3 L(return): bsfq %rax, %rax #ifdef AS_STRCHRNUL @@ -141,7 +136,7 @@ L(return): cmovne %rdx, %rax #endif ret - ALIGN(4) + .p2align 4 L(cross_page): movq %rdi, %rdx diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S index 514765b87f..2a07ff75ac 100644 --- a/sysdeps/x86_64/strrchr.S +++ b/sysdeps/x86_64/strrchr.S @@ -19,11 +19,6 @@ #include <sysdep.h> -# ifndef ALIGN -# define ALIGN(n) .p2align n -# endif - - .text ENTRY (strrchr) movd %esi, %xmm1 @@ -51,7 +46,7 @@ ENTRY (strrchr) addq %rdi, %rax ret - ALIGN(4) + .p2align 4 L(next_48_bytes): movdqu 16(%rdi), %xmm4 movdqa %xmm4, %xmm5 @@ -91,7 +86,7 @@ L(next_48_bytes): leaq (%rdi,%rsi), %rax ret - ALIGN(4) + .p2align 4 L(loop_header2): testq %rsi, %rsi movq %rdi, %rcx @@ -102,7 +97,7 @@ L(loop_header): andq $-64, %rdi jmp L(loop_entry) - ALIGN(4) + .p2align 4 L(loop64): testq %rdx, %rdx cmovne %rdx, %rsi @@ -163,18 +158,18 @@ L(loop_entry): leaq (%rcx,%rsi), %rax ret - ALIGN(4) + .p2align 4 L(no_c_found): movl $1, %esi xorl %ecx, %ecx jmp L(loop_header) - ALIGN(4) + .p2align 4 L(exit): xorl %eax, %eax ret - ALIGN(4) + .p2align 4 L(cross_page): movq %rdi, %rax pxor %xmm0, %xmm0 |