diff options
Diffstat (limited to 'sysdeps')
-rw-r--r-- | sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S | 246 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/memcpy-ssse3.S | 113 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/memset-sse2-rep.S | 14 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/memset-sse2.S | 19 | ||||
-rw-r--r-- | sysdeps/i386/i686/multiarch/strcmp-sse4.S | 4 | ||||
-rw-r--r-- | sysdeps/x86_64/Implies | 1 |
6 files changed, 235 insertions, 162 deletions
diff --git a/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S b/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S index b26037d279..48a109ccd6 100644 --- a/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S +++ b/sysdeps/i386/i686/multiarch/memcpy-ssse3-rep.S @@ -127,10 +127,8 @@ ENTRY (MEMCPY) cmp %eax, %edx jb L(copy_forward) je L(fwd_write_0bytes) - cmp $32, %ecx - jge L(memmove_bwd) - jmp L(bk_write_less32bytes_2) -L(memmove_bwd): + cmp $48, %ecx + jb L(bk_write_less48bytes) add %ecx, %eax cmp %eax, %edx movl SRC(%esp), %eax @@ -139,12 +137,12 @@ L(memmove_bwd): L(copy_forward): #endif cmp $48, %ecx - jge L(48bytesormore) + jae L(48bytesormore) L(fwd_write_less32bytes): #ifndef USE_AS_MEMMOVE cmp %dl, %al - jl L(bk_write) + jb L(bk_write) #endif add %ecx, %edx add %ecx, %eax @@ -162,6 +160,7 @@ L(48bytesormore): movl %edx, %edi and $-16, %edx PUSH (%esi) + cfi_remember_state add $16, %edx movl %edi, %esi sub %edx, %edi @@ -181,7 +180,7 @@ L(48bytesormore): #endif mov %eax, %edi - jge L(large_page) + jae L(large_page) and $0xf, %edi jz L(shl_0) @@ -201,7 +200,7 @@ L(shl_0_loop): movdqa %xmm0, (%edx, %edi) movdqa %xmm1, 16(%edx, %edi) lea 32(%edi), %edi - jl L(shl_0_end) + jb L(shl_0_end) movdqa (%eax, %edi), %xmm0 movdqa 16(%eax, %edi), %xmm1 @@ -209,7 +208,7 @@ L(shl_0_loop): movdqa %xmm0, (%edx, %edi) movdqa %xmm1, 16(%edx, %edi) lea 32(%edi), %edi - jl L(shl_0_end) + jb L(shl_0_end) movdqa (%eax, %edi), %xmm0 movdqa 16(%eax, %edi), %xmm1 @@ -217,7 +216,7 @@ L(shl_0_loop): movdqa %xmm0, (%edx, %edi) movdqa %xmm1, 16(%edx, %edi) lea 32(%edi), %edi - jl L(shl_0_end) + jb L(shl_0_end) movdqa (%eax, %edi), %xmm0 movdqa 16(%eax, %edi), %xmm1 @@ -234,6 +233,8 @@ L(shl_0_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state L(shl_0_gobble): #ifdef DATA_CACHE_SIZE_HALF @@ -251,8 +252,8 @@ L(shl_0_gobble): shr $3, %esi sub %esi, %edi cmp %edi, %ecx - jge L(shl_0_gobble_mem_start) - lea -128(%ecx), %ecx + jae L(shl_0_gobble_mem_start) + sub $128, %ecx ALIGN (4) L(shl_0_gobble_cache_loop): movdqa (%eax), %xmm0 @@ -275,11 +276,10 @@ L(shl_0_gobble_cache_loop): movaps %xmm7, 0x70(%edx) lea 0x80(%edx), %edx - jge L(shl_0_gobble_cache_loop) -L(shl_0_gobble_cache_loop_tail): - cmp $-0x40, %ecx - lea 0x80(%ecx), %ecx - jl L(shl_0_cache_less_64bytes) + jae L(shl_0_gobble_cache_loop) + add $0x80, %ecx + cmp $0x40, %ecx + jb L(shl_0_cache_less_64bytes) movdqa (%eax), %xmm0 sub $0x40, %ecx @@ -297,7 +297,7 @@ L(shl_0_gobble_cache_loop_tail): add $0x40, %edx L(shl_0_cache_less_64bytes): cmp $0x20, %ecx - jl L(shl_0_cache_less_32bytes) + jb L(shl_0_cache_less_32bytes) movdqa (%eax), %xmm0 sub $0x20, %ecx movdqa 0x10(%eax), %xmm1 @@ -307,7 +307,7 @@ L(shl_0_cache_less_64bytes): add $0x20, %edx L(shl_0_cache_less_32bytes): cmp $0x10, %ecx - jl L(shl_0_cache_less_16bytes) + jb L(shl_0_cache_less_16bytes) sub $0x10, %ecx movdqa (%eax), %xmm0 add $0x10, %eax @@ -320,12 +320,13 @@ L(shl_0_cache_less_16bytes): POP (%edi) BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) - + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_0_gobble_mem_start): cmp %al, %dl je L(copy_page_by_rep) - lea -128(%ecx), %ecx + sub $128, %ecx L(shl_0_gobble_mem_loop): prefetchnta 0x1c0(%eax) prefetchnta 0x280(%eax) @@ -352,10 +353,10 @@ L(shl_0_gobble_mem_loop): movaps %xmm7, 0x70(%edx) lea 0x80(%edx), %edx - jge L(shl_0_gobble_mem_loop) - cmp $-0x40, %ecx - lea 0x80(%ecx), %ecx - jl L(shl_0_mem_less_64bytes) + jae L(shl_0_gobble_mem_loop) + add $0x80, %ecx + cmp $0x40, %ecx + jb L(shl_0_mem_less_64bytes) movdqa (%eax), %xmm0 sub $0x40, %ecx @@ -373,7 +374,7 @@ L(shl_0_gobble_mem_loop): add $0x40, %edx L(shl_0_mem_less_64bytes): cmp $0x20, %ecx - jl L(shl_0_mem_less_32bytes) + jb L(shl_0_mem_less_32bytes) movdqa (%eax), %xmm0 sub $0x20, %ecx movdqa 0x10(%eax), %xmm1 @@ -383,7 +384,7 @@ L(shl_0_mem_less_64bytes): add $0x20, %edx L(shl_0_mem_less_32bytes): cmp $0x10, %ecx - jl L(shl_0_mem_less_16bytes) + jb L(shl_0_mem_less_16bytes) sub $0x10, %ecx movdqa (%eax), %xmm0 add $0x10, %eax @@ -396,14 +397,15 @@ L(shl_0_mem_less_16bytes): POP (%edi) BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) - + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_1): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -1(%eax), %eax + sub $1, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_1_loop): @@ -418,7 +420,7 @@ L(shl_1_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_1_end) + jb L(shl_1_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -433,20 +435,22 @@ L(shl_1_loop): jae L(shl_1_loop) L(shl_1_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 1(%edi, %eax), %eax POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_2): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -2(%eax), %eax + sub $2, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_2_loop): @@ -461,7 +465,7 @@ L(shl_2_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_2_end) + jb L(shl_2_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -476,20 +480,22 @@ L(shl_2_loop): jae L(shl_2_loop) L(shl_2_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 2(%edi, %eax), %eax POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_3): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -3(%eax), %eax + sub $3, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_3_loop): @@ -504,7 +510,7 @@ L(shl_3_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_3_end) + jb L(shl_3_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -519,20 +525,22 @@ L(shl_3_loop): jae L(shl_3_loop) L(shl_3_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 3(%edi, %eax), %eax POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_4): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -4(%eax), %eax + sub $4, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_4_loop): @@ -547,7 +555,7 @@ L(shl_4_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_4_end) + jb L(shl_4_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -562,20 +570,22 @@ L(shl_4_loop): jae L(shl_4_loop) L(shl_4_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 4(%edi, %eax), %eax POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_5): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -5(%eax), %eax + sub $5, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_5_loop): @@ -590,7 +600,7 @@ L(shl_5_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_5_end) + jb L(shl_5_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -605,21 +615,22 @@ L(shl_5_loop): jae L(shl_5_loop) L(shl_5_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 5(%edi, %eax), %eax POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_6): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -6(%eax), %eax + sub $6, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_6_loop): @@ -634,7 +645,7 @@ L(shl_6_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_6_end) + jb L(shl_6_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -649,20 +660,22 @@ L(shl_6_loop): jae L(shl_6_loop) L(shl_6_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 6(%edi, %eax), %eax POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_7): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -7(%eax), %eax + sub $7, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_7_loop): @@ -677,7 +690,7 @@ L(shl_7_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_7_end) + jb L(shl_7_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -692,20 +705,22 @@ L(shl_7_loop): jae L(shl_7_loop) L(shl_7_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 7(%edi, %eax), %eax POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_8): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -8(%eax), %eax + sub $8, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_8_loop): @@ -720,7 +735,7 @@ L(shl_8_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_8_end) + jb L(shl_8_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -735,20 +750,22 @@ L(shl_8_loop): jae L(shl_8_loop) L(shl_8_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 8(%edi, %eax), %eax POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_9): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -9(%eax), %eax + sub $9, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_9_loop): @@ -763,7 +780,7 @@ L(shl_9_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_9_end) + jb L(shl_9_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -778,20 +795,22 @@ L(shl_9_loop): jae L(shl_9_loop) L(shl_9_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 9(%edi, %eax), %eax POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_10): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -10(%eax), %eax + sub $10, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_10_loop): @@ -806,7 +825,7 @@ L(shl_10_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_10_end) + jb L(shl_10_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -821,20 +840,22 @@ L(shl_10_loop): jae L(shl_10_loop) L(shl_10_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 10(%edi, %eax), %eax POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_11): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -11(%eax), %eax + sub $11, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_11_loop): @@ -849,7 +870,7 @@ L(shl_11_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_11_end) + jb L(shl_11_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -864,20 +885,22 @@ L(shl_11_loop): jae L(shl_11_loop) L(shl_11_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 11(%edi, %eax), %eax POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_12): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -12(%eax), %eax + sub $12, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_12_loop): @@ -892,7 +915,7 @@ L(shl_12_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_12_end) + jb L(shl_12_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -907,20 +930,22 @@ L(shl_12_loop): jae L(shl_12_loop) L(shl_12_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 12(%edi, %eax), %eax POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_13): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -13(%eax), %eax + sub $13, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_13_loop): @@ -935,7 +960,7 @@ L(shl_13_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_13_end) + jb L(shl_13_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -950,20 +975,22 @@ L(shl_13_loop): jae L(shl_13_loop) L(shl_13_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 13(%edi, %eax), %eax POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_14): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -14(%eax), %eax + sub $14, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_14_loop): @@ -978,7 +1005,7 @@ L(shl_14_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_14_end) + jb L(shl_14_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -993,21 +1020,22 @@ L(shl_14_loop): jae L(shl_14_loop) L(shl_14_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 14(%edi, %eax), %eax POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_15): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) - lea -15(%eax), %eax + sub $15, %eax movaps (%eax), %xmm1 xor %edi, %edi - lea -32(%ecx), %ecx + sub $32, %ecx movdqu %xmm0, (%esi) POP (%esi) L(shl_15_loop): @@ -1022,7 +1050,7 @@ L(shl_15_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_15_end) + jb L(shl_15_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -1037,7 +1065,7 @@ L(shl_15_loop): jae L(shl_15_loop) L(shl_15_end): - lea 32(%ecx), %ecx + add $32, %ecx add %ecx, %edi add %edi, %edx lea 15(%edi, %eax), %eax @@ -1241,20 +1269,23 @@ L(fwd_write_3bytes): movl DEST(%esp), %eax # endif #endif - RETURN + RETURN_END + cfi_restore_state + cfi_remember_state ALIGN (4) L(large_page): movdqu (%eax), %xmm1 - lea 16(%eax), %eax movdqu %xmm0, (%esi) movntdq %xmm1, (%edx) - lea 16(%edx), %edx + add $0x10, %eax + add $0x10, %edx + sub $0x10, %ecx cmp %al, %dl je L(copy_page_by_rep) L(large_page_loop_init): POP (%esi) - lea -0x90(%ecx), %ecx + sub $0x80, %ecx POP (%edi) L(large_page_loop): prefetchnta 0x1c0(%eax) @@ -1280,9 +1311,9 @@ L(large_page_loop): movntdq %xmm7, 0x70(%edx) lea 0x80(%edx), %edx jae L(large_page_loop) - cmp $-0x40, %ecx - lea 0x80(%ecx), %ecx - jl L(large_page_less_64bytes) + add $0x80, %ecx + cmp $0x40, %ecx + jb L(large_page_less_64bytes) movdqu (%eax), %xmm0 movdqu 0x10(%eax), %xmm1 @@ -1298,7 +1329,7 @@ L(large_page_loop): sub $0x40, %ecx L(large_page_less_64bytes): cmp $32, %ecx - jl L(large_page_less_32bytes) + jb L(large_page_less_32bytes) movdqu (%eax), %xmm0 movdqu 0x10(%eax), %xmm1 lea 0x20(%eax), %eax @@ -1312,6 +1343,8 @@ L(large_page_less_32bytes): sfence BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(copy_page_by_rep): mov %eax, %esi @@ -1658,18 +1691,18 @@ L(table_48_bytes_bwd): L(copy_backward): PUSH (%esi) movl %eax, %esi - lea (%ecx,%edx,1),%edx - lea (%ecx,%esi,1),%esi + add %ecx, %edx + add %ecx, %esi testl $0x3, %edx jnz L(bk_align) L(bk_aligned_4): cmp $64, %ecx - jge L(bk_write_more64bytes) + jae L(bk_write_more64bytes) L(bk_write_64bytesless): cmp $32, %ecx - jl L(bk_write_less32bytes) + jb L(bk_write_less32bytes) L(bk_write_more32bytes): /* Copy 32 bytes at a time. */ @@ -1698,13 +1731,14 @@ L(bk_write_less32bytes): sub %ecx, %edx sub %ecx, %eax POP (%esi) -L(bk_write_less32bytes_2): +L(bk_write_less48bytes): BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) + CFI_PUSH (%esi) ALIGN (4) L(bk_align): cmp $8, %ecx - jle L(bk_write_less32bytes) + jbe L(bk_write_less32bytes) testl $1, %edx /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0, then (EDX & 2) must be != 0. */ @@ -1760,7 +1794,7 @@ L(bk_ssse3_align): L(bk_ssse3_cpy_pre): cmp $64, %ecx - jl L(bk_write_more32bytes) + jb L(bk_write_more32bytes) L(bk_ssse3_cpy): sub $64, %esi @@ -1775,7 +1809,7 @@ L(bk_ssse3_cpy): movdqu (%esi), %xmm0 movdqa %xmm0, (%edx) cmp $64, %ecx - jge L(bk_ssse3_cpy) + jae L(bk_ssse3_cpy) jmp L(bk_write_64bytesless) #endif diff --git a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S index 749c82d379..ec9eeb95e4 100644 --- a/sysdeps/i386/i686/multiarch/memcpy-ssse3.S +++ b/sysdeps/i386/i686/multiarch/memcpy-ssse3.S @@ -128,7 +128,7 @@ ENTRY (MEMCPY) jb L(copy_forward) je L(fwd_write_0bytes) cmp $32, %ecx - jge L(memmove_bwd) + jae L(memmove_bwd) jmp L(bk_write_less32bytes_2) L(memmove_bwd): add %ecx, %eax @@ -139,12 +139,12 @@ L(memmove_bwd): L(copy_forward): #endif cmp $48, %ecx - jge L(48bytesormore) + jae L(48bytesormore) L(fwd_write_less32bytes): #ifndef USE_AS_MEMMOVE cmp %dl, %al - jl L(bk_write) + jb L(bk_write) #endif add %ecx, %edx add %ecx, %eax @@ -162,6 +162,7 @@ L(48bytesormore): movl %edx, %edi and $-16, %edx PUSH (%esi) + cfi_remember_state add $16, %edx movl %edi, %esi sub %edx, %edi @@ -181,12 +182,14 @@ L(48bytesormore): #endif mov %eax, %edi - jge L(large_page) + jae L(large_page) and $0xf, %edi jz L(shl_0) BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %edi, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_0): movdqu %xmm0, (%esi) @@ -202,7 +205,7 @@ L(shl_0_loop): movdqa %xmm0, (%edx, %edi) movdqa %xmm1, 16(%edx, %edi) lea 32(%edi), %edi - jl L(shl_0_end) + jb L(shl_0_end) movdqa (%eax, %edi), %xmm0 movdqa 16(%eax, %edi), %xmm1 @@ -210,7 +213,7 @@ L(shl_0_loop): movdqa %xmm0, (%edx, %edi) movdqa %xmm1, 16(%edx, %edi) lea 32(%edi), %edi - jl L(shl_0_end) + jb L(shl_0_end) movdqa (%eax, %edi), %xmm0 movdqa 16(%eax, %edi), %xmm1 @@ -218,7 +221,7 @@ L(shl_0_loop): movdqa %xmm0, (%edx, %edi) movdqa %xmm1, 16(%edx, %edi) lea 32(%edi), %edi - jl L(shl_0_end) + jb L(shl_0_end) movdqa (%eax, %edi), %xmm0 movdqa 16(%eax, %edi), %xmm1 @@ -234,6 +237,7 @@ L(shl_0_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) + CFI_PUSH (%edi) L(shl_0_gobble): #ifdef DATA_CACHE_SIZE_HALF @@ -250,7 +254,7 @@ L(shl_0_gobble): POP (%edi) lea -128(%ecx), %ecx - jge L(shl_0_gobble_mem_loop) + jae L(shl_0_gobble_mem_loop) L(shl_0_gobble_cache_loop): movdqa (%eax), %xmm0 movdqa 0x10(%eax), %xmm1 @@ -272,8 +276,7 @@ L(shl_0_gobble_cache_loop): movdqa %xmm7, 0x70(%edx) lea 0x80(%edx), %edx - jge L(shl_0_gobble_cache_loop) -L(shl_0_gobble_cache_loop_tail): + jae L(shl_0_gobble_cache_loop) cmp $-0x40, %ecx lea 0x80(%ecx), %ecx jl L(shl_0_cache_less_64bytes) @@ -294,7 +297,7 @@ L(shl_0_gobble_cache_loop_tail): add $0x40, %edx L(shl_0_cache_less_64bytes): cmp $0x20, %ecx - jl L(shl_0_cache_less_32bytes) + jb L(shl_0_cache_less_32bytes) movdqa (%eax), %xmm0 sub $0x20, %ecx movdqa 0x10(%eax), %xmm1 @@ -304,7 +307,7 @@ L(shl_0_cache_less_64bytes): add $0x20, %edx L(shl_0_cache_less_32bytes): cmp $0x10, %ecx - jl L(shl_0_cache_less_16bytes) + jb L(shl_0_cache_less_16bytes) sub $0x10, %ecx movdqa (%eax), %xmm0 add $0x10, %eax @@ -342,7 +345,7 @@ L(shl_0_gobble_mem_loop): movdqa %xmm7, 0x70(%edx) lea 0x80(%edx), %edx - jge L(shl_0_gobble_mem_loop) + jae L(shl_0_gobble_mem_loop) cmp $-0x40, %ecx lea 0x80(%ecx), %ecx jl L(shl_0_mem_less_64bytes) @@ -363,7 +366,7 @@ L(shl_0_gobble_mem_loop): add $0x40, %edx L(shl_0_mem_less_64bytes): cmp $0x20, %ecx - jl L(shl_0_mem_less_32bytes) + jb L(shl_0_mem_less_32bytes) movdqa (%eax), %xmm0 sub $0x20, %ecx movdqa 0x10(%eax), %xmm1 @@ -373,7 +376,7 @@ L(shl_0_mem_less_64bytes): add $0x20, %edx L(shl_0_mem_less_32bytes): cmp $0x10, %ecx - jl L(shl_0_mem_less_16bytes) + jb L(shl_0_mem_less_16bytes) sub $0x10, %ecx movdqa (%eax), %xmm0 add $0x10, %eax @@ -384,7 +387,8 @@ L(shl_0_mem_less_16bytes): add %ecx, %eax BRANCH_TO_JMPTBL_ENTRY (L(table_48bytes_fwd), %ecx, 4) - + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_1): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -406,7 +410,7 @@ L(shl_1_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_1_end) + jb L(shl_1_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -428,6 +432,8 @@ L(shl_1_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_2): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -449,7 +455,7 @@ L(shl_2_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_2_end) + jb L(shl_2_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -471,6 +477,8 @@ L(shl_2_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_3): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -492,7 +500,7 @@ L(shl_3_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_3_end) + jb L(shl_3_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -514,6 +522,8 @@ L(shl_3_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_4): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -535,7 +545,7 @@ L(shl_4_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_4_end) + jb L(shl_4_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -557,6 +567,8 @@ L(shl_4_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_5): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -578,7 +590,7 @@ L(shl_5_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_5_end) + jb L(shl_5_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -600,7 +612,8 @@ L(shl_5_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_6): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -622,7 +635,7 @@ L(shl_6_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_6_end) + jb L(shl_6_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -644,6 +657,8 @@ L(shl_6_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_7): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -665,7 +680,7 @@ L(shl_7_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_7_end) + jb L(shl_7_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -687,6 +702,8 @@ L(shl_7_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_8): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -708,7 +725,7 @@ L(shl_8_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_8_end) + jb L(shl_8_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -730,6 +747,8 @@ L(shl_8_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_9): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -751,7 +770,7 @@ L(shl_9_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_9_end) + jb L(shl_9_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -773,6 +792,8 @@ L(shl_9_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_10): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -794,7 +815,7 @@ L(shl_10_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_10_end) + jb L(shl_10_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -816,6 +837,8 @@ L(shl_10_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_11): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -837,7 +860,7 @@ L(shl_11_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_11_end) + jb L(shl_11_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -859,6 +882,8 @@ L(shl_11_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_12): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -880,7 +905,7 @@ L(shl_12_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_12_end) + jb L(shl_12_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -902,6 +927,8 @@ L(shl_12_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_13): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -923,7 +950,7 @@ L(shl_13_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_13_end) + jb L(shl_13_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -945,6 +972,8 @@ L(shl_13_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_14): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -966,7 +995,7 @@ L(shl_14_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_14_end) + jb L(shl_14_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -988,7 +1017,8 @@ L(shl_14_end): POP (%edi) BRANCH_TO_JMPTBL_ENTRY_TAIL(L(table_48bytes_fwd), %ecx, 4) - + cfi_restore_state + cfi_remember_state ALIGN (4) L(shl_15): BRANCH_TO_JMPTBL_ENTRY_VALUE(L(table_48bytes_fwd)) @@ -1010,7 +1040,7 @@ L(shl_15_loop): movdqa %xmm2, -32(%edx, %edi) movdqa %xmm3, -16(%edx, %edi) - jl L(shl_15_end) + jb L(shl_15_end) movdqa 16(%eax, %edi), %xmm2 sub $32, %ecx @@ -1229,8 +1259,10 @@ L(fwd_write_3bytes): movl DEST(%esp), %eax # endif #endif - RETURN + RETURN_END + cfi_restore_state + cfi_remember_state ALIGN (4) L(large_page): movdqu (%eax), %xmm1 @@ -1281,7 +1313,7 @@ L(large_page_loop): sub $0x40, %ecx L(large_page_less_64bytes): cmp $32, %ecx - jl L(large_page_less_32bytes) + jb L(large_page_less_32bytes) movdqu (%eax), %xmm0 movdqu 0x10(%eax), %xmm1 lea 0x20(%eax), %eax @@ -1617,11 +1649,11 @@ L(copy_backward): L(bk_aligned_4): cmp $64, %ecx - jge L(bk_write_more64bytes) + jae L(bk_write_more64bytes) L(bk_write_64bytesless): cmp $32, %ecx - jl L(bk_write_less32bytes) + jb L(bk_write_less32bytes) L(bk_write_more32bytes): /* Copy 32 bytes at a time. */ @@ -1653,10 +1685,11 @@ L(bk_write_less32bytes): L(bk_write_less32bytes_2): BRANCH_TO_JMPTBL_ENTRY (L(table_48_bytes_bwd), %ecx, 4) + CFI_PUSH (%esi) ALIGN (4) L(bk_align): cmp $8, %ecx - jle L(bk_write_less32bytes) + jbe L(bk_write_less32bytes) testl $1, %edx /* We get here only if (EDX & 3 ) != 0 so if (EDX & 1) ==0, then (EDX & 2) must be != 0. */ @@ -1712,7 +1745,7 @@ L(bk_ssse3_align): L(bk_ssse3_cpy_pre): cmp $64, %ecx - jl L(bk_write_more32bytes) + jb L(bk_write_more32bytes) L(bk_ssse3_cpy): sub $64, %esi @@ -1727,7 +1760,7 @@ L(bk_ssse3_cpy): movdqu (%esi), %xmm0 movdqa %xmm0, (%edx) cmp $64, %ecx - jge L(bk_ssse3_cpy) + jae L(bk_ssse3_cpy) jmp L(bk_write_64bytesless) #endif diff --git a/sysdeps/i386/i686/multiarch/memset-sse2-rep.S b/sysdeps/i386/i686/multiarch/memset-sse2-rep.S index 84afffeb66..f9a0b13d0c 100644 --- a/sysdeps/i386/i686/multiarch/memset-sse2-rep.S +++ b/sysdeps/i386/i686/multiarch/memset-sse2-rep.S @@ -243,7 +243,6 @@ L(32bytesormore): pxor %xmm0, %xmm0 #else movd %eax, %xmm0 - punpcklbw %xmm0, %xmm0 pshufd $0, %xmm0, %xmm0 #endif testl $0xf, %edx @@ -261,7 +260,7 @@ L(not_aligned_16): ALIGN (4) L(aligned_16): cmp $128, %ecx - jge L(128bytesormore) + jae L(128bytesormore) L(aligned_16_less128bytes): BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) @@ -293,7 +292,7 @@ L(128bytesormore): * fast string will prefetch and combine data efficiently. */ cmp %edi, %ecx - jae L(128bytesormore_nt) + jae L(128bytesormore_endof_L1) subl $128, %ecx L(128bytesormore_normal): sub $128, %ecx @@ -306,7 +305,7 @@ L(128bytesormore_normal): movdqa %xmm0, 0x60(%edx) movdqa %xmm0, 0x70(%edx) lea 128(%edx), %edx - jl L(128bytesless_normal) + jb L(128bytesless_normal) sub $128, %ecx @@ -319,15 +318,16 @@ L(128bytesormore_normal): movdqa %xmm0, 0x60(%edx) movdqa %xmm0, 0x70(%edx) lea 128(%edx), %edx - jge L(128bytesormore_normal) + jae L(128bytesormore_normal) L(128bytesless_normal): POP (%edi) - lea 128(%ecx), %ecx + add $128, %ecx BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + CFI_PUSH (%edi) ALIGN (4) -L(128bytesormore_nt): +L(128bytesormore_endof_L1): mov %edx, %edi mov %ecx, %edx shr $2, %ecx diff --git a/sysdeps/i386/i686/multiarch/memset-sse2.S b/sysdeps/i386/i686/multiarch/memset-sse2.S index b2b979193e..92ad601bf2 100644 --- a/sysdeps/i386/i686/multiarch/memset-sse2.S +++ b/sysdeps/i386/i686/multiarch/memset-sse2.S @@ -243,7 +243,6 @@ L(32bytesormore): pxor %xmm0, %xmm0 #else movd %eax, %xmm0 - punpcklbw %xmm0, %xmm0 pshufd $0, %xmm0, %xmm0 #endif testl $0xf, %edx @@ -261,7 +260,7 @@ L(not_aligned_16): ALIGN (4) L(aligned_16): cmp $128, %ecx - jge L(128bytesormore) + jae L(128bytesormore) L(aligned_16_less128bytes): BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) @@ -287,14 +286,17 @@ L(128bytesormore): #ifdef DATA_CACHE_SIZE POP (%ebx) +# define RESTORE_EBX_STATE CFI_PUSH (%ebx) cmp $DATA_CACHE_SIZE, %ecx #else # ifdef SHARED +# define RESTORE_EBX_STATE call __i686.get_pc_thunk.bx add $_GLOBAL_OFFSET_TABLE_, %ebx cmp __x86_data_cache_size@GOTOFF(%ebx), %ecx # else POP (%ebx) +# define RESTORE_EBX_STATE CFI_PUSH (%ebx) cmp __x86_data_cache_size, %ecx # endif #endif @@ -312,7 +314,7 @@ L(128bytesormore_normal): movdqa %xmm0, 0x60(%edx) movdqa %xmm0, 0x70(%edx) lea 128(%edx), %edx - jl L(128bytesless_normal) + jb L(128bytesless_normal) sub $128, %ecx @@ -325,10 +327,10 @@ L(128bytesormore_normal): movdqa %xmm0, 0x60(%edx) movdqa %xmm0, 0x70(%edx) lea 128(%edx), %edx - jge L(128bytesormore_normal) + jae L(128bytesormore_normal) L(128bytesless_normal): - lea 128(%ecx), %ecx + add $128, %ecx BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) ALIGN (4) @@ -346,11 +348,12 @@ L(128bytes_L2_normal): movaps %xmm0, 0x70(%edx) add $128, %edx cmp $128, %ecx - jge L(128bytes_L2_normal) + jae L(128bytes_L2_normal) L(128bytesless_L2_normal): BRANCH_TO_JMPTBL_ENTRY (L(table_16_128bytes)) + RESTORE_EBX_STATE L(128bytesormore_nt_start): sub %ebx, %ecx ALIGN (4) @@ -368,7 +371,7 @@ L(128bytesormore_shared_cache_loop): movdqa %xmm0, 0x70(%edx) add $0x80, %edx cmp $0x80, %ebx - jge L(128bytesormore_shared_cache_loop) + jae L(128bytesormore_shared_cache_loop) cmp $0x80, %ecx jb L(shared_cache_loop_end) ALIGN (4) @@ -384,7 +387,7 @@ L(128bytesormore_nt): movntdq %xmm0, 0x70(%edx) add $0x80, %edx cmp $0x80, %ecx - jge L(128bytesormore_nt) + jae L(128bytesormore_nt) sfence L(shared_cache_loop_end): #if defined DATA_CACHE_SIZE || !defined SHARED diff --git a/sysdeps/i386/i686/multiarch/strcmp-sse4.S b/sysdeps/i386/i686/multiarch/strcmp-sse4.S index d5fd23e15c..81d6ec66f7 100644 --- a/sysdeps/i386/i686/multiarch/strcmp-sse4.S +++ b/sysdeps/i386/i686/multiarch/strcmp-sse4.S @@ -178,7 +178,9 @@ L(first4bytes): PUSH (%ebx) PUSH (%edi) PUSH (%esi) +#ifdef USE_AS_STRNCMP cfi_remember_state +#endif mov %edx, %edi mov %eax, %esi xorl %eax, %eax @@ -246,8 +248,8 @@ L(ret): ret .p2align 4 - cfi_restore_state #ifdef USE_AS_STRNCMP + cfi_restore_state L(more16byteseq): POP (%esi) POP (%edi) diff --git a/sysdeps/x86_64/Implies b/sysdeps/x86_64/Implies index 2b8412b0b6..2e0a323e13 100644 --- a/sysdeps/x86_64/Implies +++ b/sysdeps/x86_64/Implies @@ -1,4 +1,5 @@ wordsize-64 ieee754/ldbl-96 +ieee754/dbl-64/wordsize-64 ieee754/dbl-64 ieee754/flt-32 |