summary refs log tree commit diff
path: root/sysdeps/s390/s390-64/chacha20-s390x.S
diff options
context:
space:
mode:
Diffstat (limited to 'sysdeps/s390/s390-64/chacha20-s390x.S')
-rw-r--r-- sysdeps/s390/s390-64/chacha20-s390x.S 573
1 files changed, 0 insertions, 573 deletions
diff --git a/sysdeps/s390/s390-64/chacha20-s390x.S b/sysdeps/s390/s390-64/chacha20-s390x.S
deleted file mode 100644
index e38504d370..0000000000
--- a/sysdeps/s390/s390-64/chacha20-s390x.S
+++ /dev/null
@@ -1,573 +0,0 @@
-/* Optimized s390x implementation of ChaCha20 cipher.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-/* chacha20-s390x.S  -  zSeries implementation of ChaCha20 cipher
-
-   Copyright (C) 2020 Jussi Kivilinna <jussi.kivilinna@iki.fi>
-
-   This file is part of Libgcrypt.
-
-   Libgcrypt is free software; you can redistribute it and/or modify
-   it under the terms of the GNU Lesser General Public License as
-   published by the Free Software Foundation; either version 2.1 of
-   the License, or (at your option) any later version.
-
-   Libgcrypt is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-   GNU Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with this program; if not, see <https://www.gnu.org/licenses/>.
- */
-
-#include <sysdep.h>
-
-#ifdef HAVE_S390_VX_ASM_SUPPORT
-
-/* CFA expressions are used to point the CFA at an SP-relative stack slot;
- * DW_REGNO_SP is the register number passed to cfi_def_cfa_register for %r15. */
-# define DW_REGNO_SP 15
-
-/* Fixed-length LEB128 encodings: 1 byte for 7-bit values, 4 bytes (0x80 continuation bit on the first three) for 28-bit values. */
-# define DW_SLEB128_7BIT(value) \
-        0x00|((value) & 0x7f)
-# define DW_SLEB128_28BIT(value) \
-        0x80|((value)&0x7f), \
-        0x80|(((value)>>7)&0x7f), \
-        0x80|(((value)>>14)&0x7f), \
-        0x00|(((value)>>21)&0x7f)
-
-# define cfi_cfa_on_stack(rsp_offs,cfa_depth) /* CFA = *(%r15 + rsp_offs) + (cfa_depth) + 160 */ \
-        .cfi_escape \
-          0x0f, /* DW_CFA_def_cfa_expression */ \
-            DW_SLEB128_7BIT(11), /* length: 1 + 4 + 1 + 1 + 4 expression bytes follow */ \
-          0x7f, /* DW_OP_breg15: %r15 (stack pointer) + constant */ \
-            DW_SLEB128_28BIT(rsp_offs), \
-          0x06, /* DW_OP_deref */ \
-          0x23, /* DW_OP_plus_constu */ \
-            DW_SLEB128_28BIT((cfa_depth)+160)
-
-.machine "z13+vx"
-.text
-
-.balign 16
-.Lconsts: /* table base; entries are addressed as (.Lxxx - .Lconsts)(%r7) */
-.Lwordswap: /* byte selector that reverses the order of the four 32-bit words; not referenced in the code shown */
-	.byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
-.Lbswap128: /* byte selector that reverses all 16 bytes; not referenced in the code shown */
-	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
-.Lbswap32: /* byte selector that reverses the bytes inside each 32-bit word; vperm mask applied to the keystream before the XOR */
-	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
-.Lone: /* not referenced in the code shown */
-	.long 0, 0, 0, 1
-.Ladd_counter_0123: /* per-lane counter increments for blocks 0-3 */
-	.long 0, 1, 2, 3
-.Ladd_counter_4567: /* per-lane counter increments for blocks 4-7 */
-	.long 4, 5, 6, 7
-
-/* register macros: roles of the general-purpose registers inside __chacha20_s390x_vx_blocks8 */
-#define INPUT %r2 /* pointer to the 16-word state; the counter at byte offset 12 * 4 is updated in place */
-#define DST   %r3 /* output pointer, advanced by 256 after each group of four blocks */
-#define SRC   %r4 /* source pointer, advanced by 256 after each group of four blocks */
-#define NBLKS %r0 /* blocks still to process (copied from %r5 on entry) */
-#define ROUND %r1 /* double-round countdown; reused as the "blocks 4-7 still pending" flag during output */
-
-/* stack structure: byte sizes of the frame areas, then their offsets from the aligned %r15 set up by START_STACK */
-
-#define STACK_FRAME_STD    (8 * 16 + 8 * 4) /* 160 bytes at the bottom of the frame; START_STACK stores the caller's SP at offset 0 */
-#define STACK_FRAME_F8_F15 (8 * 8) /* eight 8-byte save slots for %f8-%f15 */
-#define STACK_FRAME_Y0_Y15 (16 * 16) /* spill area for sixteen 16-byte vectors */
-#define STACK_FRAME_CTR    (4 * 16) /* four vectors holding the per-block counter words */
-#define STACK_FRAME_PARAMS (6 * 8) /* six 8-byte slots (STACK_INPUT .. STACK_POSRC) */
-
-#define STACK_MAX   (STACK_FRAME_STD + STACK_FRAME_F8_F15 + \
-		     STACK_FRAME_Y0_Y15 + STACK_FRAME_CTR + \
-		     STACK_FRAME_PARAMS)
-
-#define STACK_F8     (STACK_MAX - STACK_FRAME_F8_F15) /* FPR save slots occupy the top of the frame */
-#define STACK_F9     (STACK_F8 + 8)
-#define STACK_F10    (STACK_F9 + 8)
-#define STACK_F11    (STACK_F10 + 8)
-#define STACK_F12    (STACK_F11 + 8)
-#define STACK_F13    (STACK_F12 + 8)
-#define STACK_F14    (STACK_F13 + 8)
-#define STACK_F15    (STACK_F14 + 8)
-#define STACK_Y0_Y15 (STACK_F8 - STACK_FRAME_Y0_Y15) /* vector spill area, directly below the FPR slots */
-#define STACK_CTR    (STACK_Y0_Y15 - STACK_FRAME_CTR) /* counter vectors, directly below the spill area */
-#define STACK_INPUT  (STACK_CTR - STACK_FRAME_PARAMS) /* evaluates to STACK_FRAME_STD; the parameter slots below are not referenced in the code shown */
-#define STACK_DST    (STACK_INPUT + 8)
-#define STACK_SRC    (STACK_DST + 8)
-#define STACK_NBLKS  (STACK_SRC + 8)
-#define STACK_POCTX  (STACK_NBLKS + 8)
-#define STACK_POSRC  (STACK_POCTX + 8)
-
-#define STACK_G0_H3  STACK_Y0_Y15 /* alias of the spill area; not referenced in the code shown */
-
-/* vector registers: A0-D3 name %v0-%v15 and E0-H3 name %v16-%v31; every later group is an alias of these */
-#define A0 %v0
-#define A1 %v1
-#define A2 %v2
-#define A3 %v3
-
-#define B0 %v4
-#define B1 %v5
-#define B2 %v6
-#define B3 %v7
-
-#define C0 %v8
-#define C1 %v9
-#define C2 %v10
-#define C3 %v11
-
-#define D0 %v12
-#define D1 %v13
-#define D2 %v14
-#define D3 %v15
-
-#define E0 %v16
-#define E1 %v17
-#define E2 %v18
-#define E3 %v19
-
-#define F0 %v20
-#define F1 %v21
-#define F2 %v22
-#define F3 %v23
-
-#define G0 %v24
-#define G1 %v25
-#define G2 %v26
-#define G3 %v27
-
-#define H0 %v28
-#define H1 %v29
-#define H2 %v30
-#define H3 %v31
-
-#define IO0 E0 /* the IO*, S* and TMP* aliases are not referenced in the code shown */
-#define IO1 E1
-#define IO2 E2
-#define IO3 E3
-#define IO4 F0
-#define IO5 F1
-#define IO6 F2
-#define IO7 F3
-
-#define S0 G0
-#define S1 G1
-#define S2 G2
-#define S3 G3
-
-#define TMP0 H0
-#define TMP1 H1
-#define TMP2 H2
-#define TMP3 H3
-
-#define X0 A0 /* X0-X15 (%v0-%v15): state words 0-15 of blocks 0-3, one block per 32-bit lane */
-#define X1 A1
-#define X2 A2
-#define X3 A3
-#define X4 B0
-#define X5 B1
-#define X6 B2
-#define X7 B3
-#define X8 C0
-#define X9 C1
-#define X10 C2
-#define X11 C3
-#define X12 D0
-#define X13 D1
-#define X14 D2
-#define X15 D3
-
-#define Y0 E0 /* Y0-Y15 (%v16-%v31): state words 0-15 of blocks 4-7; reused as scratch while writing output */
-#define Y1 E1
-#define Y2 E2
-#define Y3 E3
-#define Y4 F0
-#define Y5 F1
-#define Y6 F2
-#define Y7 F3
-#define Y8 G0
-#define Y9 G1
-#define Y10 G2
-#define Y11 G3
-#define Y12 H0
-#define Y13 H1
-#define Y14 H2
-#define Y15 H3
-
-/**********************************************************************
-  helper macros
- **********************************************************************/
-
-#define _ /*_*/ /* expands to nothing; not referenced in the code shown */
-
-#define START_STACK(last_r) /* save %r6..last_r and %f8-%f15, then switch %r15 to a 16-byte aligned frame of at least STACK_MAX bytes; clobbers %r0 and %r1 */ \
-	lgr %r0, %r15; /* r0 = entry SP */ \
-	lghi %r1, ~15; /* alignment mask */ \
-	stmg %r6, last_r, 6 * 8(%r15); /* save GPRs at entry SP + 48 */ \
-	aghi %r0, -STACK_MAX; \
-	ngr %r0, %r1; /* round the new SP down to a 16-byte boundary */ \
-	lgr %r1, %r15; /* r1 = entry SP, used as CFA base while %r15 changes */ \
-	cfi_def_cfa_register(1); \
-	lgr %r15, %r0; \
-	stg %r1, 0(%r15); /* entry SP kept at offset 0 of the new frame; END_STACK and the CFA expression read it back */ \
-	cfi_cfa_on_stack(0, 0); \
-	std %f8, STACK_F8(%r15); \
-	std %f9, STACK_F9(%r15); \
-	std %f10, STACK_F10(%r15); \
-	std %f11, STACK_F11(%r15); \
-	std %f12, STACK_F12(%r15); \
-	std %f13, STACK_F13(%r15); \
-	std %f14, STACK_F14(%r15); \
-	std %f15, STACK_F15(%r15);
-
-#define END_STACK(last_r) /* undo START_STACK; last_r must match the START_STACK argument; clobbers %r1 */ \
-	lg %r1, 0(%r15); /* r1 = entry SP stored by START_STACK */ \
-	ld %f8, STACK_F8(%r15); \
-	ld %f9, STACK_F9(%r15); \
-	ld %f10, STACK_F10(%r15); \
-	ld %f11, STACK_F11(%r15); \
-	ld %f12, STACK_F12(%r15); \
-	ld %f13, STACK_F13(%r15); \
-	ld %f14, STACK_F14(%r15); \
-	ld %f15, STACK_F15(%r15); \
-	lmg %r6, last_r, 6 * 8(%r1); /* reload GPRs from entry SP + 48 */ \
-	lgr %r15, %r1; /* back on the caller's stack */ \
-	cfi_def_cfa_register(DW_REGNO_SP);
-
-#define PLUS(dst,src) /* dst += src on each 32-bit element */ \
-	vaf dst, dst, src;
-
-#define XOR(dst,src) /* dst ^= src */ \
-	vx dst, dst, src;
-
-#define ROTATE(v1,c) /* rotate each 32-bit element of v1 left by c bits */ \
-	verllf v1, v1, (c)(0);
-
-#define WORD_ROTATE(v1,s) /* rotate the whole vector left by s 32-bit words; not referenced in the code shown */ \
-	vsldb v1, v1, v1, ((s) * 4);
-
-#define DST_8(OPER, I, J) /* apply OPER(reg, J) to A##I .. H##I; not referenced in the code shown */ \
-	OPER(A##I, J); OPER(B##I, J); OPER(C##I, J); OPER(D##I, J); \
-	OPER(E##I, J); OPER(F##I, J); OPER(G##I, J); OPER(H##I, J);
-
-/**********************************************************************
-  round macros
- **********************************************************************/
-
-/**********************************************************************
-  8-way chacha20 ("vertical"): each vector holds one state word for four blocks; the X and Y register sets together cover eight blocks
- **********************************************************************/
-
-#define QUARTERROUND4_V8_POLY(x0,x1,x2,x3,x4,x5,x6,x7,\
-			      x8,x9,x10,x11,x12,x13,x14,x15,\
-			      y0,y1,y2,y3,y4,y5,y6,y7,\
-			      y8,y9,y10,y11,y12,y13,y14,y15,\
-			      op1,op2,op3,op4,op5,op6,op7,op8,\
-			      op9,op10,op11,op12) \
-	op1; /* op1..op12: caller-supplied statements interleaved between the steps (all empty in QUARTERROUND4_V8) */	\
-	PLUS(x0, x1); PLUS(x4, x5); /* four quarter rounds (a,b,c,d) per register set, arguments grouped in fours: a += b */	\
-	PLUS(x8, x9); PLUS(x12, x13);				\
-	PLUS(y0, y1); PLUS(y4, y5);				\
-	PLUS(y8, y9); PLUS(y12, y13);				\
-	    op2;						\
-	    XOR(x3, x0);  XOR(x7, x4); /* d ^= a */		\
-	    XOR(x11, x8); XOR(x15, x12);			\
-	    XOR(y3, y0);  XOR(y7, y4);				\
-	    XOR(y11, y8); XOR(y15, y12);			\
-		op3;						\
-		ROTATE(x3, 16); ROTATE(x7, 16); /* d <<<= 16 */	\
-		ROTATE(x11, 16); ROTATE(x15, 16);		\
-		ROTATE(y3, 16); ROTATE(y7, 16);			\
-		ROTATE(y11, 16); ROTATE(y15, 16);		\
-	op4;							\
-	PLUS(x2, x3); PLUS(x6, x7); /* c += d */		\
-	PLUS(x10, x11); PLUS(x14, x15);				\
-	PLUS(y2, y3); PLUS(y6, y7);				\
-	PLUS(y10, y11); PLUS(y14, y15);				\
-	    op5;						\
-	    XOR(x1, x2); XOR(x5, x6); /* b ^= c */		\
-	    XOR(x9, x10); XOR(x13, x14);			\
-	    XOR(y1, y2); XOR(y5, y6);				\
-	    XOR(y9, y10); XOR(y13, y14);			\
-		op6;						\
-		ROTATE(x1,12); ROTATE(x5,12); /* b <<<= 12 */	\
-		ROTATE(x9,12); ROTATE(x13,12);			\
-		ROTATE(y1,12); ROTATE(y5,12);			\
-		ROTATE(y9,12); ROTATE(y13,12);			\
-	op7;							\
-	PLUS(x0, x1); PLUS(x4, x5); /* a += b */		\
-	PLUS(x8, x9); PLUS(x12, x13);				\
-	PLUS(y0, y1); PLUS(y4, y5);				\
-	PLUS(y8, y9); PLUS(y12, y13);				\
-	    op8;						\
-	    XOR(x3, x0); XOR(x7, x4); /* d ^= a */		\
-	    XOR(x11, x8); XOR(x15, x12);			\
-	    XOR(y3, y0); XOR(y7, y4);				\
-	    XOR(y11, y8); XOR(y15, y12);			\
-		op9;						\
-		ROTATE(x3,8); ROTATE(x7,8); /* d <<<= 8 */	\
-		ROTATE(x11,8); ROTATE(x15,8);			\
-		ROTATE(y3,8); ROTATE(y7,8);			\
-		ROTATE(y11,8); ROTATE(y15,8);			\
-	op10;							\
-	PLUS(x2, x3); PLUS(x6, x7); /* c += d */		\
-	PLUS(x10, x11); PLUS(x14, x15);				\
-	PLUS(y2, y3); PLUS(y6, y7);				\
-	PLUS(y10, y11); PLUS(y14, y15);				\
-	    op11;						\
-	    XOR(x1, x2); XOR(x5, x6); /* b ^= c */		\
-	    XOR(x9, x10); XOR(x13, x14);			\
-	    XOR(y1, y2); XOR(y5, y6);				\
-	    XOR(y9, y10); XOR(y13, y14);			\
-		op12;						\
-		ROTATE(x1,7); ROTATE(x5,7); /* b <<<= 7 */	\
-		ROTATE(x9,7); ROTATE(x13,7);			\
-		ROTATE(y1,7); ROTATE(y5,7);			\
-		ROTATE(y9,7); ROTATE(y13,7);
-
-#define QUARTERROUND4_V8(x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,\
-			 y0,y1,y2,y3,y4,y5,y6,y7,y8,y9,y10,y11,y12,y13,y14,y15) \
-	QUARTERROUND4_V8_POLY(x0,x1,x2,x3,x4,x5,x6,x7,\
-			      x8,x9,x10,x11,x12,x13,x14,x15,\
-			      y0,y1,y2,y3,y4,y5,y6,y7,\
-			      y8,y9,y10,y11,y12,y13,y14,y15,\
-			      ,,,,,,,,,,,) /* all twelve op slots left empty */
-
-#define TRANSPOSE_4X4_2(v0,v1,v2,v3,va,vb,vc,vd,tmp0,tmp1,tmp2,tmpa,tmpb,tmpc) \
-	  vmrhf tmp0, v0, v1; /* transposes two 4x4 matrices of 32-bit words in place: v0..v3 and va..vd */	\
-	  vmrhf tmp1, v2, v3; /* step 1: interleave the high / low word pairs of adjacent rows */	\
-	  vmrlf tmp2, v0, v1;					\
-	  vmrlf   v3, v2, v3; /* v3 doubles as the fourth temporary of the first matrix */	\
-	  vmrhf tmpa, va, vb;					\
-	  vmrhf tmpb, vc, vd;					\
-	  vmrlf tmpc, va, vb;					\
-	  vmrlf   vd, vc, vd; /* vd doubles as the fourth temporary of the second matrix */	\
-	  vpdi v0, tmp0, tmp1, 0; /* step 2: selector 0 pairs the high doublewords, 5 pairs the low doublewords */	\
-	  vpdi v1, tmp0, tmp1, 5;				\
-	  vpdi v2, tmp2,   v3, 0;				\
-	  vpdi v3, tmp2,   v3, 5;				\
-	  vpdi va, tmpa, tmpb, 0;				\
-	  vpdi vb, tmpa, tmpb, 5;				\
-	  vpdi vc, tmpc,   vd, 0;				\
-	  vpdi vd, tmpc,   vd, 5;
-
-.balign 8
-.globl __chacha20_s390x_vx_blocks8
-ENTRY (__chacha20_s390x_vx_blocks8)
-	/* input:
-	 *	%r2: input - 16-word state; the 64-bit block counter at words 12-13 is advanced by 8 per loop pass and stored back
-	 *	%r3: dst - output, 64 bytes per block
-	 *	%r4: src - data XORed with the keystream, 64 bytes per block
-	 *	%r5: nblks (multiple of 8); the loop body runs before the count is tested, so at least 8 blocks are always processed
-	 * returns with %r2 = 0; %r7 and %r8 are saved/restored by START_STACK/END_STACK */
-
-	START_STACK(%r8);
-	lgr NBLKS, %r5; /* %r5 becomes scratch below */
-
-	larl %r7, .Lconsts; /* r7 = base of the constant table */
-
-	/* Load counter: 8 bytes at words 12-13, rotated by 32 so that word 12 becomes the low half of %r8. */
-	lg %r8, (12 * 4)(INPUT);
-	rllg %r8, %r8, 32;
-
-.balign 4
-	/* Process eight chacha20 blocks per loop. */
-.Lloop8:
-	vlm Y0, Y3, 0(INPUT); /* Y0..Y3 = state words 0-3, 4-7, 8-11, 12-15 */
-
-	slgfi NBLKS, 8;
-	lghi ROUND, (20 / 2); /* ten passes of .Lround2_8, two QUARTERROUND4_V8 each */
-
-	/* Construct counter vectors X12/X13 & Y12/Y13 (X4, Y4, X5, Y5 serve as temporaries here). */
-	vl X4, (.Ladd_counter_0123 - .Lconsts)(%r7);
-	vl Y4, (.Ladd_counter_4567 - .Lconsts)(%r7);
-	vrepf Y12, Y3, 0; /* word 12 replicated to all lanes */
-	vrepf Y13, Y3, 1; /* word 13 replicated to all lanes */
-	vaccf X5, Y12, X4; /* per-lane carry out of word 12 + {0,1,2,3} */
-	vaccf Y5, Y12, Y4; /* per-lane carry out of word 12 + {4,5,6,7} */
-	vaf X12, Y12, X4;
-	vaf Y12, Y12, Y4;
-	vaf X13, Y13, X5; /* propagate the carry into word 13 */
-	vaf Y13, Y13, Y5;
-
-	vrepf X0, Y0, 0; /* broadcast the remaining state words across the lanes of blocks 0-3 */
-	vrepf X1, Y0, 1;
-	vrepf X2, Y0, 2;
-	vrepf X3, Y0, 3;
-	vrepf X4, Y1, 0;
-	vrepf X5, Y1, 1;
-	vrepf X6, Y1, 2;
-	vrepf X7, Y1, 3;
-	vrepf X8, Y2, 0;
-	vrepf X9, Y2, 1;
-	vrepf X10, Y2, 2;
-	vrepf X11, Y2, 3;
-	vrepf X14, Y3, 2;
-	vrepf X15, Y3, 3;
-
-	/* Store counters for blocks 0-7; they are added back to words 12/13 after the rounds. */
-	vstm X12, X13, (STACK_CTR + 0 * 16)(%r15);
-	vstm Y12, Y13, (STACK_CTR + 2 * 16)(%r15);
-
-	vlr Y0, X0; /* blocks 4-7 start from the same words; only the counters (Y12/Y13, set above) differ */
-	vlr Y1, X1;
-	vlr Y2, X2;
-	vlr Y3, X3;
-	vlr Y4, X4;
-	vlr Y5, X5;
-	vlr Y6, X6;
-	vlr Y7, X7;
-	vlr Y8, X8;
-	vlr Y9, X9;
-	vlr Y10, X10;
-	vlr Y11, X11;
-	vlr Y14, X14;
-	vlr Y15, X15;
-
-	/* Update and store counter (advance by the 8 blocks of this pass; rotate back to memory order). */
-	agfi %r8, 8;
-	rllg %r5, %r8, 32;
-	stg %r5, (12 * 4)(INPUT);
-
-.balign 4
-.Lround2_8:
-	QUARTERROUND4_V8(X0, X4,  X8, X12,   X1, X5,  X9, X13,
-			 X2, X6, X10, X14,   X3, X7, X11, X15,
-			 Y0, Y4,  Y8, Y12,   Y1, Y5,  Y9, Y13,
-			 Y2, Y6, Y10, Y14,   Y3, Y7, Y11, Y15); /* column round */
-	QUARTERROUND4_V8(X0, X5, X10, X15,   X1, X6, X11, X12,
-			 X2, X7,  X8, X13,   X3, X4,  X9, X14,
-			 Y0, Y5, Y10, Y15,   Y1, Y6, Y11, Y12,
-			 Y2, Y7,  Y8, Y13,   Y3, Y4,  Y9, Y14); /* diagonal round */
-	brctg ROUND, .Lround2_8;
-
-	/* Store blocks 4-7 so the Y registers can be used as scratch while blocks 0-3 are written out. */
-	vstm Y0, Y15, STACK_Y0_Y15(%r15);
-
-	/* Load counters for blocks 0-3. */
-	vlm Y0, Y1, (STACK_CTR + 0 * 16)(%r15);
-
-	lghi ROUND, 1; /* flag: blocks 4-7 still have to be output */
-	j .Lfirst_output_4blks_8;
-
-.balign 4
-.Lsecond_output_4blks_8:
-	/* Load blocks 4-7 into the X registers. */
-	vlm X0, X15, STACK_Y0_Y15(%r15);
-
-	/* Load counters for blocks 4-7. */
-	vlm Y0, Y1, (STACK_CTR + 2 * 16)(%r15);
-
-	lghi ROUND, 0; /* flag: this is the last output pass of the current loop iteration */
-
-.balign 4
-	/* Output four chacha20 blocks per loop. */
-.Lfirst_output_4blks_8:
-	vlm Y12, Y15, 0(INPUT); /* reload the state for the final addition */
-	PLUS(X12, Y0); /* words 12/13 use the saved per-block counters, not the already advanced ones in INPUT */
-	PLUS(X13, Y1);
-	vrepf Y0, Y12, 0;
-	vrepf Y1, Y12, 1;
-	vrepf Y2, Y12, 2;
-	vrepf Y3, Y12, 3;
-	vrepf Y4, Y13, 0;
-	vrepf Y5, Y13, 1;
-	vrepf Y6, Y13, 2;
-	vrepf Y7, Y13, 3;
-	vrepf Y8, Y14, 0;
-	vrepf Y9, Y14, 1;
-	vrepf Y10, Y14, 2;
-	vrepf Y11, Y14, 3;
-	vrepf Y14, Y15, 2; /* Y14 (words 8-11) is no longer needed and is overwritten here */
-	vrepf Y15, Y15, 3;
-	PLUS(X0, Y0);
-	PLUS(X1, Y1);
-	PLUS(X2, Y2);
-	PLUS(X3, Y3);
-	PLUS(X4, Y4);
-	PLUS(X5, Y5);
-	PLUS(X6, Y6);
-	PLUS(X7, Y7);
-	PLUS(X8, Y8);
-	PLUS(X9, Y9);
-	PLUS(X10, Y10);
-	PLUS(X11, Y11);
-	PLUS(X14, Y14);
-	PLUS(X15, Y15);
-
-	vl Y15, (.Lbswap32 - .Lconsts)(%r7); /* Y15 = per-word byte-swap mask for the vperm below */
-	TRANSPOSE_4X4_2(X0, X1, X2, X3, X4, X5, X6, X7,
-			Y9, Y10, Y11, Y12, Y13, Y14);
-	TRANSPOSE_4X4_2(X8, X9, X10, X11, X12, X13, X14, X15,
-			Y9, Y10, Y11, Y12, Y13, Y14);
-
-	vlm Y0, Y14, 0(SRC); /* first 15 source vectors; Y15 still holds the mask */
-	vperm X0, X0, X0, Y15;
-	vperm X1, X1, X1, Y15;
-	vperm X2, X2, X2, Y15;
-	vperm X3, X3, X3, Y15;
-	vperm X4, X4, X4, Y15;
-	vperm X5, X5, X5, Y15;
-	vperm X6, X6, X6, Y15;
-	vperm X7, X7, X7, Y15;
-	vperm X8, X8, X8, Y15;
-	vperm X9, X9, X9, Y15;
-	vperm X10, X10, X10, Y15;
-	vperm X11, X11, X11, Y15;
-	vperm X12, X12, X12, Y15;
-	vperm X13, X13, X13, Y15;
-	vperm X14, X14, X14, Y15;
-	vperm X15, X15, X15, Y15;
-	vl Y15, (15 * 16)(SRC); /* 16th source vector, loaded once the mask is no longer needed */
-
-	XOR(Y0, X0); /* after the transposes block k consists of X(k), X(4+k), X(8+k), X(12+k) */
-	XOR(Y1, X4);
-	XOR(Y2, X8);
-	XOR(Y3, X12);
-	XOR(Y4, X1);
-	XOR(Y5, X5);
-	XOR(Y6, X9);
-	XOR(Y7, X13);
-	XOR(Y8, X2);
-	XOR(Y9, X6);
-	XOR(Y10, X10);
-	XOR(Y11, X14);
-	XOR(Y12, X3);
-	XOR(Y13, X7);
-	XOR(Y14, X11);
-	XOR(Y15, X15);
-	vstm Y0, Y15, 0(DST);
-
-	aghi SRC, 256; /* four 64-byte blocks consumed and produced */
-	aghi DST, 256;
-
-	clgije ROUND, 1, .Lsecond_output_4blks_8; /* blocks 4-7 still pending */
-
-	clgijhe NBLKS, 8, .Lloop8; /* continue while at least eight blocks remain */
-
-
-	END_STACK(%r8);
-	xgr %r2, %r2; /* return value 0 */
-	br %r14;
-END (__chacha20_s390x_vx_blocks8)
-
-#endif /* HAVE_S390_VX_ASM_SUPPORT */