Skip to content

Commit

Permalink
v24.2
Browse files Browse the repository at this point in the history
  • Loading branch information
JayDDee committed May 21, 2024
1 parent 4f93057 commit 042d13d
Show file tree
Hide file tree
Showing 129 changed files with 835 additions and 538 deletions.
6 changes: 6 additions & 0 deletions RELEASE_NOTES
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,12 @@ If not what makes it happen or not happen?
Change Log
----------

v24.2

x86_64: Fixed blake2s for AVX2 & AVX512, x25x for AVX512, broken in v3.23.4.
x86_64: Initial support for CPUs with AVX10, needs GCC-14.
ARM NEON: Various code optimisations.

v24.1

#414: fix bug in merkle error handling.
Expand Down
2 changes: 1 addition & 1 deletion algo-gate-api.c
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,

#endif

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

//int scanhash_8way_64_64( struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done, struct thr_info *mythr )
Expand Down
5 changes: 4 additions & 1 deletion algo-gate-api.h
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,11 @@ typedef uint32_t set_t;
// CPU feature flag bits, one bit per optional instruction-set feature.
// Every value is parenthesized so a flag remains a single operand when it is
// used inside a larger expression (for example next to + or *); the bare
// "1 << n" form silently regroups under higher-precedence operators.
#define AES_OPT (1 << 7) // Intel Westmere, AArch64
#define VAES_OPT (1 << 8) // Icelake, Zen3
#define SHA_OPT (1 << 9) // Zen1, Icelake, AArch64
#define SHA256_OPT (1 << 9) // Zen1, Icelake, AArch64 (same bit as SHA_OPT)
#define SHA512_OPT (1 << 10) // Intel Arrow Lake, AArch64
#define NEON_OPT (1 << 11) // AArch64
#define AVX10_256 (1 << 12)
#define AVX10_512 (1 << 13)

// AVX10 does not have explicit algo features:
// AVX10_512 is compatible with AVX512 + VAES
Expand Down Expand Up @@ -246,7 +249,7 @@ int scanhash_4way_64in_32out( struct work *work, uint32_t max_nonce,

#endif

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

//int scanhash_8way_64in_64out( struct work *work, uint32_t max_nonce,
// uint64_t *hashes_done, struct thr_info *mythr );
Expand Down
4 changes: 2 additions & 2 deletions algo/argon2d/argon2d/opt.c
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@
* @pre all block pointers must be valid
*/

#if defined(__AVX512F__)
#if defined(SIMD512)

static inline __m512i blamka( __m512i x, __m512i y )
{
Expand Down Expand Up @@ -237,7 +237,7 @@ void fill_segment(const argon2_instance_t *instance,
uint64_t pseudo_rand, ref_index, ref_lane;
uint32_t prev_offset, curr_offset;
uint32_t starting_index, i;
#if defined(__AVX512F__)
#if defined(SIMD512)
__m512i state[ARGON2_512BIT_WORDS_IN_BLOCK];
#elif defined(__AVX2__)
__m256i state[ARGON2_HWORDS_IN_BLOCK];
Expand Down
2 changes: 1 addition & 1 deletion algo/argon2d/blake2/blamka-round-opt.h
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
#include "blake2-impl.h"
#include "simd-utils.h"

#if !defined(__AVX512F__)
#if !defined(SIMD512)

#if !defined(__AVX2__)

Expand Down
4 changes: 2 additions & 2 deletions algo/blake/blake256-hash.c
Original file line number Diff line number Diff line change
Expand Up @@ -1611,7 +1611,7 @@ void blake256_8way_final_rounds_le( void *final_hash, const void *midstate,

#endif

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

///////////////////////////////////////
//
Expand Down Expand Up @@ -2617,7 +2617,7 @@ blake32_8way_close_le( blake_8way_small_context *sc, unsigned ub, unsigned n,
#endif


#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

//Blake-256 16 way AVX512

Expand Down
2 changes: 1 addition & 1 deletion algo/blake/blake256-hash.h
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ void blake256r8_8way_close(void *cc, void *dst);
#define blake256r8_8x32_update blake256r14_8way_update
#define blake256r8_8x32_close blake256r14_8way_close

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

///////////////////////////////////
//
Expand Down
2 changes: 1 addition & 1 deletion algo/blake/blake2b-hash.c
Original file line number Diff line number Diff line change
Expand Up @@ -226,7 +226,7 @@ static const uint8_t sigma[12][16] =
#define Mx_(n) Mx__(n)
#define Mx__(n) M ## n

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

#define B2B8W_G(a, b, c, d, x, y) \
{ \
Expand Down
2 changes: 1 addition & 1 deletion algo/blake/blake2b-hash.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
#endif


#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

typedef struct ALIGN( 64 ) {
__m512i b[16]; // input buffer
Expand Down
2 changes: 1 addition & 1 deletion algo/blake/blake2b.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#include <stdint.h>
#include "blake2b-hash.h"

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define BLAKE2B_8WAY
#elif defined(__AVX2__)
#define BLAKE2B_4WAY
Expand Down
2 changes: 1 addition & 1 deletion algo/blake/blake2s-hash.c
Original file line number Diff line number Diff line change
Expand Up @@ -497,7 +497,7 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,

#endif // __AVX2__

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

// Blake2s-256 16 way

Expand Down
34 changes: 17 additions & 17 deletions algo/blake/blake2s-hash.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,20 +29,20 @@
#define ALIGN(x) __attribute__((aligned(x)))
#endif

// Parameter record for the blake2s N-way code (per its name).
// The number in each trailing comment is the running byte count at the end of
// that field; the listed fields add up to 32 bytes.
// NOTE(review): the field order looks like the standard BLAKE2s parameter
// block (RFC 7693 / BLAKE2 spec) -- confirm before relying on the exact layout.
typedef struct __blake2s_nway_param
{
uint8_t digest_length; // 1
uint8_t key_length; // 2
uint8_t fanout; // 3
uint8_t depth; // 4
uint32_t leaf_length; // 8
uint8_t node_offset[6];// 14
uint8_t node_depth; // 15
uint8_t inner_length; // 16
// uint8_t reserved[0];
uint8_t salt[8]; // 24
uint8_t personal[8]; // 32
} blake2s_nway_param;
// Parameter record for the blake2s N-way code (per its name).
// The number in each trailing comment is the running byte count at the end of
// that field; the listed fields add up to 32 bytes.
// NOTE(review): the field order looks like the standard BLAKE2s parameter
// block (RFC 7693 / BLAKE2 spec) -- confirm before relying on the exact layout.
typedef struct __blake2s_nway_param
{
uint8_t digest_length; // 1
uint8_t key_length; // 2
uint8_t fanout; // 3
uint8_t depth; // 4
uint32_t leaf_length; // 8
uint8_t node_offset[6];// 14
uint8_t node_depth; // 15
uint8_t inner_length; // 16
// uint8_t reserved[0];
uint8_t salt[8]; // 24
uint8_t personal[8]; // 32
} blake2s_nway_param;

typedef struct ALIGN( 64 ) __blake2s_4way_state
{
Expand All @@ -67,7 +67,7 @@ int blake2s_4way_full_blocks( blake2s_4way_state *S, void *out,
typedef struct ALIGN( 64 ) __blake2s_8way_state
{
__m256i h[8];
uint8_t buf[ 32 * 8 ];
uint8_t buf[ 64 * 8 ];
uint32_t t[2];
uint32_t f[2];
size_t buflen;
Expand All @@ -83,12 +83,12 @@ int blake2s_8way_full_blocks( blake2s_8way_state *S, void *out,

#endif

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

typedef struct ALIGN( 64 ) __blake2s_16way_state
{
__m512i h[8];
uint8_t buf[ 32 * 16 ];
uint8_t buf[ 64 * 16 ];
uint32_t t[2];
uint32_t f[2];
size_t buflen;
Expand Down
2 changes: 1 addition & 1 deletion algo/blake/blake2s.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
#include <string.h>
#include <stdint.h>

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define BLAKE2S_16WAY
#elif defined(__AVX2__)
#define BLAKE2S_8WAY
Expand Down
50 changes: 25 additions & 25 deletions algo/blake/blake512-hash.c
Original file line number Diff line number Diff line change
Expand Up @@ -349,16 +349,16 @@ void blake512_transform( uint64_t *H, const uint64_t *buf, const uint64_t T0,
Va = v128_add64( Va, v128_add64( Vb, \
v128_set64( CBx( r, Sd ) ^ Mx( r, Sc ), \
CBx( r, Sb ) ^ Mx( r, Sa ) ) ) ); \
Vd = v128_ror64( v128_xor( Vd, Va ), 32 ); \
Vd = v128_ror64xor( Vd, Va, 32 ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 25 ); \
Vb = v128_ror64xor( Vb, Vc, 25 ); \
\
Va = v128_add64( Va, v128_add64( Vb, \
v128_set64( CBx( r, Sc ) ^ Mx( r, Sd ), \
CBx( r, Sa ) ^ Mx( r, Sb ) ) ) ); \
Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \
Vd = v128_ror64xor( Vd, Va, 16 ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 11 ); \
Vb = v128_ror64xor( Vb, Vc, 11 ); \
}

#define BLAKE512_ROUND( R ) \
Expand Down Expand Up @@ -559,7 +559,7 @@ void blake512_full( blake512_context *sc, void *dst, const void *data,

#if defined(__AVX2__)

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

////////////////////////////////////
//
Expand Down Expand Up @@ -1887,13 +1887,13 @@ blake512_4x64_close(void *cc, void *dst)
// GB_2X64: one mixing step over the four state vectors a, b, c, d using
// message words m0/m1 and constants c0/c1 (c1 is xored with m0, c0 with m1).
// It runs two add / xor-rotate sequences with rotate counts 32, 25, then 16, 11.
// As shown here every rotate step appears twice in a row: first the nested
// v128_ror64( v128_xor( x, y ), n ) form, then the v128_ror64xor( x, y, n ) form.
// NOTE(review): this looks like the before/after pair of a change rather than
// an intended double application -- confirm against the actual source file.
// NOTE(review): v128_ror64xor is presumably the fused xor-then-rotate
// equivalent of the nested form; its definition is not visible here.
#define GB_2X64( m0, m1, c0, c1, a, b, c, d ) \
{ \
a = v128_add64( v128_add64( v128_xor( v128_64( c1 ), m0 ), b ), a ); \
d = v128_ror64( v128_xor( d, a ), 32 ); \
d = v128_ror64xor( d, a, 32 ); \
c = v128_add64( c, d ); \
b = v128_ror64( v128_xor( b, c ), 25 ); \
b = v128_ror64xor( b, c, 25 ); \
a = v128_add64( v128_add64( v128_xor( v128_64( c0 ), m1 ), b ), a ); \
d = v128_ror64( v128_xor( d, a ), 16 ); \
d = v128_ror64xor( d, a, 16 ); \
c = v128_add64( c, d ); \
b = v128_ror64( v128_xor( b, c ), 11 ); \
b = v128_ror64xor( b, c, 11 ); \
}

#define ROUND_B_2X64(r) \
Expand Down Expand Up @@ -2054,9 +2054,9 @@ void blake512_2x64_prehash_part1_le( blake_2x64_big_context *sc,
// G4 skip nonce
V0 = v128_add64( v128_add64( v128_xor( v128_64( CB9 ), sc->buf[ 8] ), V5 ),
V0 );
VF = v128_ror64( v128_xor( VF, V0 ), 32 );
VF = v128_ror64xor( VF, V0, 32 );
VA = v128_add64( VA, VF );
V5 = v128_ror64( v128_xor( V5, VA ), 25 );
V5 = v128_ror64xor( V5, VA, 25 );
V0 = v128_add64( V0, V5 );

GB_2X64( sc->buf[10], sc->buf[11], CBA, CBB, V1, V6, VB, VC );
Expand Down Expand Up @@ -2137,44 +2137,44 @@ void blake512_2x64_prehash_part2_le( blake_2x64_big_context *sc, void *hash,

// finish round 0, with the nonce now available
V0 = v128_add64( V0, v128_xor( v128_64( CB8 ), M9 ) );
VF = v128_ror64( v128_xor( VF, V0 ), 16 );
VF = v128_ror64xor( VF, V0, 16 );
VA = v128_add64( VA, VF );
V5 = v128_ror64( v128_xor( V5, VA ), 11 );
V5 = v128_ror64xor( V5, VA, 11 );

// Round 1
// G0
GB_2X64(Mx(1, 0), Mx(1, 1), CBx(1, 0), CBx(1, 1), V0, V4, V8, VC);

// G1
V1 = v128_add64( V1, V5 );
VD = v128_ror64( v128_xor( VD, V1 ), 32 );
VD = v128_ror64xor( VD, V1, 32 );
V9 = v128_add64( V9, VD );
V5 = v128_ror64( v128_xor( V5, V9 ), 25 );
V5 = v128_ror64xor( V5, V9, 25 );
V1 = v128_add64( V1, v128_add64( v128_xor( v128_64( CBx(1,2) ), Mx(1,3) ),
V5 ) );
VD = v128_ror64( v128_xor( VD, V1 ), 16 );
VD = v128_ror64xor( VD, V1, 16 );
V9 = v128_add64( V9, VD );
V5 = v128_ror64( v128_xor( V5, V9 ), 11 );
V5 = v128_ror64xor( V5, V9, 11 );

// G2
V2 = v128_add64( V2, v128_xor( v128_64( CBF ), M9 ) );
VE = v128_ror64( v128_xor( VE, V2 ), 32 );
VE = v128_ror64xor( VE, V2, 32 );
VA = v128_add64( VA, VE );
V6 = v128_ror64( v128_xor( V6, VA ), 25 );
V6 = v128_ror64xor( V6, VA, 25 );
V2 = v128_add64( V2, v128_add64( v128_xor( v128_64( CB9 ), MF ), V6 ) );
VE = v128_ror64( v128_xor( VE, V2 ), 16 );
VE = v128_ror64xor( VE, V2, 16 );
VA = v128_add64( VA, VE );
V6 = v128_ror64( v128_xor( V6, VA ), 11 );
V6 = v128_ror64xor( V6, VA, 11 );

// G3
VF = v128_ror64( v128_xor( VF, V3 ), 32 );
VF = v128_ror64xor( VF, V3, 32 );
VB = v128_add64( VB, VF );
V7 = v128_ror64( v128_xor( V7, VB ), 25 );
V7 = v128_ror64xor( V7, VB, 25 );
V3 = v128_add64( V3, v128_add64( v128_xor( v128_64( CBx(1, 6) ), Mx(1, 7) ),
V7 ) );
VF = v128_ror64( v128_xor( VF, V3 ), 16 );
VF = v128_ror64xor( VF, V3, 16 );
VB = v128_add64( VB, VF );
V7 = v128_ror64( v128_xor( V7, VB ), 11 );
V7 = v128_ror64xor( V7, VB, 11 );

// G4, G5, G6, G7
GB_2X64(Mx(1, 8), Mx(1, 9), CBx(1, 8), CBx(1, 9), V0, V5, VA, VF);
Expand Down
2 changes: 1 addition & 1 deletion algo/blake/blake512-hash.h
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ void blake512_4x64_final_le( blake_4x64_big_context *sc, void *hash,
#define blake512_4way_prehash_le blake512_4x64_prehash_le
#define blake512_4way_final_le blake512_4x64_final_le

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

////////////////////////////
//
Expand Down
2 changes: 1 addition & 1 deletion algo/blake/blakecoin-gate.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define BLAKECOIN_16WAY
#elif defined(__AVX2__)
#define BLAKECOIN_8WAY
Expand Down
8 changes: 4 additions & 4 deletions algo/blake/sph_blake2b.c
Original file line number Diff line number Diff line change
Expand Up @@ -101,15 +101,15 @@
{ \
Va = v128_add64( Va, v128_add64( Vb, \
v128_set64( m[ sigmaR[ Sc ] ], m[ sigmaR[ Sa ] ] ) ) ); \
Vd = v128_ror64( v128_xor( Vd, Va ), 32 ); \
Vd = v128_ror64xor( Vd, Va, 32 ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 24 ); \
Vb = v128_ror64xor( Vb, Vc, 24 ); \
\
Va = v128_add64( Va, v128_add64( Vb, \
v128_set64( m[ sigmaR[ Sd ] ], m[ sigmaR[ Sb ] ] ) ) ); \
Vd = v128_ror64( v128_xor( Vd, Va ), 16 ); \
Vd = v128_ror64xor( Vd, Va, 16 ); \
Vc = v128_add64( Vc, Vd ); \
Vb = v128_ror64( v128_xor( Vb, Vc ), 63 ); \
Vb = v128_ror64xor( Vb, Vc, 63 ); \
}

#define BLAKE2B_ROUND( R ) \
Expand Down
4 changes: 2 additions & 2 deletions algo/bmw/bmw-hash-4way.h
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst );

#endif

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

// BMW-256 16 way 32

Expand Down Expand Up @@ -157,7 +157,7 @@ void bmw512_4way_addbits_and_close(

#endif // __AVX2__

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

// BMW-512 64 bit 8 way
typedef struct
Expand Down
2 changes: 1 addition & 1 deletion algo/bmw/bmw256-hash-4way.c
Original file line number Diff line number Diff line change
Expand Up @@ -1057,7 +1057,7 @@ void bmw256_8way_close( bmw256_8way_context *ctx, void *dst )

#endif // __AVX2__

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)

// BMW-256 16 way 32

Expand Down
2 changes: 1 addition & 1 deletion algo/bmw/bmw512-gate.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
#include "algo-gate-api.h"
#include <stdint.h>

#if defined(__AVX512F__) && defined(__AVX512VL__) && defined(__AVX512DQ__) && defined(__AVX512BW__)
#if defined(SIMD512)
#define BMW512_8WAY 1
#elif defined(__AVX2__)
#define BMW512_4WAY 1
Expand Down
Loading

0 comments on commit 042d13d

Please sign in to comment.