From c47c4a88850a4bf389465780be2132050cb32005 Mon Sep 17 00:00:00 2001 From: Jay D Dee Date: Tue, 28 May 2024 18:20:19 -0400 Subject: [PATCH] v24.3 --- RELEASE_NOTES | 7 + algo-gate-api.h | 1 - algo/blake/blake2b-hash.c | 24 ++-- algo/blake/blake2b-hash.h | 30 ++-- algo/blake/blake2s-hash.h | 4 +- algo/groestl/myrgr-gate.c | 2 +- algo/lyra2/sponge.h | 4 - algo/m7m/m7m.c | 2 +- algo/ripemd/lbry-4way.c | 32 ++--- algo/ripemd/lbry-gate.c | 3 +- algo/scrypt/scrypt-core-4way.c | 100 ++++++------- algo/scrypt/scrypt.c | 2 +- algo/sha/sha256d.c | 6 +- algo/sha/sha256dt.c | 4 +- algo/sha/sha256t-gate.c | 5 +- algo/sha/sha512-hash-4way.c | 21 ++- algo/skein/skein-gate.c | 6 +- algo/sm3/sm3-hash-4way.c | 38 ++--- algo/verthash/Verthash.c | 53 +------ algo/verthash/verthash-gate.c | 6 +- algo/x16/minotaur.c | 2 +- algo/x22/x22i-gate.c | 4 +- algo/yespower/yescrypt-r8g.c | 2 +- algo/yespower/yespower-gate.c | 12 +- armbuild-all.sh | 30 ++-- clean-all.sh | 2 +- configure | 28 ++-- configure.ac | 2 +- configure~ | 20 +-- cpu-miner.c | 143 ++++++++----------- simd-utils/simd-128.h | 57 +++----- simd-utils/simd-256.h | 4 +- simd-utils/simd-int.h | 5 + simd-utils/simd-neon.h | 22 ++- sysinfos.c | 251 +++++++++++++++++++++------------ util.c | 2 +- 36 files changed, 473 insertions(+), 463 deletions(-) diff --git a/RELEASE_NOTES b/RELEASE_NOTES index feff95f0..4bba5cca 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -75,6 +75,13 @@ If not what makes it happen or not happen? Change Log ---------- +v24.3 + +ARM: CPU feature detection and reporting is now working. +ARM: Verthash is now working. +ARM: Small speedup for yescrypt, yespower & argon2d. +Code cleanup. + v24.2 x86_64: Fixed blakes2s for AVX2 & AVX512, x25x for AVX512, broken in v3.23.4. diff --git a/algo-gate-api.h b/algo-gate-api.h index 3626a521..1573476d 100644 --- a/algo-gate-api.h +++ b/algo-gate-api.h @@ -98,7 +98,6 @@ typedef uint32_t set_t; #define AVX512_OPT 1 << 6 // Skylake-X, Zen4 (AVX512[F,VL,DQ,BW]) #define AES_OPT 1 << 7 // Intel Westmere, AArch64 #define VAES_OPT 1 << 8 // Icelake, Zen3 -#define SHA_OPT 1 << 9 // Zen1, Icelake, AArch64 #define SHA256_OPT 1 << 9 // Zen1, Icelake, AArch64 #define SHA512_OPT 1 << 10 // Intel Arrow Lake, AArch64 #define NEON_OPT 1 << 11 // AArch64 diff --git a/algo/blake/blake2b-hash.c b/algo/blake/blake2b-hash.c index cf178375..546d5483 100644 --- a/algo/blake/blake2b-hash.c +++ b/algo/blake/blake2b-hash.c @@ -240,7 +240,7 @@ static const uint8_t sigma[12][16] = v[b] = mm512_ror_64( _mm512_xor_si512( v[b], v[c] ), 63 ); \ } -static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last ) +static void blake2b_8x64_compress( blake2b_8x64_ctx *ctx, int last ) { __m512i v[16], m[16]; @@ -306,7 +306,7 @@ static void blake2b_8way_compress( blake2b_8way_ctx *ctx, int last ) ctx->h[7] = mm512_xor3( ctx->h[7], v[7], v[15] ); } -int blake2b_8way_init( blake2b_8way_ctx *ctx ) +int blake2b_8x64_init( blake2b_8x64_ctx *ctx ) { size_t i; @@ -333,7 +333,7 @@ int blake2b_8way_init( blake2b_8way_ctx *ctx ) } -void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input, +void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input, size_t inlen ) { __m512i* in =(__m512i*)input; @@ -348,7 +348,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input, ctx->t[0] += ctx->c; if ( ctx->t[0] < ctx->c ) ctx->t[1]++; - blake2b_8way_compress( ctx, 0 ); + blake2b_8x64_compress( ctx, 0 ); ctx->c = 0; } ctx->b[ c++ ] = in[i]; @@ -356,7 +356,7 @@ void blake2b_8way_update( blake2b_8way_ctx *ctx, const 
void *input, } } -void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out ) +void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out ) { size_t c; c = ctx->c >> 3; @@ -371,7 +371,7 @@ void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out ) ctx->c += 8; } - blake2b_8way_compress( ctx, 1 ); // final block flag = 1 + blake2b_8x64_compress( ctx, 1 ); // final block flag = 1 casti_m512i( out, 0 ) = ctx->h[0]; casti_m512i( out, 1 ) = ctx->h[1]; @@ -407,7 +407,7 @@ static const uint64_t blake2b_iv[8] = { }; */ -static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last ) +static void blake2b_4x64_compress( blake2b_4x64_ctx *ctx, int last ) { __m256i v[16], m[16]; @@ -473,7 +473,7 @@ static void blake2b_4way_compress( blake2b_4way_ctx *ctx, int last ) ctx->h[7] = _mm256_xor_si256( _mm256_xor_si256( ctx->h[7], v[7] ), v[15] ); } -int blake2b_4way_init( blake2b_4way_ctx *ctx ) +int blake2b_4x64_init( blake2b_4x64_ctx *ctx ) { size_t i; @@ -499,7 +499,7 @@ int blake2b_4way_init( blake2b_4way_ctx *ctx ) return 0; } -void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input, +void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input, size_t inlen ) { __m256i* in =(__m256i*)input; @@ -514,7 +514,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input, ctx->t[0] += ctx->c; if ( ctx->t[0] < ctx->c ) ctx->t[1]++; - blake2b_4way_compress( ctx, 0 ); + blake2b_4x64_compress( ctx, 0 ); ctx->c = 0; } ctx->b[ c++ ] = in[i]; @@ -522,7 +522,7 @@ void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input, } } -void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out ) +void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out ) { size_t c; c = ctx->c >> 3; @@ -537,7 +537,7 @@ void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out ) ctx->c += 8; } - blake2b_4way_compress( ctx, 1 ); // final block flag = 1 + blake2b_4x64_compress( ctx, 1 ); // final block flag = 1 casti_m256i( out, 0 ) = ctx->h[0]; casti_m256i( out, 1 ) = ctx->h[1]; diff --git a/algo/blake/blake2b-hash.h b/algo/blake/blake2b-hash.h index 88f5b415..6caf8040 100644 --- a/algo/blake/blake2b-hash.h +++ b/algo/blake/blake2b-hash.h @@ -1,6 +1,6 @@ #pragma once -#ifndef __BLAKE2B_HASH_4WAY_H__ -#define __BLAKE2B_HASH_4WAY_H__ +#ifndef BLAKE2B_HASH_4WAY_H__ +#define BLAKE2B_HASH_4WAY_H__ #include "simd-utils.h" #include @@ -23,12 +23,17 @@ typedef struct ALIGN( 64 ) { uint64_t t[2]; // total number of bytes size_t c; // pointer for b[] size_t outlen; // digest size -} blake2b_8way_ctx; +} blake2b_8x64_ctx; -int blake2b_8way_init( blake2b_8way_ctx *ctx ); -void blake2b_8way_update( blake2b_8way_ctx *ctx, const void *input, +int blake2b_8x64_init( blake2b_8x64_ctx *ctx ); +void blake2b_8x64_update( blake2b_8x64_ctx *ctx, const void *input, size_t inlen ); -void blake2b_8way_final( blake2b_8way_ctx *ctx, void *out ); +void blake2b_8x64_final( blake2b_8x64_ctx *ctx, void *out ); + +#define blake2b_8way_ctx blake2b_8x64_ctx +#define blake2b_8way_init blake2b_8x64_init +#define blake2b_8way_update blake2b_8x64_update +#define blake2b_8way_final blake2b_8x64_final #endif @@ -41,12 +46,17 @@ typedef struct ALIGN( 64 ) { uint64_t t[2]; // total number of bytes size_t c; // pointer for b[] size_t outlen; // digest size -} blake2b_4way_ctx; +} blake2b_4x64_ctx; -int blake2b_4way_init( blake2b_4way_ctx *ctx ); -void blake2b_4way_update( blake2b_4way_ctx *ctx, const void *input, +int blake2b_4x64_init( blake2b_4x64_ctx *ctx ); +void blake2b_4x64_update( blake2b_4x64_ctx *ctx, const void *input, size_t inlen ); 
-void blake2b_4way_final( blake2b_4way_ctx *ctx, void *out ); +void blake2b_4x64_final( blake2b_4x64_ctx *ctx, void *out ); + +#define blake2b_4way_ctx blake2b_4x64_ctx +#define blake2b_4way_init blake2b_4x64_init +#define blake2b_4way_update blake2b_4x64_update +#define blake2b_4way_final blake2b_4x64_final #endif diff --git a/algo/blake/blake2s-hash.h b/algo/blake/blake2s-hash.h index 4e2d66c3..f77f4a4b 100644 --- a/algo/blake/blake2s-hash.h +++ b/algo/blake/blake2s-hash.h @@ -11,8 +11,8 @@ * this software. If not, see . */ //#pragma once -#ifndef __BLAKE2S_HASH_4WAY_H__ -#define __BLAKE2S_HASH_4WAY_H__ 1 +#ifndef BLAKE2S_HASH_4WAY_H__ +#define BLAKE2S_HASH_4WAY_H__ 1 #if defined(__SSE2__) || defined(__ARM_NEON) diff --git a/algo/groestl/myrgr-gate.c b/algo/groestl/myrgr-gate.c index f82aafba..1e668f5e 100644 --- a/algo/groestl/myrgr-gate.c +++ b/algo/groestl/myrgr-gate.c @@ -16,7 +16,7 @@ bool register_myriad_algo( algo_gate_t* gate ) init_myrgr_ctx(); gate->scanhash = (void*)&scanhash_myriad; gate->hash = (void*)&myriad_hash; - gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA_OPT | VAES_OPT; + gate->optimizations = AES_OPT | SSE2_OPT | AVX2_OPT | SHA256_OPT | VAES_OPT; #endif return true; }; diff --git a/algo/lyra2/sponge.h b/algo/lyra2/sponge.h index 7937981e..d23a4e58 100644 --- a/algo/lyra2/sponge.h +++ b/algo/lyra2/sponge.h @@ -195,10 +195,6 @@ static const uint64_t blake2b_IV[8] = #endif // AVX2 else SSE2 -static inline uint64_t rotr64( const uint64_t w, const unsigned c ){ - return ( w >> c ) | ( w << ( 64 - c ) ); -} - #define G( r, i, a, b, c, d ) \ { \ a = a + b; \ diff --git a/algo/m7m/m7m.c b/algo/m7m/m7m.c index 9098f983..2f8194bb 100644 --- a/algo/m7m/m7m.c +++ b/algo/m7m/m7m.c @@ -306,7 +306,7 @@ bool register_m7m_algo( algo_gate_t *gate ) applog( LOG_ERR, "M7M algo is not supported on MacOS"); return false; #else - gate->optimizations = SHA_OPT; + gate->optimizations = SHA256_OPT; init_m7m_ctx(); gate->scanhash = (void*)&scanhash_m7m_hash; gate->build_stratum_request = (void*)&std_be_build_stratum_request; diff --git a/algo/ripemd/lbry-4way.c b/algo/ripemd/lbry-4way.c index 990a4af3..492236da 100644 --- a/algo/ripemd/lbry-4way.c +++ b/algo/ripemd/lbry-4way.c @@ -104,14 +104,14 @@ int scanhash_lbry_16way( struct work *work, uint32_t max_nonce, int thr_id = mythr->id; // thr_id arg is deprecated // we need bigendian data... 
- casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); - casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); - casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); - casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); - casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); - casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) ); - casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) ); - casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) ); + casti_m128i( edata, 0 ) = v128_bswap32( casti_m128i( pdata, 0 ) ); + casti_m128i( edata, 1 ) = v128_bswap32( casti_m128i( pdata, 1 ) ); + casti_m128i( edata, 2 ) = v128_bswap32( casti_m128i( pdata, 2 ) ); + casti_m128i( edata, 3 ) = v128_bswap32( casti_m128i( pdata, 3 ) ); + casti_m128i( edata, 4 ) = v128_bswap32( casti_m128i( pdata, 4 ) ); + casti_m128i( edata, 5 ) = v128_bswap32( casti_m128i( pdata, 5 ) ); + casti_m128i( edata, 6 ) = v128_bswap32( casti_m128i( pdata, 6 ) ); + casti_m128i( edata, 7 ) = v128_bswap32( casti_m128i( pdata, 7 ) ); intrlv_16x32( vdata, edata, edata, edata, edata, edata, edata, edata, edata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 ); @@ -224,14 +224,14 @@ int scanhash_lbry_8way( struct work *work, uint32_t max_nonce, int thr_id = mythr->id; // thr_id arg is deprecated // we need bigendian data... - casti_m128i( edata, 0 ) = mm128_bswap_32( casti_m128i( pdata, 0 ) ); - casti_m128i( edata, 1 ) = mm128_bswap_32( casti_m128i( pdata, 1 ) ); - casti_m128i( edata, 2 ) = mm128_bswap_32( casti_m128i( pdata, 2 ) ); - casti_m128i( edata, 3 ) = mm128_bswap_32( casti_m128i( pdata, 3 ) ); - casti_m128i( edata, 4 ) = mm128_bswap_32( casti_m128i( pdata, 4 ) ); - casti_m128i( edata, 5 ) = mm128_bswap_32( casti_m128i( pdata, 5 ) ); - casti_m128i( edata, 6 ) = mm128_bswap_32( casti_m128i( pdata, 6 ) ); - casti_m128i( edata, 7 ) = mm128_bswap_32( casti_m128i( pdata, 7 ) ); + casti_m128i( edata, 0 ) = v128_bswap32( casti_m128i( pdata, 0 ) ); + casti_m128i( edata, 1 ) = v128_bswap32( casti_m128i( pdata, 1 ) ); + casti_m128i( edata, 2 ) = v128_bswap32( casti_m128i( pdata, 2 ) ); + casti_m128i( edata, 3 ) = v128_bswap32( casti_m128i( pdata, 3 ) ); + casti_m128i( edata, 4 ) = v128_bswap32( casti_m128i( pdata, 4 ) ); + casti_m128i( edata, 5 ) = v128_bswap32( casti_m128i( pdata, 5 ) ); + casti_m128i( edata, 6 ) = v128_bswap32( casti_m128i( pdata, 6 ) ); + casti_m128i( edata, 7 ) = v128_bswap32( casti_m128i( pdata, 7 ) ); intrlv_8x32( vdata, edata, edata, edata, edata, edata, edata, edata, edata, 1024 ); diff --git a/algo/ripemd/lbry-gate.c b/algo/ripemd/lbry-gate.c index bb67b81f..9ed3f520 100644 --- a/algo/ripemd/lbry-gate.c +++ b/algo/ripemd/lbry-gate.c @@ -51,7 +51,6 @@ int lbry_get_work_data_size() { return LBRY_WORK_DATA_SIZE; } bool register_lbry_algo( algo_gate_t* gate ) { -// gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT; #if defined (LBRY_16WAY) gate->scanhash = (void*)&scanhash_lbry_16way; gate->hash = (void*)&lbry_16way_hash; @@ -67,7 +66,7 @@ bool register_lbry_algo( algo_gate_t* gate ) #else gate->scanhash = (void*)&scanhash_lbry; gate->hash = (void*)&lbry_hash; - gate->optimizations = AVX2_OPT | AVX512_OPT | SHA_OPT; + gate->optimizations = AVX2_OPT | AVX512_OPT | SHA256_OPT; #endif gate->build_stratum_request = (void*)&lbry_le_build_stratum_request; gate->build_extraheader = (void*)&lbry_build_extraheader; diff --git a/algo/scrypt/scrypt-core-4way.c b/algo/scrypt/scrypt-core-4way.c 
index 633d77c0..aa7e0355 100644 --- a/algo/scrypt/scrypt-core-4way.c +++ b/algo/scrypt/scrypt-core-4way.c @@ -2074,7 +2074,7 @@ void scrypt_core_4way( v128_t *X, v128_t *V, const uint32_t N ) v128_ovly v; for ( int l = 0; l < 4; l++ ) v.u32[l] = ( *(vptr[l] +i ) ) .u32[l]; - X[i] = v128_xor( X[i], v.m128 ); + X[i] = v128_xor( X[i], v.v128 ); } xor_salsa8_4way( &X[ 0], &X[16] ); @@ -2211,10 +2211,10 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c) // X2 is shuffled left 2 (swap_64) { xd, x8, x7, x2 } // X3 is shuffled left 3 (ror_1x32) { xc, xb, x6, x1 } - y[0].m128 = X0; - y[1].m128 = X1; - y[2].m128 = X2; - y[3].m128 = X3; + y[0].v128 = X0; + y[1].v128 = X1; + y[2].v128 = X2; + y[3].v128 = X3; z[0].u32[0] = y[0].u32[0]; z[0].u32[3] = y[1].u32[0]; @@ -2236,10 +2236,10 @@ static void salsa8_simd128( uint32_t *b, const uint32_t * const c) z[3].u32[1] = y[2].u32[3]; z[3].u32[0] = y[3].u32[3]; - B[0] = v128_add32( B[0], z[0].m128 ); - B[1] = v128_add32( B[1], z[1].m128 ); - B[2] = v128_add32( B[2], z[2].m128 ); - B[3] = v128_add32( B[3], z[3].m128 ); + B[0] = v128_add32( B[0], z[0].v128 ); + B[1] = v128_add32( B[1], z[1].v128 ); + B[2] = v128_add32( B[2], z[2].v128 ); + B[3] = v128_add32( B[3], z[3].v128 ); #endif @@ -2404,14 +2404,14 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb ) /* v128_ovly ya[4], za[4], yb[4], zb[4]; - ya[0].m128 = XA[0]; - yb[0].m128 = XB[0]; - ya[1].m128 = XA[1]; - yb[1].m128 = XB[1]; - ya[2].m128 = XA[2]; - yb[2].m128 = XB[2]; - ya[3].m128 = XA[3]; - yb[3].m128 = XB[3]; + ya[0].v128 = XA[0]; + yb[0].v128 = XB[0]; + ya[1].v128 = XA[1]; + yb[1].v128 = XB[1]; + ya[2].v128 = XA[2]; + yb[2].v128 = XB[2]; + ya[3].v128 = XA[3]; + yb[3].v128 = XB[3]; za[0].u32[0] = ya[0].u32[0]; zb[0].u32[0] = yb[0].u32[0]; @@ -2449,14 +2449,14 @@ static inline void salsa_simd128_unshuffle_2buf( uint32_t* xa, uint32_t* xb ) za[3].u32[3] = ya[0].u32[3]; zb[3].u32[3] = yb[0].u32[3]; - XA[0] = za[0].m128; - XB[0] = zb[0].m128; - XA[1] = za[1].m128; - XB[1] = zb[1].m128; - XA[2] = za[2].m128; - XB[2] = zb[2].m128; - XA[3] = za[3].m128; - XB[3] = zb[3].m128; + XA[0] = za[0].v128; + XB[0] = zb[0].v128; + XA[1] = za[1].v128; + XB[1] = zb[1].v128; + XA[2] = za[2].v128; + XB[2] = zb[2].v128; + XA[3] = za[3].v128; + XB[3] = zb[3].v128; */ } @@ -2770,18 +2770,18 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb, /* v128_ovly ya[4], za[4], yb[4], zb[4], yc[4], zc[4]; - ya[0].m128 = XA[0]; - yb[0].m128 = XB[0]; - yc[0].m128 = XC[0]; - ya[1].m128 = XA[1]; - yb[1].m128 = XB[1]; - yc[1].m128 = XC[1]; - ya[2].m128 = XA[2]; - yb[2].m128 = XB[2]; - yc[2].m128 = XC[2]; - ya[3].m128 = XA[3]; - yb[3].m128 = XB[3]; - yc[3].m128 = XC[3]; + ya[0].v128 = XA[0]; + yb[0].v128 = XB[0]; + yc[0].v128 = XC[0]; + ya[1].v128 = XA[1]; + yb[1].v128 = XB[1]; + yc[1].v128 = XC[1]; + ya[2].v128 = XA[2]; + yb[2].v128 = XB[2]; + yc[2].v128 = XC[2]; + ya[3].v128 = XA[3]; + yb[3].v128 = XB[3]; + yc[3].v128 = XC[3]; za[0].u32[0] = ya[0].u32[0]; zb[0].u32[0] = yb[0].u32[0]; @@ -2835,18 +2835,18 @@ static inline void salsa_simd128_unshuffle_3buf( uint32_t* xa, uint32_t* xb, zb[3].u32[3] = yb[0].u32[3]; zc[3].u32[3] = yc[0].u32[3]; - XA[0] = za[0].m128; - XB[0] = zb[0].m128; - XC[0] = zc[0].m128; - XA[1] = za[1].m128; - XB[1] = zb[1].m128; - XC[1] = zc[1].m128; - XA[2] = za[2].m128; - XB[2] = zb[2].m128; - XC[2] = zc[2].m128; - XA[3] = za[3].m128; - XB[3] = zb[3].m128; - XC[3] = zc[3].m128; + XA[0] = za[0].v128; + XB[0] = zb[0].v128; + XC[0] = zc[0].v128; + 
XA[1] = za[1].v128; + XB[1] = zb[1].v128; + XC[1] = zc[1].v128; + XA[2] = za[2].v128; + XB[2] = zb[2].v128; + XC[2] = zc[2].v128; + XA[3] = za[3].v128; + XB[3] = zb[3].v128; + XC[3] = zc[3].v128; */ } @@ -3049,7 +3049,7 @@ static void xor_salsa8(uint32_t * const B, const uint32_t * const C) xf = (B[15] ^= C[15]); - #define ROL32( a, c ) ror32( a, c ) + #define ROL32( a, c ) rol32( a, c ) #define ADD32( a, b ) ( (a)+(b) ) #define XOR( a, b ) ( (a)^(b) ) diff --git a/algo/scrypt/scrypt.c b/algo/scrypt/scrypt.c index 13cd0aa4..00de1b6c 100644 --- a/algo/scrypt/scrypt.c +++ b/algo/scrypt/scrypt.c @@ -1481,7 +1481,7 @@ bool scrypt_miner_thread_init( int thr_id ) bool register_scrypt_algo( algo_gate_t* gate ) { #if defined(__SHA__) || defined(__ARM_FEATURE_SHA2) - gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT; + gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT; #else gate->optimizations = SSE2_OPT | SSE42_OPT | AVX_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT; #endif diff --git a/algo/sha/sha256d.c b/algo/sha/sha256d.c index 3c56ad44..0731e2aa 100644 --- a/algo/sha/sha256d.c +++ b/algo/sha/sha256d.c @@ -8,14 +8,14 @@ void sha256d( void *hash, const void *data, int len ) } bool register_sha256d_algo( algo_gate_t* gate ) { - gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT; + gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT; #if defined(SHA256D_16WAY) gate->scanhash = (void*)&scanhash_sha256d_16way; #elif defined(SHA256D_SHA) - gate->optimizations = SHA_OPT; + gate->optimizations = SSE2_OPT | SHA256_OPT; gate->scanhash = (void*)&scanhash_sha256d_sha; #elif defined(SHA256D_NEON_SHA2) - gate->optimizations = SHA_OPT; + gate->optimizations = NEON_OPT | SHA256_OPT; gate->scanhash = (void*)&scanhash_sha256d_neon_sha2; #elif defined(SHA256D_8WAY) gate->scanhash = (void*)&scanhash_sha256d_8way; diff --git a/algo/sha/sha256dt.c b/algo/sha/sha256dt.c index 9a906e00..cb67dbd7 100644 --- a/algo/sha/sha256dt.c +++ b/algo/sha/sha256dt.c @@ -500,10 +500,10 @@ bool register_sha256dt_algo( algo_gate_t* gate ) #if defined(SHA256DT_16X32) gate->scanhash = (void*)&scanhash_sha256dt_16x32; #elif defined(SHA256DT_X86_SHA256) - gate->optimizations = SHA_OPT; + gate->optimizations = SSE2_OPT | SHA256_OPT; gate->scanhash = (void*)&scanhash_sha256dt_x86_x2sha; #elif defined(SHA256DT_NEON_SHA256) - gate->optimizations = SHA_OPT; + gate->optimizations = NEON_OPT | SHA256_OPT; gate->scanhash = (void*)&scanhash_sha256dt_neon_x2sha; #elif defined(SHA256DT_8X32) gate->scanhash = (void*)&scanhash_sha256dt_8x32; diff --git a/algo/sha/sha256t-gate.c b/algo/sha/sha256t-gate.c index 7d239ebb..0651f8bf 100644 --- a/algo/sha/sha256t-gate.c +++ b/algo/sha/sha256t-gate.c @@ -6,9 +6,10 @@ bool register_sha256t_algo( algo_gate_t* gate ) #if defined(SHA256T_16WAY) gate->scanhash = (void*)&scanhash_sha256t_16way; #elif defined(SHA256T_SHA) - gate->optimizations = SHA_OPT; + gate->optimizations = SSE2_OPT | SHA256_OPT; gate->scanhash = (void*)&scanhash_sha256t_sha; #elif defined(SHA256T_NEON_SHA2) + gate->optimizations = NEON_OPT | SHA256_OPT; gate->scanhash = (void*)&scanhash_sha256t_neon_sha2; #elif defined(SHA256T_8WAY) gate->scanhash = (void*)&scanhash_sha256t_8way; @@ -28,7 +29,7 @@ bool register_sha256q_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_sha256q_16way; gate->hash = (void*)&sha256q_16way_hash; //#elif defined(SHA256T_SHA) -// gate->optimizations = SHA_OPT; +// gate->optimizations = SHA256_OPT; // gate->scanhash = (void*)&scanhash_sha256q; // gate->hash = (void*)&sha256q_hash; #elif 
defined(SHA256T_8WAY) diff --git a/algo/sha/sha512-hash-4way.c b/algo/sha/sha512-hash-4way.c index 381219e9..0f51be74 100644 --- a/algo/sha/sha512-hash-4way.c +++ b/algo/sha/sha512-hash-4way.c @@ -71,12 +71,13 @@ static const uint64_t K512[80] = // SHA-512 implemented using SHA512 CPU extension. -// Experimental. Not tested. Not reviewed. Compile tested only. +// Experimental. Not supported. Not tested. Not reviewed. Compile tested only. +// Modelled after noloader sha256 implementation, replacing 4x32 bit +// instructions with equivalent 4x64 bit instructions and increasing rounds +// to 80. // Needs GCC-14 for compilation. // Needs Intel Lunarlake or Arrowlake CPU, or AMD Zen-6? for execution. -// Modelled after noloader sha256 implementation. - void sha512_opt_transform_be( uint64_t *state_out, const void *input, const uint64_t *state_in ) @@ -571,6 +572,20 @@ void sha512_opt_transform_le( uint64_t *state_out, const void *input, #endif +/* +#if defined(__ARM_FEATURE_NEON) && defined(__ARM_FEATURE_SHA512) + +uint64x2_t sha512_compile_test( uint64x2_t test ) +{ + test = vsha512hq_u64( test, test, test ); + test = vsha512h2q_u64( test, test, test ); + test = vsha512su0q_u64( test, test ); + test = vsha512su1q_u64( test, test, test ); + return test; +} + +#endif +*/ #if defined(SIMD512) diff --git a/algo/skein/skein-gate.c b/algo/skein/skein-gate.c index ee964d04..10b777dc 100644 --- a/algo/skein/skein-gate.c +++ b/algo/skein/skein-gate.c @@ -8,15 +8,15 @@ bool register_skein_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_skein_8way; gate->hash = (void*)&skeinhash_8way; #elif defined(SKEIN_4WAY) - gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT; + gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA256_OPT | NEON_OPT; gate->scanhash = (void*)&scanhash_skein_4way; gate->hash = (void*)&skeinhash_4way; #elif defined(SKEIN_2WAY) - gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT; + gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA256_OPT | NEON_OPT; gate->scanhash = (void*)&scanhash_skein_2x64; gate->hash = (void*)&skeinhash_2x64; #else - gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA_OPT | NEON_OPT; + gate->optimizations = SSE2_OPT | AVX2_OPT | AVX512_OPT | SHA256_OPT | NEON_OPT; gate->scanhash = (void*)&scanhash_skein; gate->hash = (void*)&skeinhash; #endif diff --git a/algo/sm3/sm3-hash-4way.c b/algo/sm3/sm3-hash-4way.c index 16adc9f5..e9b1c3de 100644 --- a/algo/sm3/sm3-hash-4way.c +++ b/algo/sm3/sm3-hash-4way.c @@ -240,10 +240,10 @@ void sm3_8way_close( void *cc, void *dst ) #if defined(__SSE2__) -#define P0(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x, 9 ), \ - mm128_rol_32( x, 17 ) ) ) -#define P1(x) _mm_xor_si128( x, _mm_xor_si128( mm128_rol_32( x, 15 ), \ - mm128_rol_32( x, 23 ) ) ) +#define P0(x) _mm_xor_si128( x, _mm_xor_si128( v128_rol32( x, 9 ), \ + v128_rol32( x, 17 ) ) ) +#define P1(x) _mm_xor_si128( x, _mm_xor_si128( v128_rol32( x, 15 ), \ + v128_rol32( x, 23 ) ) ) #define FF0(x,y,z) _mm_xor_si128( x, _mm_xor_si128( y, z ) ) #define FF1(x,y,z) _mm_or_si128( _mm_or_si128( _mm_and_si128( x, y ), \ @@ -273,13 +273,13 @@ void sm3_4way_compress( __m128i *digest, __m128i *block ) int j; for ( j = 0; j < 16; j++ ) - W[j] = mm128_bswap_32( block[j] ); + W[j] = v128_bswap32( block[j] ); for ( j = 16; j < 68; j++ ) W[j] = _mm_xor_si128( P1( _mm_xor_si128( _mm_xor_si128( W[ j-16 ], W[ j-9 ] ), - mm128_rol_32( W[ j-3 ], 15 ) ) ), - _mm_xor_si128( mm128_rol_32( W[ j-13 ], 7 ), + 
v128_rol32( W[ j-3 ], 15 ) ) ), + _mm_xor_si128( v128_rol32( W[ j-13 ], 7 ), W[ j-6 ] ) ); for( j = 0; j < 64; j++ ) @@ -288,19 +288,19 @@ void sm3_4way_compress( __m128i *digest, __m128i *block ) T = _mm_set1_epi32( 0x79CC4519UL ); for( j =0; j < 16; j++ ) { - SS1 = mm128_rol_32( _mm_add_epi32( _mm_add_epi32( mm128_rol_32(A,12), E ), + SS1 = v128_rol32( _mm_add_epi32( _mm_add_epi32( v128_rol32(A,12), E ), mm128_rol_var_32( T, j ) ), 7 ); - SS2 = _mm_xor_si128( SS1, mm128_rol_32( A, 12 ) ); + SS2 = _mm_xor_si128( SS1, v128_rol32( A, 12 ) ); TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF0( A, B, C ), D ), SS2 ), W1[j] ); TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG0( E, F, G ), H ), SS1 ), W[j] ); D = C; - C = mm128_rol_32( B, 9 ); + C = v128_rol32( B, 9 ); B = A; A = TT1; H = G; - G = mm128_rol_32( F, 19 ); + G = v128_rol32( F, 19 ); F = E; E = P0( TT2 ); } @@ -308,19 +308,19 @@ void sm3_4way_compress( __m128i *digest, __m128i *block ) T = _mm_set1_epi32( 0x7A879D8AUL ); for( j =16; j < 64; j++ ) { - SS1 = mm128_rol_32( _mm_add_epi32( _mm_add_epi32( mm128_rol_32(A,12), E ), + SS1 = v128_rol32( _mm_add_epi32( _mm_add_epi32( v128_rol32(A,12), E ), mm128_rol_var_32( T, j&31 ) ), 7 ); - SS2 = _mm_xor_si128( SS1, mm128_rol_32( A, 12 ) ); + SS2 = _mm_xor_si128( SS1, v128_rol32( A, 12 ) ); TT1 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( FF1( A, B, C ), D ), SS2 ), W1[j] ); TT2 = _mm_add_epi32( _mm_add_epi32( _mm_add_epi32( GG1( E, F, G ), H ), SS1 ), W[j] ); D = C; - C = mm128_rol_32( B, 9 ); + C = v128_rol32( B, 9 ); B = A; A = TT1; H = G; - G = mm128_rol_32( F, 19 ); + G = v128_rol32( F, 19 ); F = E; E = P0( TT2 ); } @@ -408,14 +408,14 @@ void sm3_4way_close( void *cc, void *dst ) memset_zero_128( block, ( SM3_BLOCK_SIZE - 8 ) >> 2 ); } - count[0] = mm128_bswap_32( + count[0] = v128_bswap32( _mm_set1_epi32( ctx->nblocks >> 23 ) ); - count[1] = mm128_bswap_32( _mm_set1_epi32( ( ctx->nblocks << 9 ) + - ( ctx->num << 3 ) ) ); + count[1] = v128_bswap32( _mm_set1_epi32( ( ctx->nblocks << 9 ) + + ( ctx->num << 3 ) ) ); sm3_4way_compress( ctx->digest, block ); for ( i = 0; i < 8 ; i++ ) - hash[i] = mm128_bswap_32( ctx->digest[i] ); + hash[i] = v128_bswap32( ctx->digest[i] ); } #endif diff --git a/algo/verthash/Verthash.c b/algo/verthash/Verthash.c index cbd8d84d..9b7d5603 100644 --- a/algo/verthash/Verthash.c +++ b/algo/verthash/Verthash.c @@ -137,53 +137,8 @@ void verthash_info_free(verthash_info_t* info) #define VH_N_INDEXES 4096 #define VH_BYTE_ALIGNMENT 16 -static inline uint32_t fnv1a(const uint32_t a, const uint32_t b) -{ - return (a ^ b) * 0x1000193; -} - -#if 0 -static void rotate_indexes( uint32_t *p ) -{ -#if defined(__AVX2__) - - for ( size_t x = 0; x < VH_N_SUBSET / sizeof(__m256i); x += 8 ) - { - __m256i *px = (__m256i*)p + x; - - px[0] = mm256_rol_32( px[0], 1 ); - px[1] = mm256_rol_32( px[1], 1 ); - px[2] = mm256_rol_32( px[2], 1 ); - px[3] = mm256_rol_32( px[3], 1 ); - px[4] = mm256_rol_32( px[4], 1 ); - px[5] = mm256_rol_32( px[5], 1 ); - px[6] = mm256_rol_32( px[6], 1 ); - px[7] = mm256_rol_32( px[7], 1 ); - } - -#else +#define fnv1a( a, b ) ( ( (a) ^ (b) ) * 0x1000193 ) - for ( size_t x = 0; x < VH_N_SUBSET / sizeof(__m128i); x += 8 ) - { - __m128i *px = (__m128i*)p0_index + x; - - px[0] = mm128_rol_32( px[0], 1 ); - px[1] = mm128_rol_32( px[1], 1 ); - px[2] = mm128_rol_32( px[2], 1 ); - px[3] = mm128_rol_32( px[3], 1 ); - px[4] = mm128_rol_32( px[4], 1 ); - px[5] = mm128_rol_32( px[5], 1 ); - px[6] = mm128_rol_32( px[6], 1 ); - px[7] = mm128_rol_32( px[7], 1 ); - 
} - -#endif -/* - for ( size_t x = 0; x < VH_N_SUBSET / sizeof(uint32_t); ++x ) - p[x] = ( p[x] << 1 ) | ( p[x] >> 31 ); -*/ -} -#endif // Vectorized and targeted version of fnv1a #if defined (__AVX2__) *(__m256i*)hash = _mm256_mullo_epi32( _mm256_xor_si256( \ *(__m256i*)hash, *(__m256i*)blob_off ), k ); -#elif defined(__SSE4_1__) || defined(__ARM_NEON) +#elif defined(__SSE4_1__) || defined(__ARM_NEON) #define MULXOR \ casti_v128( hash, 0 ) = v128_mul32( v128_xor( \ @@ -229,7 +184,7 @@ for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \ MULXOR; \ } -// subsequent passes rotate by r on demand, no need for mass rotate +// subsequent passes rotate by r #define ROUND_r( r ) \ for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \ { \ @@ -243,8 +198,8 @@ for ( size_t i = 0; i < VH_N_SUBSET / sizeof(uint32_t); i++ ) \ void verthash_hash( const void *blob_bytes, const size_t blob_size, const void *input, void *output ) { - uint32_t hash[ VH_HASH_OUT_SIZE / 4 ] __attribute__ ((aligned (64))); uint32_t subset[ VH_N_SUBSET / 4 ] __attribute__ ((aligned (64))); + uint32_t hash[ VH_HASH_OUT_SIZE / 4 ] __attribute__ ((aligned (32))); const uint32_t *blob = (const uint32_t*)blob_bytes; uint32_t accumulator = 0x811c9dc5; const uint32_t mdiv = ( ( blob_size - VH_HASH_OUT_SIZE ) diff --git a/algo/verthash/verthash-gate.c b/algo/verthash/verthash-gate.c index 3238fb7d..6a8f7b96 100644 --- a/algo/verthash/verthash-gate.c +++ b/algo/verthash/verthash-gate.c @@ -91,8 +91,8 @@ void verthash_sha3_512_final_8( void *hash, const uint64_t nonce ) int scanhash_verthash( struct work *work, uint32_t max_nonce, uint64_t *hashes_done, struct thr_info *mythr ) { - uint32_t edata[20] __attribute__((aligned(64))); uint32_t hash[8] __attribute__((aligned(64))); + uint32_t edata[20] __attribute__((aligned(32))); uint32_t *pdata = work->data; const uint32_t *ptarget = work->target; const uint32_t first_nonce = pdata[19]; @@ -101,9 +101,7 @@ int scanhash_verthash( struct work *work, uint32_t max_nonce, const int thr_id = mythr->id; const bool bench = opt_benchmark; - for (int i = 0; i < 20; i++) - edata[i] = bswap_32( pdata[i] ); -// v128_bswap32_80( edata, pdata ); + v128_bswap32_80( edata, pdata ); verthash_sha3_512_prehash_72( edata ); do diff --git a/algo/x16/minotaur.c b/algo/x16/minotaur.c index bd662530..f4ea3071 100644 --- a/algo/x16/minotaur.c +++ b/algo/x16/minotaur.c @@ -318,7 +318,7 @@ bool register_minotaur_algo( algo_gate_t* gate ) gate->hash = (void*)&minotaur_hash; gate->miner_thread_init = (void*)&initialize_torture_garden; gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | AVX512_OPT | NEON_OPT; - if ( opt_algo == ALGO_MINOTAURX ) gate->optimizations |= SHA_OPT; + if ( opt_algo == ALGO_MINOTAURX ) gate->optimizations |= SHA256_OPT; return true; }; diff --git a/algo/x22/x22i-gate.c b/algo/x22/x22i-gate.c index 432cf949..755f61d7 100644 --- a/algo/x22/x22i-gate.c +++ b/algo/x22/x22i-gate.c @@ -31,7 +31,7 @@ bool register_x22i_algo( algo_gate_t* gate ) #endif - gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA_OPT + gate->optimizations = SSE2_OPT | SSE42_OPT | AES_OPT | AVX2_OPT | SHA256_OPT | AVX512_OPT | VAES_OPT | NEON_OPT; return true; }; @@ -48,7 +48,7 @@ bool register_x25x_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x25x; gate->hash = (void*)&x25x_hash; #endif - gate->optimizations = SSE2_OPT | SSE42_OPT |
AES_OPT | AVX2_OPT | SHA256_OPT | AVX512_OPT | VAES_OPT | NEON_OPT; InitializeSWIFFTX(); return true; diff --git a/algo/yespower/yescrypt-r8g.c b/algo/yespower/yescrypt-r8g.c index 4e42e800..0d875298 100644 --- a/algo/yespower/yescrypt-r8g.c +++ b/algo/yespower/yescrypt-r8g.c @@ -71,7 +71,7 @@ int scanhash_yespower_r8g( struct work *work, uint32_t max_nonce, bool register_yescryptr8g_algo( algo_gate_t* gate ) { - gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT; + gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT; gate->scanhash = (void*)&scanhash_yespower_r8g; #if (__SSE2__) || defined(__aarch64__) gate->hash = (void*)&yespower_hash; diff --git a/algo/yespower/yespower-gate.c b/algo/yespower/yespower-gate.c index 2c26b0ec..4613a53f 100644 --- a/algo/yespower/yespower-gate.c +++ b/algo/yespower/yespower-gate.c @@ -162,7 +162,7 @@ bool register_yespower_algo( algo_gate_t* gate ) if ( yespower_params.pers ) applog( LOG_NOTICE,"Key= \"%s\"\n", yespower_params.pers ); - gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT; + gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT; gate->scanhash = (void*)&scanhash_yespower; #if (__SSE2__) || defined(__aarch64__) gate->hash = (void*)&yespower_hash; @@ -180,7 +180,7 @@ bool register_yespowerr16_algo( algo_gate_t* gate ) yespower_params.r = 16; yespower_params.pers = NULL; yespower_params.perslen = 0; - gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT; + gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT; gate->scanhash = (void*)&scanhash_yespower; #if (__SSE2__) || defined(__aarch64__) gate->hash = (void*)&yespower_hash; @@ -195,7 +195,7 @@ bool register_yescrypt_algo( algo_gate_t* gate ) { - gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT; + gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT; gate->scanhash = (void*)&scanhash_yespower; #if (__SSE2__) || defined(__aarch64__) gate->hash = (void*)&yespower_hash; @@ -233,7 +233,7 @@ bool register_yescrypt_algo( algo_gate_t* gate ) bool register_yescryptr8_algo( algo_gate_t* gate ) { - gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT; + gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT; gate->scanhash = (void*)&scanhash_yespower; #if (__SSE2__) || defined(__aarch64__) gate->hash = (void*)&yespower_hash; @@ -251,7 +251,7 @@ bool register_yescryptr8_algo( algo_gate_t* gate ) bool register_yescryptr16_algo( algo_gate_t* gate ) { - gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT; + gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT; gate->scanhash = (void*)&scanhash_yespower; #if (__SSE2__) || defined(__aarch64__) gate->hash = (void*)&yespower_hash; @@ -269,7 +269,7 @@ bool register_yescryptr16_algo( algo_gate_t* gate ) bool register_yescryptr32_algo( algo_gate_t* gate ) { - gate->optimizations = SSE2_OPT | SHA_OPT | NEON_OPT; + gate->optimizations = SSE2_OPT | SHA256_OPT | NEON_OPT; gate->scanhash = (void*)&scanhash_yespower; #if (__SSE2__) || defined(__aarch64__) gate->hash = (void*)&yespower_hash; diff --git a/armbuild-all.sh b/armbuild-all.sh index 61690aac..1f91ecad 100755 --- a/armbuild-all.sh +++ b/armbuild-all.sh @@ -4,7 +4,7 @@ # during development. However the information contained may provide compilation # tips to users.
-rm cpuminer cpuminer-armv9-aes-sha3 cpuminer-armv9-aes-sha3-sve2 cpuminer-armv8.2-aes-sha3-sve2 cpuminer-armv8-aes-sha2-sve2 cpuminer-armv8 cpuminer-armv8-crypto cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2 cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2-sha cpuminer-avx2-sha-vaes cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 > /dev/null +rm cpuminer cpuminer-armv9-aes-sha3 cpuminer-armv9-aes-sha3-sve2 cpuminer-armv8.5-aes-sha3-sve2 cpuminer-armv8.4-aes-sha3 cpuminer-armv8-aes-sha2 cpuminer-armv8 cpuminer-armv8-crypto cpuminer-armv8-aes cpuminer-armv8-sha2 cpuminer-armv8-aes-sha2 cpuminer-avx512-sha-vaes cpuminer-avx512 cpuminer-avx2-sha cpuminer-avx2-sha-vaes cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse42 cpuminer-ssse3 cpuminer-sse2 cpuminer-zen cpuminer-zen3 cpuminer-zen4 cpuminer-alderlake cpuminer-x64 > /dev/null # armv9 needs gcc-13 @@ -16,26 +16,28 @@ make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-armv9-aes-sha3 -make clean || echo clean -CFLAGS="-O3 -march=armv9-a+crypto+sha3+aes+sve2 -Wall -flax-vector-conversions" ./configure --with-curl -make -j $(nproc) -strip -s cpuminer -mv cpuminer cpuminer-armv9-aes-sha3-sve2 +#make clean || echo clean +#CFLAGS="-O3 -march=armv9-a+crypto+sha3+aes+sve2 -Wall -flax-vector-conversions" ./configure --with-curl +#make -j $(nproc) +#strip -s cpuminer +#mv cpuminer cpuminer-armv9-aes-sha3-sve2 -make clean || echo clean -CFLAGS="-O3 -march=armv8.2-a+crypto+sha3+aes+sve2 -Wall -flax-vector-conversions" ./configure --with-curl -make -j $(nproc) -strip -s cpuminer -mv cpuminer cpuminer-armv8.2-aes-sha3-sve2 +# SVE2 available in armv8.5 +#make clean || echo clean +#CFLAGS="-O3 -march=armv8.5-a+crypto+sha3+aes+sve2 -Wall -flax-vector-conversions" ./configure --with-curl +#make -j $(nproc) +#strip -s cpuminer +#mv cpuminer cpuminer-armv8.5-aes-sha3-sve2 +# SHA3 available in armv8.4 make clean || echo clean -CFLAGS="-O3 -march=armv8-a+crypto+sha2+aes+sve2 -Wall -flax-vector-conversions" ./configure --with-curl +CFLAGS="-O3 -march=armv8.4-a+crypto+sha3+aes -Wall -flax-vector-conversions" ./configure --with-curl make -j $(nproc) strip -s cpuminer -mv cpuminer cpuminer-armv8-aes-sha2-sve2 +mv cpuminer cpuminer-armv8.4-aes-sha3 make clean || echo clean -CFLAGS="-O3 -march=armv8-a+crypto+sha2+aes -Wall -flax-vector-conversions" ./configure --with-curl +CFLAGS="-O3 -march=armv8-a+crypto+sha2+aes -Wall -flax-vector-conversions" ./configure --with-curl make -j $(nproc) strip -s cpuminer mv cpuminer cpuminer-armv8-aes-sha2 diff --git a/clean-all.sh b/clean-all.sh index 98979c28..0567c1e3 100755 --- a/clean-all.sh +++ b/clean-all.sh @@ -2,7 +2,7 @@ # # make clean and rm all the targeted executables.
-rm cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 cpuminer-armv9-aes-sha3 cpuminer-armv9-aes-sha3-sve2 cpuminer-armv8.2-aes-sha3-sve2 cpuminer-armv8-aes-sha3-sve2 cpuminer-armv8-aes-sha2-sve2 cpuminer-armv8-crypto cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-aes-sha3 cpuminer-armv8-aes-sha2 cpuminer-armv8-sha2 > /dev/null +rm cpuminer-avx512-sha-vaes cpuminer-alderlake cpuminer-avx512 cpuminer-avx2 cpuminer-avx cpuminer-aes-sse42 cpuminer-sse2 cpuminer-avx2-sha cpuminer-sse42 cpuminer-ssse3 cpuminer-avx2-sha-vaes cpuminer-zen3 cpuminer-zen4 cpuminer-x64 cpuminer-armv9-aes-sha3 cpuminer-armv9-aes-sha3-sve2 cpuminer-armv8.4-aes-sha3 cpuminer-armv8.5-aes-sha3-sve2 cpuminer-armv8-crypto cpuminer-armv8 cpuminer-armv8-aes cpuminer-armv8-aes-sha3 cpuminer-armv8-aes-sha2 cpuminer-armv8-sha2 > /dev/null rm cpuminer-avx512-sha-vaes.exe cpuminer-avx512-sha.exe cpuminer-avx512.exe cpuminer-avx2.exe cpuminer-avx.exe cpuminer-aes-sse42.exe cpuminer-sse2.exe cpuminer-avx2-sha.exe cpuminer-sse42.exe cpuminer-ssse3.exe cpuminer-avx2-sha-vaes.exe cpuminer-zen3.exe cpuminer-zen4.exe cpuminer-x64.exe > /dev/null diff --git a/configure b/configure index b7f7c371..9c49c0eb 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.2. +# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.3. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -608,8 +608,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='24.2' -PACKAGE_STRING='cpuminer-opt 24.2' +PACKAGE_VERSION='24.3' +PACKAGE_STRING='cpuminer-opt 24.3' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 24.2 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 24.3 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1432,7 +1432,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 24.2:";; + short | recursive ) echo "Configuration of cpuminer-opt 24.3:";; esac cat <<\_ACEOF @@ -1538,7 +1538,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 24.2 +cpuminer-opt configure 24.3 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 24.2, which was +It was created by cpuminer-opt $as_me 24.3, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -3593,7 +3593,7 @@ fi # Define the identity of the package. 
PACKAGE='cpuminer-opt' - VERSION='24.2' + VERSION='24.3' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -5810,11 +5810,11 @@ if test x$ac_prog_cxx_stdcxx = xno then : { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CXX option to enable C++11 features" >&5 printf %s "checking for $CXX option to enable C++11 features... " >&6; } -if test ${ac_cv_prog_cxx_11+y} +if test ${ac_cv_prog_cxx_cxx11+y} then : printf %s "(cached) " >&6 else $as_nop - ac_cv_prog_cxx_11=no + ac_cv_prog_cxx_cxx11=no ac_save_CXX=$CXX cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ @@ -5856,11 +5856,11 @@ if test x$ac_prog_cxx_stdcxx = xno then : { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for $CXX option to enable C++98 features" >&5 printf %s "checking for $CXX option to enable C++98 features... " >&6; } -if test ${ac_cv_prog_cxx_98+y} +if test ${ac_cv_prog_cxx_cxx98+y} then : printf %s "(cached) " >&6 else $as_nop - ac_cv_prog_cxx_98=no + ac_cv_prog_cxx_cxx98=no ac_save_CXX=$CXX cat confdefs.h - <<_ACEOF >conftest.$ac_ext /* end confdefs.h. */ @@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 24.2, which was +This file was extended by cpuminer-opt $as_me 24.3, which was generated by GNU Autoconf 2.71. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -cpuminer-opt config.status 24.2 +cpuminer-opt config.status 24.3 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index ff354444..ccca75bc 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [24.2]) +AC_INIT([cpuminer-opt], [24.3]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/configure~ b/configure~ index dd6ce7fc..b7f7c371 100755 --- a/configure~ +++ b/configure~ @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.1. +# Generated by GNU Autoconf 2.71 for cpuminer-opt 24.2. # # # Copyright (C) 1992-1996, 1998-2017, 2020-2021 Free Software Foundation, @@ -608,8 +608,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='24.1' -PACKAGE_STRING='cpuminer-opt 24.1' +PACKAGE_VERSION='24.2' +PACKAGE_STRING='cpuminer-opt 24.2' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1360,7 +1360,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 24.1 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 24.2 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... 
@@ -1432,7 +1432,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 24.1:";; + short | recursive ) echo "Configuration of cpuminer-opt 24.2:";; esac cat <<\_ACEOF @@ -1538,7 +1538,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 24.1 +cpuminer-opt configure 24.2 generated by GNU Autoconf 2.71 Copyright (C) 2021 Free Software Foundation, Inc. @@ -1985,7 +1985,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 24.1, which was +It was created by cpuminer-opt $as_me 24.2, which was generated by GNU Autoconf 2.71. Invocation command line was $ $0$ac_configure_args_raw @@ -3593,7 +3593,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='24.1' + VERSION='24.2' printf "%s\n" "#define PACKAGE \"$PACKAGE\"" >>confdefs.h @@ -7508,7 +7508,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 24.1, which was +This file was extended by cpuminer-opt $as_me 24.2, which was generated by GNU Autoconf 2.71. Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -7576,7 +7576,7 @@ ac_cs_config_escaped=`printf "%s\n" "$ac_cs_config" | sed "s/^ //; s/'/'\\\\\\\\ cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config='$ac_cs_config_escaped' ac_cs_version="\\ -cpuminer-opt config.status 24.1 +cpuminer-opt config.status 24.2 configured by $0, generated by GNU Autoconf 2.71, with options \\"\$ac_cs_config\\" diff --git a/cpu-miner.c b/cpu-miner.c index 7ccbab0f..e84dcde1 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -2840,40 +2840,42 @@ static void show_credits() static bool cpu_capability( bool display_only ) { char cpu_brand[0x40]; - bool cpu_has_aarch64 = cpu_arch_aarch64(); - bool cpu_has_x86_64 = cpu_arch_x86_64(); - bool cpu_has_sse2 = has_sse2(); // X86_64 only - bool cpu_has_ssse3 = has_ssse3(); // X86_64 only - bool cpu_has_sse41 = has_sse41(); // X86_64 only - bool cpu_has_sse42 = has_sse42(); - bool cpu_has_avx = has_avx(); - bool cpu_has_avx2 = has_avx2(); - bool cpu_has_avx512 = has_avx512(); - bool cpu_has_avx10 = has_avx10(); - bool cpu_has_aes = has_aes_ni(); // x86_64 or AArch64 AES - bool cpu_has_vaes = has_vaes(); - bool cpu_has_sha256 = has_sha(); // x86_64 or AArch64 - bool cpu_has_sha512 = has_sha512(); - bool sw_has_x86_64 = false; - bool sw_has_aarch64 = false; - int sw_arm_arch = 0; // AArch64 - bool sw_has_neon = false; // AArch64 -// bool sw_has_sve = false; // AArch64 -// bool sw_has_sve2 = false; // AArch64 - bool sw_has_sse2 = false; // x86_64 - bool sw_has_ssse3 = false; // x86_64 - bool sw_has_sse41 = false; // x86_64 - bool sw_has_sse42 = false; - bool sw_has_avx = false; - bool sw_has_avx2 = false; - bool sw_has_avx512 = false; + bool cpu_has_aarch64 = cpu_arch_aarch64(); + bool cpu_has_x86_64 = cpu_arch_x86_64(); + bool cpu_has_sse2 = has_sse2(); // X86_64 only + bool cpu_has_ssse3 = has_ssse3(); // X86_64 only + bool cpu_has_sse41 = has_sse41(); // X86_64 only + bool cpu_has_sse42 = has_sse42(); + bool cpu_has_avx = has_avx(); +// bool cpu_has_sve = has_sve(); // aarch64 only +// bool cpu_has_sve2 = has_sve2(); + bool cpu_has_avx2 = has_avx2(); + bool cpu_has_avx512 = has_avx512(); + bool cpu_has_avx10 = 
has_avx10(); + bool cpu_has_aes = has_aes(); // x86_64 or AArch64 + bool cpu_has_vaes = has_vaes(); + bool cpu_has_sha256 = has_sha256(); // x86_64 or AArch64 + bool cpu_has_sha512 = has_sha512(); + bool sw_has_x86_64 = false; + bool sw_has_aarch64 = false; + int sw_arm_arch = 0; // AArch64 + bool sw_has_neon = false; // AArch64 +// bool sw_has_sve = false; // AArch64 +// bool sw_has_sve2 = false; // AArch64 + bool sw_has_sse2 = false; // x86_64 + bool sw_has_ssse3 = false; // x86_64 + bool sw_has_sse41 = false; // x86_64 + bool sw_has_sse42 = false; + bool sw_has_avx = false; + bool sw_has_avx2 = false; + bool sw_has_avx512 = false; bool sw_has_avx10_256 = false; bool sw_has_avx10_512 = false; - bool sw_has_aes = false; - bool sw_has_vaes = false; - bool sw_has_sha256 = false; // x86_64 or AArch64 SHA2 - bool sw_has_sha512 = false; // x86_64 or AArch64 SHA3 - set_t algo_features = algo_gate.optimizations; + bool sw_has_aes = false; + bool sw_has_vaes = false; + bool sw_has_sha256 = false; // x86_64 or AArch64 SHA2 + bool sw_has_sha512 = false; // x86_64 or AArch64 SHA3 + set_t algo_features = algo_gate.optimizations; bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features ); bool algo_has_sse42 = set_incl( SSE42_OPT, algo_features ); bool algo_has_avx = set_incl( AVX_OPT, algo_features ); @@ -2881,7 +2883,7 @@ static bool cpu_capability( bool display_only ) bool algo_has_avx512 = set_incl( AVX512_OPT, algo_features ); bool algo_has_aes = set_incl( AES_OPT, algo_features ); bool algo_has_vaes = set_incl( VAES_OPT, algo_features ); - bool algo_has_sha256 = set_incl( SHA_OPT, algo_features ); + bool algo_has_sha256 = set_incl( SHA256_OPT, algo_features ); bool algo_has_sha512 = set_incl( SHA512_OPT, algo_features ); bool algo_has_neon = set_incl( NEON_OPT, algo_features ); bool use_sse2; @@ -2896,7 +2898,6 @@ static bool cpu_capability( bool display_only ) bool use_neon; bool use_none; - // x86_64 #if defined(__x86_64__) sw_has_x86_64 = true; #elif defined(__aarch64__) @@ -2908,6 +2909,7 @@ static bool cpu_capability( bool display_only ) sw_arm_arch = __ARM_ARCH; #endif #endif + // x86_64_only #if defined(__SSE2__) sw_has_sse2 = true; #endif @@ -2935,7 +2937,7 @@ static bool cpu_capability( bool display_only ) #if defined(__AVX10_1_512__) sw_has_avx10_512 = true; #endif - + // x86_64 or AArch64 #if defined(__AES__) || defined(__ARM_FEATURE_AES) sw_has_aes = true; #endif @@ -2945,9 +2947,10 @@ static bool cpu_capability( bool display_only ) #if defined(__SHA__) || defined(__ARM_FEATURE_SHA2) sw_has_sha256 = true; #endif - #if defined(__SHA512__) || defined(__ARM_FEATURE_SHA3) + #if defined(__SHA512__) || defined(__ARM_FEATURE_SHA512) sw_has_sha512 = true; #endif + // AArch64 only #if defined(__ARM_NEON) sw_has_neon = true; #endif @@ -2986,21 +2989,26 @@ static bool cpu_capability( bool display_only ) printf("CPU features: "); if ( cpu_has_x86_64 ) { - printf( " x86_64" ); - if ( cpu_has_avx512 ) printf( " AVX512" ); - else if ( cpu_has_avx2 ) printf( " AVX2 " ); - else if ( cpu_has_avx ) printf( " AVX " ); - else if ( cpu_has_sse42 ) printf( " SSE4.2" ); - else if ( cpu_has_sse41 ) printf( " SSE4.1" ); - else if ( cpu_has_ssse3 ) printf( " SSSE3 " ); - else if ( cpu_has_sse2 ) printf( " SSE2 " ); + printf( " x86_64" ); + if ( cpu_has_avx512 ) printf( " AVX512" ); + else if ( cpu_has_avx2 ) printf( " AVX2 " ); + else if ( cpu_has_avx ) printf( " AVX " ); + else if ( cpu_has_sse42 ) printf( " SSE4.2" ); + else if ( cpu_has_sse41 ) printf( " SSE4.1" ); + else if ( cpu_has_ssse3 ) printf( " SSSE3 
" ); + else if ( cpu_has_sse2 ) printf( " SSE2 " ); + } + else if ( cpu_has_aarch64 ) + { + printf( " AArch64 NEON" ); // NEON assumed +// if ( cpu_has_sve2 ) printf( " SVE2 " ); +// else if ( cpu_has_sve ) printf( " SVE " ); } - else if ( cpu_has_aarch64 ) printf( " AArch64 NEON" ); // NEON assumed - if ( cpu_has_vaes ) printf( " VAES" ); - else if ( cpu_has_aes ) printf( " AES" ); - if ( cpu_has_sha512 ) printf( " SHA512" ); - else if ( cpu_has_sha256 ) printf( " SHA256" ); - if ( cpu_has_avx10 ) printf( " AVX10.%d-%d", + if ( cpu_has_vaes ) printf( " VAES" ); + else if ( cpu_has_aes ) printf( " AES" ); + if ( cpu_has_sha512 ) printf( " SHA512" ); + else if ( cpu_has_sha256 ) printf( " SHA256" ); + if ( cpu_has_avx10 ) printf( " AVX10.%d-%d", avx10_version(), avx10_vector_length() ); printf("\nSW features: "); @@ -3022,8 +3030,8 @@ static bool cpu_capability( bool display_only ) printf( " AArch64" ); if ( sw_arm_arch ) printf( " armv%d", sw_arm_arch ); if ( sw_has_neon ) printf( " NEON" ); -// if ( sw_has_sve ) printf( " SVE" ); -// else if ( sw_has_sve2 ) printf( " SVE2" ); +// if ( sw_has_sve2 ) printf( " SVE2" ); +// else if ( sw_has_sve ) printf( " SVE" ); } if ( sw_has_vaes ) printf( " VAES" ); @@ -3052,35 +3060,6 @@ static bool cpu_capability( bool display_only ) if ( display_only ) return true; -/* - // Check for CPU and build incompatibilities - if ( !cpu_has_sse2 && !cpu_has_aarch64 ) - { - printf( "A CPU with SSE2 is required to use cpuminer-opt\n" ); - return false; - } - if ( sw_has_avx2 && !( cpu_has_avx2 && cpu_has_aes ) ) - { - printf( "The SW build requires a CPU with AES and AVX2!\n" ); - return false; - } - if ( sw_has_sse42 && !cpu_has_sse42 ) - { - printf( "The SW build requires a CPU with SSE4.2!\n" ); - return false; - } - if ( sw_has_aes && !cpu_has_aes ) - { - printf( "The SW build requires a CPU with AES!\n" ); - return false; - } - if ( sw_has_sha && !cpu_has_sha ) - { - printf( "The SW build requires a CPU with SHA!\n" ); - return false; - } -*/ - // Determine mining options use_sse2 = cpu_has_sse2 && sw_has_sse2 && algo_has_sse2; use_sse42 = cpu_has_sse42 && sw_has_sse42 && algo_has_sse42; @@ -3103,6 +3082,7 @@ static bool cpu_capability( bool display_only ) // if ( cpu_has_aarch64 ) printf( " AArch64"); // else // printf( " x86_64" ); + if ( use_neon ) printf( " NEON" ); if ( use_avx512 ) printf( " AVX512" ); else if ( use_avx2 ) printf( " AVX2" ); else if ( use_avx ) printf( " AVX" ); @@ -3112,7 +3092,6 @@ static bool cpu_capability( bool display_only ) else if ( use_aes ) printf( " AES" ); if ( use_sha512 ) printf( " SHA512" ); else if ( use_sha256 ) printf( " SHA256" ); - if ( use_neon ) printf( " NEON" ); } printf( "\n" ); diff --git a/simd-utils/simd-128.h b/simd-utils/simd-128.h index b963c24f..d4e623d7 100644 --- a/simd-utils/simd-128.h +++ b/simd-utils/simd-128.h @@ -38,7 +38,6 @@ // // __m128i -> v128_t // _mm_ -> v128_ -// mm128_ -> v128_ // // There is also new syntax to accomodate ARM's stricter type checking of // vector element size. They have no effect on x86_64. @@ -145,10 +144,8 @@ typedef union { v128_t v128; - __m128i m128; uint32_t u32[4]; -} __attribute__ ((aligned (16))) m128_ovly; -#define v128_ovly m128_ovly +} __attribute__ ((aligned (16))) v128_ovly; // use for immediate constants, use load1 for mem. #define v128_64 _mm_set1_epi64x @@ -168,7 +165,7 @@ typedef union // compiler to exploit new features to produce optimum code. // Currently only used internally and by Luffa. 
-static inline __m128i mm128_mov64_128( const uint64_t n ) +static inline __m128i v128_mov64( const uint64_t n ) { __m128i a; #if defined(__AVX__) @@ -178,10 +175,8 @@ static inline __m128i mm128_mov64_128( const uint64_t n ) #endif return a; } -//#define v128_mov64( u64 ) mm128_mov64_128( u64 ) - -static inline __m128i mm128_mov32_128( const uint32_t n ) +static inline __m128i v128_mov32( const uint32_t n ) { __m128i a; #if defined(__AVX__) @@ -235,7 +230,7 @@ static inline int v128_cmpeq0( v128_t v ) // Bitwise compare return 1 if all bits set. #define v128_cmpeq1(v) _mm_test_all_ones(v) -#define v128_one mm128_mov64_128(1) +#define v128_one v128_mov64(1) // ASM avoids the need to initialize return variable to avoid compiler warning. // Macro abstracts function parentheses to look like an identifier. @@ -327,7 +322,7 @@ static inline __m128i v128_neg1_fn() /* // Copy i32 to element c of dest and copy remaining elements from v. #define v128_put32( v, i32, c ) \ - v128_xim_32( v, mm128_mov32_128( i32 ), (c)<<4 ) + v128_xim_32( v, v128_mov32( i32 ), (c)<<4 ) */ @@ -463,13 +458,11 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n ) // Returns 2 or 4 bit integer mask from MSBit of 64 or 32 bit elements. // Effectively a sign test. -#define mm128_movmask_64( v ) \ +#define v128_movmask64( v ) \ _mm_movemask_pd( (__m128d)(v) ) -#define v128_movmask64 mm128_movmask_64 -#define mm128_movmask_32( v ) \ +#define v128_movmask32( v ) \ _mm_movemask_ps( (__m128)(v) ) -#define v128_movmask32 mm128_movmask_32 // // Bit rotations @@ -608,9 +601,6 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n ) #endif -// deprecated -#define mm128_rol_32 v128_rol32 // ror( v1 ^ v0, n ) #define v128_ror64xor( v1, v0, n ) v128_ror64( v128_xor( v1, v0 ), n ) @@ -689,7 +679,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n ) /* Not used, exists only for compatibility with NEON if ever needed.
#define v128_shufflev32( v, vmask ) \ - v128_shuffle32( v, mm128_movmask_32( vmask ) ) + v128_shuffle32( v, v128_movmask32( vmask ) ) */ #define v128_shuffle8 _mm_shuffle_epi8 @@ -734,15 +724,12 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n ) #define v128_bswap32( v ) \ _mm_shuffle_epi8( v, _mm_set_epi64x( 0x0c0d0e0f08090a0b, \ 0x0405060700010203 ) ) -// deprecated -#define mm128_bswap_32 v128_bswap32 - #define v128_bswap16( v ) \ _mm_shuffle_epi8( v, _mm_set_epi64x( 0x0e0f0c0d0a0b0809, \ 0x0607040502030001 ) // 8 byte qword * 8 qwords * 2 lanes = 128 bytes -#define mm128_block_bswap_64( d, s ) \ +#define v128_block_bswap64( d, s ) \ { \ v128_t ctl = _mm_set_epi64x( 0x08090a0b0c0d0e0f, 0x0001020304050607 ); \ casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \ @@ -754,8 +741,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n ) casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \ casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \ } -#define mm128_block_bswap64_512 mm128_block_bswap_64 -#define v128_block_bswap64_512 mm128_block_bswap_64 +#define v128_block_bswap64_512 v128_block_bswap64 #define v128_block_bswap64_1024( d, s ) \ { \ @@ -779,7 +765,7 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n ) } // 4 byte dword * 8 dwords * 4 lanes = 128 bytes -#define mm128_block_bswap_32( d, s ) \ +#define v128_block_bswap32( d, s ) \ { \ v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \ casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \ @@ -791,11 +777,10 @@ static inline void v128_memcpy( v128_t *dst, const v128_t *src, const int n ) casti_v128( d,6 ) = _mm_shuffle_epi8( casti_v128( s,6 ), ctl ); \ casti_v128( d,7 ) = _mm_shuffle_epi8( casti_v128( s,7 ), ctl ); \ } -#define mm128_block_bswap32_256 mm128_block_bswap_32 -#define v128_block_bswap32_256 mm128_block_bswap_32 +#define v128_block_bswap32_256 v128_block_bswap32 -#define mm128_block_bswap32_128( d, s ) \ +#define v128_block_bswap32_128( d, s ) \ { \ v128_t ctl = _mm_set_epi64x( 0x0c0d0e0f08090a0b, 0x0405060700010203 ); \ casti_v128( d,0 ) = _mm_shuffle_epi8( casti_v128( s,0 ), ctl ); \ @@ -840,7 +825,6 @@ static inline v128_t v128_bswap32( __m128i v ) v = _mm_shufflelo_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) ); return _mm_shufflehi_epi16( v, _MM_SHUFFLE( 2, 3, 0, 1 ) ); } -#define mm128_bswap_32 v128_bswap32 static inline v128_t v128_bswap16( __m128i v ) { @@ -849,7 +833,7 @@ static inline v128_t v128_bswap16( __m128i v ) #define v128_bswap128( v ) v128_qrev32( v128_bswap64( v ) ) -static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s ) +static inline void v128_block_bswap64( __m128i *d, const __m128i *s ) { d[0] = v128_bswap64( s[0] ); d[1] = v128_bswap64( s[1] ); @@ -860,9 +844,8 @@ static inline void mm128_block_bswap_64( __m128i *d, const __m128i *s ) d[6] = v128_bswap64( s[6] ); d[7] = v128_bswap64( s[7] ); } -#define v128_block_bswap64_512 mm128_block_bswap_64 -static inline void mm128_block_bswap64_1024( __m128i *d, const __m128i *s ) +static inline void v128_block_bswap64_1024( __m128i *d, const __m128i *s ) { d[ 0] = v128_bswap64( s[ 0] ); d[ 1] = v128_bswap64( s[ 1] ); @@ -882,7 +865,7 @@ static inline void mm128_block_bswap64_1024( __m128i *d, const __m128i *s ) d[15] = v128_bswap64( s[15] ); } -static inline void mm128_block_bswap_32( __m128i *d, const __m128i *s ) +static inline void v128_block_bswap32( __m128i *d, const __m128i *s ) { d[0] = 
diff --git a/simd-utils/simd-256.h b/simd-utils/simd-256.h
index f48bfd5a..f9f18175 100644
--- a/simd-utils/simd-256.h
+++ b/simd-utils/simd-256.h
@@ -73,10 +73,10 @@ typedef union
 
 #else
 
-#define mm256_bcast128lo_64( i64 ) mm256_bcast_m128( mm128_mov64_128( i64 ) )
+#define mm256_bcast128lo_64( i64 ) mm256_bcast_m128( v128_mov64( i64 ) )
 
 #define mm256_bcast128hi_64( i64 ) _mm256_permute4x64_epi64( \
-                   _mm256_castsi128_si256( mm128_mov64_128( i64 ) ), 0x11 )
+                   _mm256_castsi128_si256( v128_mov64( i64 ) ), 0x11 )
 
 #endif
 
diff --git a/simd-utils/simd-int.h b/simd-utils/simd-int.h
index 95650c59..035d527f 100644
--- a/simd-utils/simd-int.h
+++ b/simd-utils/simd-int.h
@@ -108,8 +108,12 @@ static inline uint32_t le162( const uint16_t u16 )
 #define rol32   __rold
 #define ror32   __rord
 
+/* these don't seem to work
 #elif defined(__aarch64__)
 
+// Documentation is vague: a ror instruction exists but is ambiguous. The docs
+// say it can operate on 32 or 64 bit registers, presumably depending on the
+// architecture, so it may only do 32 bit rotates on a 32 bit arch. Rarely used.
 static inline uint64_t ror64( uint64_t a, const int c )
 {
    uint64_t b;
@@ -125,6 +129,7 @@ static inline uint32_t ror32( uint32_t a, const int c )
    return b;
 }
 #define rol32( a, c ) ror32( a, 32-(c) )
+*/
 
 #else
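
With the inline-asm rotates commented out, aarch64 builds fall through to the
generic C path after the #else. A portable rotate of the usual shift-or form,
which compilers reliably turn into a single ror instruction, looks like this
(a sketch of the generic pattern, not necessarily the exact code in this file):

   #include <stdint.h>

   // Generic rotate right; callers must keep 0 < c < 64 to avoid UB.
   static inline uint64_t ror64( uint64_t a, const int c )
   {
      return ( a >> c ) | ( a << ( 64 - c ) );
   }
   #define rol64( a, c ) ror64( a, 64 - (c) )
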
diff --git a/simd-utils/simd-neon.h b/simd-utils/simd-neon.h
index 9c55b16c..6155f976 100644
--- a/simd-utils/simd-neon.h
+++ b/simd-utils/simd-neon.h
@@ -38,7 +38,9 @@
 #define v128u8_load( p )        vld1q_u8( (uint8_t*)(p) )
 #define v128u8_store( p, v )    vst1q_u8( (uint8_t*)(p), v )
 
-// load & set1 combined, doesn't work
+// load & set1 combined. If the source is already in a register this forces
+// a redundant reload, so don't use these; leave it to the compiler to
+// optimize. The same applies to vld1q_lane.
 #define v128_load1_64(p)        vld1q_dup_u64( (uint64_t*)(p) )
 #define v128_load1_32(p)        vld1q_dup_u32( (uint32_t*)(p) )
 #define v128_load1_16(p)        vld1q_dup_u16( (uint16_t*)(p) )
@@ -61,17 +63,13 @@
 #define v128_sub16              vsubq_u16
 #define v128_sub8               vsubq_u8
 
-// returns low half, u64 undocumented, may not exist.
-#define v128_mul64              vmulq_u64
+// returns the low half of the product
 #define v128_mul32              vmulq_u32
 #define v128_mul16              vmulq_u16
 
-// Widening multiply, align source elements with Intel
-static inline uint64x2_t v128_mulw32( uint32x4_t v1, uint32x4_t v0 )
-{
-   return vmull_u32( vget_low_u32( vcopyq_laneq_u32( v1, 1, v1, 2 ) ),
-                     vget_low_u32( vcopyq_laneq_u32( v0, 1, v0, 2 ) ) );
-}
+// Widening multiply, realign source elements from x86_64 to NEON.
+#define v128_mulw32( v1, v0 ) \
+   vmull_u32( vmovn_u64( v1 ), vmovn_u64( v0 ) )
 
 // compare
 #define v128_cmpeq64            vceqq_u64
@@ -315,7 +313,6 @@ static inline void v128_memset_zero( void *dst, const int n )
    memset( dst, 0, n*16 );
 }
 
-
 static inline void v128_memset( void *dst, const void *src, const int n )
 {
    for( int i = 0; i < n; i++ )
@@ -373,7 +370,7 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
                               ((uint8x16_t)(v)), c )
 
 
-// ror( v1 ^ v0, n )
+// ( v1 ^ v0 ) >>> n
 
 #if defined(__ARM_FEATURE_SHA3)
   #define v128_ror64xor( v1, v0, n )   vxarq_u64( v1, v0, n )
@@ -438,7 +435,6 @@ static inline void v128_memcpy( void *dst, const void *src, const int n )
 
 // Sub-vector shuffles sometimes mirror a bit rotation; the shuffle is faster.
 // Bit rotation already promotes faster widths, so usage is context sensitive.
-// preferred.
 
 // reverse elements in vector lanes
 #define v128_qrev32             vrev64q_u32
@@ -496,7 +492,7 @@ static inline uint16x8_t v128_shufll16( uint16x8_t v )
    casti_v128u32( dst,6 ) = v128_bswap32( casti_v128u32( src,6 ) ); \
    casti_v128u32( dst,7 ) = v128_bswap32( casti_v128u32( src,7 ) ); \
 }
-#define v128_block_bswap32_256( dst, src ) \
+#define v128_block_bswap32_256   v128_block_bswap32
 
 #define v128_block_bswap32_512( dst, src ) \
 { \
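
A note on the v128_mulw32 change: Intel's _mm_mul_epu32 multiplies the
even-indexed 32-bit elements (0 and 2) of each operand, widening to two 64-bit
products. vmovn_u64 narrows each 64-bit lane to its low 32 bits, which selects
exactly those elements, so the new macro keeps x86_64 semantics without the old
lane-copy shuffles. A scalar reference model (illustration only, not part of
the patch):

   #include <stdint.h>

   // What v128_mulw32 / _mm_mul_epu32 compute, element by element.
   static void mulw32_ref( uint64_t r[2], const uint32_t a[4],
                           const uint32_t b[4] )
   {
      r[0] = (uint64_t)a[0] * b[0];   // low dword of 64-bit lane 0
      r[1] = (uint64_t)a[2] * b[2];   // low dword of 64-bit lane 1
   }
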
diff --git a/sysinfos.c b/sysinfos.c
index a6c3c769..2caedd13 100644
--- a/sysinfos.c
+++ b/sysinfos.c
@@ -16,7 +16,7 @@
 #include "miner.h"
 #include "simd-utils.h"
 
-#if defined(__aarch64__) && !defined(__APPLE__)
+#if defined(__aarch64__)
 // for arm's "cpuid"
 #include <sys/auxv.h>
 #include <asm/hwcap.h>
@@ -309,14 +309,49 @@ static inline void cpuid( unsigned int leaf, unsigned int subleaf,
                           unsigned int output[4] )
 #endif
 }
 
-#elif defined(__aarch64__) && !defined(__APPLE__)
+#elif defined(__aarch64__)
 
 static inline void cpuid( unsigned int leaf, unsigned int subleaf,
                           unsigned int output[4] )
 {
+   output[0] = getauxval(AT_HWCAP);
+   output[1] = getauxval(AT_HWCAP2);
+
+/*
+#define has(CAP, hwcap)  !!((hwcap) & HWCAP_##CAP)
+#define pr(CAP, hwcap)   printf("%10s = %d\n", #CAP, has(CAP, hwcap))
+
+   unsigned long hwcaps = getauxval(AT_HWCAP);
+   printf("HWCAP = 0x%lx\n", hwcaps);
+
+   pr(FP, hwcaps);
+   pr(ASIMD, hwcaps);
+   pr(EVTSTRM, hwcaps);
+   pr(AES, hwcaps);
+   pr(PMULL, hwcaps);
+   pr(SHA1, hwcaps);
+   pr(SHA2, hwcaps);
+   pr(CRC32, hwcaps);
+   pr(ATOMICS, hwcaps);
+   pr(FPHP, hwcaps);
+   pr(ASIMDHP, hwcaps);
+   pr(CPUID, hwcaps);
+   pr(ASIMDRDM, hwcaps);
+   pr(JSCVT, hwcaps);
+   pr(FCMA, hwcaps);
+   pr(LRCPC, hwcaps);
+   pr(DCPOP, hwcaps);
+   pr(SHA3, hwcaps);
+   pr(SM3, hwcaps);
+   pr(SM4, hwcaps);
+   pr(ASIMDDP, hwcaps);
+   pr(SHA512, hwcaps);
+   pr(SVE, hwcaps);
+*/
 }
 
+
 #else
 
 #define cpuid(leaf, subleaf, out) out[0] = 0;
 
 #endif
@@ -482,7 +517,6 @@ static inline void cpu_getmodelid(char *outbuf, size_t maxsz)
 //   1  1  1  1 = AVX10 512 bit max (version 1 granite rapids)
 //   Other combinations are not defined.
 
-// No technical need for this, the code won't run if false.
 static inline bool cpu_arch_x86_64()
 {
 #if defined(__x86_64__)
@@ -515,11 +549,11 @@ static inline bool has_sse()
 static inline bool has_sse2()
 {
 #if defined(__x86_64__)
-    unsigned int cpu_info[4] = { 0 };
-    cpuid( CPU_INFO, 0, cpu_info );
-    return cpu_info[ EDX_Reg ] & SSE2_Flag;
+   unsigned int cpu_info[4] = { 0 };
+   cpuid( CPU_INFO, 0, cpu_info );
+   return cpu_info[ EDX_Reg ] & SSE2_Flag;
 #else
-    return false;
+   return false;
 #endif
 }
 
@@ -556,43 +590,18 @@ static inline bool has_sse42()
 #endif
 }
 
+/* doesn't work
 static inline bool has_neon()
 {
-#if defined(__aarch64__) && !defined(__APPLE__)
-   unsigned int cpu_info[4] = { 0 };
-   return cpu_info[0];
-#else
-   return false;
-#endif
-}
-
-static inline bool has_aes_ni()
-{
-#if defined(__x86_64__)
-   if ( has_sse2() )
-   {
-      unsigned int cpu_info[4] = { 0 };
-      cpuid( CPU_INFO, 0, cpu_info );
-      return cpu_info[ ECX_Reg ] & AES_NI_Flag;
-   }
-   return false;
-#elif defined(__aarch64__) && !defined(__APPLE__)
-   if ( has_neon() )
-   {
-#if defined(KERNEL_HWCAP_AES)
-      return true;
-#else
-      return false;
-#endif
-/*    unsigned int cpu_info[4] = { 0 };
-      cpuid( 0, 0, cpu_info );
-      return cpu_info[0] & HWCAP_AES;
-*/
-   }
-   return false;
+#if defined(__aarch64__)
+   unsigned int cpu_info[4] = { 0 };
+   cpuid( 0, 0, cpu_info );
+   return cpu_info[0] & HWCAP_NEON;
 #else
    return false;
 #endif
 }
+*/
 
 static inline bool has_avx()
 {
@@ -616,54 +625,25 @@ static inline bool has_avx2()
 #endif
 }
 
-static inline bool has_sha()
+static inline bool has_sve()
 {
-#if defined(__x86_64__)
-   if ( has_avx() )
-   {
-      unsigned int cpu_info[4] = { 0 };
-      cpuid( EXTENDED_FEATURES, 0, cpu_info );
-      return cpu_info[ EBX_Reg ] & SHA_Flag;
-   }
-   return false;
-#elif defined(__aarch64__) && !defined(__APPLE__)
-   if ( has_neon() )
-   {
-#if defined(KERNEL_HWCAP_SHA2)
-      return true;
-#else
-      return false;
-#endif
-/*    unsigned int cpu_info[4] = { 0 };
-      cpuid( 0, 0, cpu_info );
-      return cpu_info[0] & HWCAP_SHA2;
-*/
-   }
-   return false;
+#if defined(__aarch64__)
+   unsigned int cpu_info[4] = { 0 };
+   cpuid( 0, 0, cpu_info );
+   return cpu_info[0] & HWCAP_SVE;
 #else
-   return false;
+   return false;
 #endif
 }
 
-static inline bool has_sha512()
+static inline bool has_sve2()
 {
-#if defined(__x86_64__)
-   if ( has_avx2() )
-   {
-      unsigned int cpu_info[4] = { 0 };
-      cpuid( EXTENDED_FEATURES, 1, cpu_info );
-      return cpu_info[ EAX_Reg ] & SHA512_Flag;
-   }
-   return false;
-#elif defined(__aarch64__) && !defined(__APPLE__)
-   if ( has_neon() )
-   {
-      unsigned int cpu_info[4] = { 0 };
-      cpuid( 0, 0, cpu_info );
-      return cpu_info[0] & HWCAP_SHA3;
-   }
-   return false;
+#if defined(__aarch64__)
+   unsigned int cpu_info[4] = { 0 };
+   cpuid( 0, 0, cpu_info );
+   return cpu_info[1] & HWCAP2_SVE2;
 #else
-   return false;
+   return false;
 #endif
 }
 
@@ -723,6 +703,47 @@ static inline bool has_avx512()
 #endif
 }
 
+static inline bool has_vbmi()
+{
+#if defined(__x86_64__)
+   unsigned int cpu_info[4] = { 0 };
+   cpuid( EXTENDED_FEATURES, 0, cpu_info );
+   return cpu_info[ ECX_Reg ] & AVX512_VBMI_Flag;
+#else
+   return false;
+#endif
+}
+
+static inline bool has_vbmi2()
+{
+#if defined(__x86_64__)
+   unsigned int cpu_info[4] = { 0 };
+   cpuid( EXTENDED_FEATURES, 0, cpu_info );
+   return cpu_info[ ECX_Reg ] & AVX512_VBMI2_Flag;
+#else
+   return false;
+#endif
+}
+
+static inline bool has_aes()
+{
+#if defined(__x86_64__)
+   if ( has_sse2() )
+   {
+      unsigned int cpu_info[4] = { 0 };
+      cpuid( CPU_INFO, 0, cpu_info );
+      return cpu_info[ ECX_Reg ] & AES_NI_Flag;
+   }
+   return false;
+#elif defined(__aarch64__)
+   unsigned int cpu_info[4] = { 0 };
+   cpuid( 
0, 0, cpu_info ); + return cpu_info[0] & HWCAP_AES; +#else + return false; +#endif +} + static inline bool has_vaes() { #if defined(__x86_64__) @@ -738,25 +759,75 @@ static inline bool has_vaes() #endif } -static inline bool has_vbmi() +static inline bool has_sveaes() +{ +#if defined(__aarch64__) + unsigned int cpu_info[4] = { 0 }; + cpuid( 0, 0, cpu_info ); + return cpu_info[1] & HWCAP2_SVEAES; +#else + return false; +#endif +} + +static inline bool has_sha256() { #if defined(__x86_64__) - unsigned int cpu_info[4] = { 0 }; - cpuid( EXTENDED_FEATURES, 0, cpu_info ); - return cpu_info[ ECX_Reg ] & AVX512_VBMI_Flag; + if ( has_avx() ) + { + unsigned int cpu_info[4] = { 0 }; + cpuid( EXTENDED_FEATURES, 0, cpu_info ); + return cpu_info[ EBX_Reg ] & SHA_Flag; + } + return false; +#elif defined(__aarch64__) + unsigned int cpu_info[4] = { 0 }; + cpuid( 0, 0, cpu_info ); + return cpu_info[0] & HWCAP_SHA2; #else return false; #endif } -static inline bool has_vbmi2() +static inline bool has_sha512() { #if defined(__x86_64__) - unsigned int cpu_info[4] = { 0 }; - cpuid( EXTENDED_FEATURES, 0, cpu_info ); - return cpu_info[ ECX_Reg ] & AVX512_VBMI2_Flag; + if ( has_avx2() ) + { + unsigned int cpu_info[4] = { 0 }; + cpuid( EXTENDED_FEATURES, 1, cpu_info ); + return cpu_info[ EAX_Reg ] & SHA512_Flag; + } + return false; +#elif defined(__aarch64__) + unsigned int cpu_info[4] = { 0 }; + cpuid( 0, 0, cpu_info ); + return cpu_info[0] & HWCAP_SHA512; #else - return false; + return false; +#endif +} + +// Arm only +static inline bool has_sha3() +{ +#if defined(__aarch64__) + unsigned int cpu_info[4] = { 0 }; + cpuid( 0, 0, cpu_info ); + return cpu_info[0] & HWCAP_SHA3; +#else + return false; +#endif +} + +static inline bool has_svesha3() +{ +#if defined(__aarch64__) + unsigned int cpu_info[4] = { 0 }; + cpuid( 0, 0, cpu_info ); + return cpu_info[1] & HWCAP2_SVESHA3; +#else + return false; #endif } @@ -962,9 +1033,7 @@ static inline void cpu_brand_string( char* s ) #elif defined(__arm__) || defined(__aarch64__) - unsigned int cpu_info[4] = { 0 }; - cpuid( 0, 0, cpu_info ); - sprintf( s, "ARM 64 bit CPU, HWCAP %08x", cpu_info[0] ); + sprintf( s, "ARM 64 bit CPU" ); #else diff --git a/util.c b/util.c index c1161c32..b610f762 100644 --- a/util.c +++ b/util.c @@ -2239,7 +2239,7 @@ static bool stratum_benchdata(json_t *result, json_t *params, int thr_id) #endif cpu_bestfeature(arch, 16); - if (has_aes_ni()) strcat(arch, " NI"); + if (has_aes()) strcat(arch, " NI"); cpu_getmodelid(vendorid, 32); cpu_getname(cpuname, 80);
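
For reference, the getauxval-based detection these sysinfos.c changes rely on
can be exercised standalone, in the spirit of the commented-out pr() block
above. A minimal sketch, Linux/aarch64 only (illustration, not part of the
patch):

   #include <stdio.h>
   #include <sys/auxv.h>    // getauxval, AT_HWCAP, AT_HWCAP2
   #include <asm/hwcap.h>   // HWCAP_* and HWCAP2_* bit definitions

   int main( void )
   {
      unsigned long hwcap  = getauxval( AT_HWCAP );
      unsigned long hwcap2 = getauxval( AT_HWCAP2 );

      printf( "AES    %d\n", !!( hwcap  & HWCAP_AES ) );
      printf( "SHA2   %d\n", !!( hwcap  & HWCAP_SHA2 ) );
      printf( "SHA512 %d\n", !!( hwcap  & HWCAP_SHA512 ) );
      printf( "SHA3   %d\n", !!( hwcap  & HWCAP_SHA3 ) );
      printf( "SVE2   %d\n", !!( hwcap2 & HWCAP2_SVE2 ) );
      return 0;
   }
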