diff --git a/Makefile.am b/Makefile.am index 05062cab..e91c49e5 100644 --- a/Makefile.am +++ b/Makefile.am @@ -22,7 +22,6 @@ cpuminer_SOURCES = \ api.c \ sysinfos.c \ algo-gate-api.c\ - crypto/blake2s.c \ crypto/oaes_lib.c \ crypto/c_keccak.c \ crypto/c_groestl.c \ @@ -45,6 +44,7 @@ cpuminer_SOURCES = \ algo/blake/blake-4way.c \ algo/blake/sph_blake2b.c \ algo/blake/blake2b.c \ + algo/blake/sph-blake2s.c \ algo/blake/blake2s.c \ algo/blake/blakecoin-gate.c \ algo/blake/mod_blakecoin.c \ @@ -75,7 +75,9 @@ cpuminer_SOURCES = \ algo/groestl/aes_ni/hash-groestl256.c \ algo/fugue/sph_fugue.c \ algo/hamsi/sph_hamsi.c \ - algo/haval/haval.c\ + algo/hamsi/hamsi-hash-4way.c \ + algo/haval/haval.c \ + algo/haval/haval-hash-4way.c \ algo/heavy/sph_hefty1.c \ algo/heavy/heavy.c \ algo/heavy/bastion.c \ @@ -122,6 +124,9 @@ cpuminer_SOURCES = \ algo/quark/quark-gate.c \ algo/quark/quark.c \ algo/quark/quark-4way.c \ + algo/quark/anime-gate.c \ + algo/quark/anime.c \ + algo/quark/anime-4way.c \ algo/qubit/qubit.c \ algo/qubit/deep.c \ algo/ripemd/sph_ripemd.c \ @@ -129,6 +134,7 @@ cpuminer_SOURCES = \ algo/scryptjane/scrypt-jane.c \ algo/sha/sph_sha2.c \ algo/sha/sph_sha2big.c \ + algo/sha/sha2-hash-4way.c \ algo/sha/sha2.c \ algo/sha/sha256t.c \ algo/shabal/sph_shabal.c \ @@ -210,6 +216,9 @@ cpuminer_SOURCES = \ algo/x17/xevan-gate.c \ algo/x17/xevan.c \ algo/x17/xevan-4way.c \ + algo/x17/x16r-gate.c \ + algo/x17/x16r.c \ + algo/x17/x16r-4way.c \ algo/x17/hmq1725.c \ algo/yescrypt/yescrypt.c \ algo/yescrypt/sha256_Y.c \ diff --git a/README.txt b/README.txt index dc6cf098..ac3a4842 100644 --- a/README.txt +++ b/README.txt @@ -19,19 +19,9 @@ Users are recommended to use an unoptimized miner such as cpuminer-multi. Exe name Compile flags Arch name -cpuminer-sse2.exe "-march=core2" Core2 -cpuminer-sse42.exe "-march=corei7" Nehalem +cpuminer-sse2.exe "-march=core2" Core2, Nehalem cpuminer-aes-sse42.exe "-maes -msse4.2" Westmere -cpuminer-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge +cpuminer-aes-avx.exe "-march=corei7-avx" Sandybridge, Ivybridge cpuminer-avx2.exe "-march=core-avx2" Haswell... -cpuminer-avx-sha "-march=corei7-avx -msha" Ryzen... -cpuminer-4way.exe "-march=core-avx2 -DFOUR_WAY" same as avx2 -cpuminer-4way-sha.exe "-march=core-avx2 -msha -DFOUR_WAY" same as avx2-sha - -4way requires a CPU with AES and AVX2. It is still under development and -only a few algos are supported. See change log in RELEASE_NOTES in source -package for supported algos. - -Ryzen CPus perform better with AVX than AVX2 therefore an avx-sha build -is provided. Four way still uses AVX2. +cpuminer-avx2-sha.exe "-march=core-avx2 -msha" Ryzen diff --git a/RELEASE_NOTES b/RELEASE_NOTES index bc2010d0..b6742ef9 100644 --- a/RELEASE_NOTES +++ b/RELEASE_NOTES @@ -92,12 +92,6 @@ SPH may give slightly better performance on algos that use sha256 when using openssl 1.0.1 or older. Openssl 1.0.2 adds AVX2 and 1.1 adds SHA and perform better than SPH. --DFOUR_WAY - -4 way will give much better performance on supported algos with CPUs -that have AVX2 and should only be used on CPUs with AVX2. 4 way algo -support will be added incrementally, see change log below for supported algos. - Start mining. ./cpuminer -a algo -o url -u username -p password @@ -149,9 +143,9 @@ cpuminer.exe -a algo -o url -u user -p password The following tips may be useful for older AMD CPUs. -AMD CPUs older than Piledriver, including Athlon x2 and Phenom II x4, are not -supported by cpuminer-opt due to an incompatible implementation of SSE2 on -these CPUs. 
Some algos may crash the miner with an invalid instruction. +AMD CPUs older than Steamroller, including Athlon x2 and Phenom II x4, are +not supported by cpuminer-opt due to an incompatible implementation of SSE2 +on these CPUs. Some algos may crash the miner with an invalid instruction. Users are recommended to use an unoptimized miner such as cpuminer-multi. Some users with AMD CPUs without AES_NI have reported problems compiling @@ -165,6 +159,14 @@ Support for even older x86_64 without AES_NI or SSE2 is not available. Change Log ---------- +v3.7.11 + +4way no longer a separate feature, included in AVX2. +Added x16r algo for Ravencoin, anime algo for Animecoin. +More 4way optimizations for X13 and up. +Tweaked CPU affinity to better support more than 64 CPUs. +Fixed compile problem on some old AMD CPUs. + v3.7.10 4way optimizations for lyra2rev2, lyra2h, quark, timetravel8, timetravel10 diff --git a/algo-gate-api.c b/algo-gate-api.c index 7452e47e..deb7dff0 100644 --- a/algo-gate-api.c +++ b/algo-gate-api.c @@ -16,7 +16,7 @@ #include #include #include -#include "miner.h" +//#include "miner.h" #include "algo-gate-api.h" // Define null and standard functions. @@ -155,6 +155,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate ) switch (algo) { + case ALGO_ANIME: register_anime_algo ( gate ); break; case ALGO_ARGON2: register_argon2_algo ( gate ); break; case ALGO_AXIOM: register_axiom_algo ( gate ); break; case ALGO_BASTION: register_bastion_algo ( gate ); break; @@ -216,6 +217,7 @@ bool register_algo_gate( int algo, algo_gate_t *gate ) case ALGO_X13SM3: register_x13sm3_algo ( gate ); break; case ALGO_X14: register_x14_algo ( gate ); break; case ALGO_X15: register_x15_algo ( gate ); break; + case ALGO_X16R: register_x16r_algo ( gate ); break; case ALGO_X17: register_x17_algo ( gate ); break; case ALGO_XEVAN: register_xevan_algo ( gate ); break; case ALGO_YESCRYPT: register_yescrypt_algo ( gate ); break; diff --git a/algo-gate-api.h b/algo-gate-api.h index e8cc5363..65e32e9d 100644 --- a/algo-gate-api.h +++ b/algo-gate-api.h @@ -1,7 +1,6 @@ #include #include #include - #include "miner.h" ///////////////////////////// @@ -91,7 +90,7 @@ typedef uint32_t set_t; #define AVX_OPT 4 #define AVX2_OPT 8 #define SHA_OPT 0x10 -#define FOUR_WAY_OPT 0x20 +//#define FOUR_WAY_OPT 0x20 // return set containing all elements from sets a & b inline set_t set_union ( set_t a, set_t b ) { return a | b; } @@ -213,7 +212,8 @@ int64_t get_max64_0x3fffffLL(); int64_t get_max64_0x1ffff(); int64_t get_max64_0xffffLL(); -void std_set_target ( struct work *work, double job_diff ); +void std_set_target( struct work *work, double job_diff ); +void alt_set_target( struct work* work, double job_diff ); void scrypt_set_target( struct work *work, double job_diff ); bool std_le_work_decode( const json_t *val, struct work *work ); diff --git a/algo/blake/blake-4way.c b/algo/blake/blake-4way.c index 34f0e929..b8971666 100644 --- a/algo/blake/blake-4way.c +++ b/algo/blake/blake-4way.c @@ -1,6 +1,6 @@ #include "blake-gate.h" -#if defined (__AVX__) +#if defined (BLAKE_4WAY) #include "blake-hash-4way.h" #include diff --git a/algo/blake/blake-gate.c b/algo/blake/blake-gate.c index b050ff47..4dc518a6 100644 --- a/algo/blake/blake-gate.c +++ b/algo/blake/blake-gate.c @@ -7,6 +7,7 @@ int64_t blake_get_max64 () bool register_blake_algo( algo_gate_t* gate ) { + gate->optimizations = AVX2_OPT; gate->get_max64 = (void*)&blake_get_max64; //#if defined (__AVX2__) && defined (FOUR_WAY) // gate->optimizations = SSE2_OPT | AVX_OPT | 
AVX2_OPT; @@ -14,7 +15,6 @@ bool register_blake_algo( algo_gate_t* gate ) // gate->hash = (void*)&blakehash_8way; #if defined(BLAKE_4WAY) four_way_not_tested(); - gate->optimizations = FOUR_WAY_OPT; gate->scanhash = (void*)&scanhash_blake_4way; gate->hash = (void*)&blakehash_4way; #else diff --git a/algo/blake/blake-gate.h b/algo/blake/blake-gate.h index 431e5556..ec457961 100644 --- a/algo/blake/blake-gate.h +++ b/algo/blake/blake-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(FOUR_WAY) && defined(__AVX__) +#if defined(__AVX2__) #define BLAKE_4WAY #endif diff --git a/algo/blake/blake-hash-4way.c b/algo/blake/blake-hash-4way.c index fab394a5..e7b424a4 100644 --- a/algo/blake/blake-hash-4way.c +++ b/algo/blake/blake-hash-4way.c @@ -78,6 +78,8 @@ static const sph_u64 IV512[8] = { #if SPH_COMPACT_BLAKE_32 || SPH_COMPACT_BLAKE_64 +// Blake-256 4 & 8 way, Blake-512 4way + static const unsigned sigma[16][16] = { { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }, { 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3 }, @@ -273,6 +275,8 @@ static const unsigned sigma[16][16] = { #define Mx_(n) Mx__(n) #define Mx__(n) M ## n +// Blake-256 4 & 8 way + #define CSx(r, i) CSx_(Z ## r ## i) #define CSx_(n) CSx__(n) #define CSx__(n) CS ## n @@ -311,6 +315,8 @@ static const sph_u32 CS[16] = { #if defined(__AVX2__) +// Blake-512 4 way + #define CBx(r, i) CBx_(Z ## r ## i) #define CBx_(n) CBx__(n) #define CBx__(n) CB ## n @@ -401,6 +407,35 @@ do { \ #if defined (__AVX2__) +// BLAKE256 8 WAY + +#define GS_8WAY( m0, m1, c0, c1, a, b, c, d ) \ +do { \ + a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \ + _mm256_set1_epi32( c1 ), m0 ), b ), a ); \ + d = mm256_rotr_32( _mm256_xor_si256( d, a ), 16 ); \ + c = _mm256_add_epi32( c, d ); \ + b = mm256_rotr_32( _mm256_xor_si256( b, c ), 12 ); \ + a = _mm256_add_epi32( _mm256_add_epi32( _mm256_xor_si256( \ + _mm256_set1_epi32( c0 ), m1 ), b ), a ); \ + d = mm256_rotr_32( _mm256_xor_si256( d, a ), 8 ); \ + c = _mm256_add_epi32( c, d ); \ + b = mm256_rotr_32( _mm256_xor_si256( b, c ), 7 ); \ +} while (0) + +#define ROUND_S_8WAY(r) do { \ + GS_8WAY(Mx(r, 0), Mx(r, 1), CSx(r, 0), CSx(r, 1), V0, V4, V8, VC); \ + GS_8WAY(Mx(r, 2), Mx(r, 3), CSx(r, 2), CSx(r, 3), V1, V5, V9, VD); \ + GS_8WAY(Mx(r, 4), Mx(r, 5), CSx(r, 4), CSx(r, 5), V2, V6, VA, VE); \ + GS_8WAY(Mx(r, 6), Mx(r, 7), CSx(r, 6), CSx(r, 7), V3, V7, VB, VF); \ + GS_8WAY(Mx(r, 8), Mx(r, 9), CSx(r, 8), CSx(r, 9), V0, V5, VA, VF); \ + GS_8WAY(Mx(r, A), Mx(r, B), CSx(r, A), CSx(r, B), V1, V6, VB, VC); \ + GS_8WAY(Mx(r, C), Mx(r, D), CSx(r, C), CSx(r, D), V2, V7, V8, VD); \ + GS_8WAY(Mx(r, E), Mx(r, F), CSx(r, E), CSx(r, F), V3, V4, V9, VE); \ +} while (0) + +// Blake-512 4 way + #define GB_4WAY(m0, m1, c0, c1, a, b, c, d) do { \ a = _mm256_add_epi64( _mm256_add_epi64( _mm256_xor_si256( \ _mm256_set_epi64x( c1, c1, c1, c1 ), m0 ), b ), a ); \ @@ -627,6 +662,125 @@ do { \ #if defined (__AVX2__) +// Blake-256 8 way + +#define DECL_STATE32_8WAY \ + __m256i H0, H1, H2, H3, H4, H5, H6, H7; \ + __m256i S0, S1, S2, S3; \ + sph_u32 T0, T1; + +#define READ_STATE32_8WAY(state) \ +do { \ + H0 = (state)->H[0]; \ + H1 = (state)->H[1]; \ + H2 = (state)->H[2]; \ + H3 = (state)->H[3]; \ + H4 = (state)->H[4]; \ + H5 = (state)->H[5]; \ + H6 = (state)->H[6]; \ + H7 = (state)->H[7]; \ + S0 = (state)->S[0]; \ + S1 = (state)->S[1]; \ + S2 = (state)->S[2]; \ + S3 = (state)->S[3]; \ + T0 = (state)->T0; \ + T1 = (state)->T1; \ +} while (0) + +#define WRITE_STATE32_8WAY(state) \ +do { \ + (state)->H[0] 
= H0; \ + (state)->H[1] = H1; \ + (state)->H[2] = H2; \ + (state)->H[3] = H3; \ + (state)->H[4] = H4; \ + (state)->H[5] = H5; \ + (state)->H[6] = H6; \ + (state)->H[7] = H7; \ + (state)->S[0] = S0; \ + (state)->S[1] = S1; \ + (state)->S[2] = S2; \ + (state)->S[3] = S3; \ + (state)->T0 = T0; \ + (state)->T1 = T1; \ +} while (0) + +#define COMPRESS32_8WAY( rounds ) \ +do { \ + __m256i M0, M1, M2, M3, M4, M5, M6, M7; \ + __m256i M8, M9, MA, MB, MC, MD, ME, MF; \ + __m256i V0, V1, V2, V3, V4, V5, V6, V7; \ + __m256i V8, V9, VA, VB, VC, VD, VE, VF; \ + V0 = H0; \ + V1 = H1; \ + V2 = H2; \ + V3 = H3; \ + V4 = H4; \ + V5 = H5; \ + V6 = H6; \ + V7 = H7; \ + V8 = _mm256_xor_si256( S0, _mm256_set1_epi32( CS0 ) ); \ + V9 = _mm256_xor_si256( S1, _mm256_set1_epi32( CS1 ) ); \ + VA = _mm256_xor_si256( S2, _mm256_set1_epi32( CS2 ) ); \ + VB = _mm256_xor_si256( S3, _mm256_set1_epi32( CS3 ) ); \ + VC = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS4 ) ); \ + VD = _mm256_xor_si256( _mm256_set1_epi32( T0 ), _mm256_set1_epi32( CS5 ) ); \ + VE = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS6 ) ); \ + VF = _mm256_xor_si256( _mm256_set1_epi32( T1 ), _mm256_set1_epi32( CS7 ) ); \ + M0 = mm256_byteswap_32( * buf ); \ + M1 = mm256_byteswap_32( *(buf+1) ); \ + M2 = mm256_byteswap_32( *(buf+2) ); \ + M3 = mm256_byteswap_32( *(buf+3) ); \ + M4 = mm256_byteswap_32( *(buf+4) ); \ + M5 = mm256_byteswap_32( *(buf+5) ); \ + M6 = mm256_byteswap_32( *(buf+6) ); \ + M7 = mm256_byteswap_32( *(buf+7) ); \ + M8 = mm256_byteswap_32( *(buf+8) ); \ + M9 = mm256_byteswap_32( *(buf+9) ); \ + MA = mm256_byteswap_32( *(buf+10) ); \ + MB = mm256_byteswap_32( *(buf+11) ); \ + MC = mm256_byteswap_32( *(buf+12) ); \ + MD = mm256_byteswap_32( *(buf+13) ); \ + ME = mm256_byteswap_32( *(buf+14) ); \ + MF = mm256_byteswap_32( *(buf+15) ); \ + ROUND_S_8WAY(0); \ + ROUND_S_8WAY(1); \ + ROUND_S_8WAY(2); \ + ROUND_S_8WAY(3); \ + ROUND_S_8WAY(4); \ + ROUND_S_8WAY(5); \ + ROUND_S_8WAY(6); \ + ROUND_S_8WAY(7); \ + if (rounds == 14) \ + { \ + ROUND_S_8WAY(8); \ + ROUND_S_8WAY(9); \ + ROUND_S_8WAY(0); \ + ROUND_S_8WAY(1); \ + ROUND_S_8WAY(2); \ + ROUND_S_8WAY(3); \ + } \ + H0 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( V8, V0 ), \ + S0 ), H0 ); \ + H1 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( V9, V1 ), \ + S1 ), H1 ); \ + H2 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VA, V2 ), \ + S2 ), H2 ); \ + H3 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VB, V3 ), \ + S3 ), H3 ); \ + H4 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VC, V4 ), \ + S0 ), H4 ); \ + H5 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VD, V5 ), \ + S1 ), H5 ); \ + H6 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VE, V6 ), \ + S2 ), H6 ); \ + H7 = _mm256_xor_si256( _mm256_xor_si256( _mm256_xor_si256( VF, V7 ), \ + S3 ), H7 ); \ +} while (0) + + +// Blake-512 4 way + #define DECL_STATE64_4WAY \ __m256i H0, H1, H2, H3, H4, H5, H6, H7; \ __m256i S0, S1, S2, S3; \ @@ -813,7 +967,7 @@ do { \ #endif -static const sph_u32 salt_zero_small[4] = { 0, 0, 0, 0 }; +static const sph_u32 salt_zero_4way_small[4] = { 0, 0, 0, 0 }; static void blake32_4way_init( blake_4way_small_context *sc, const sph_u32 *iv, @@ -934,6 +1088,129 @@ blake32_4way_close( blake_4way_small_context *sc, unsigned ub, unsigned n, #if defined (__AVX2__) +// Blake-256 8 way + +static const sph_u32 salt_zero_8way_small[8] = { 0, 0, 0, 0, 0, 0, 0, 0 }; + +static void +blake32_8way_init( blake_8way_small_context *sc, 
const sph_u32 *iv, + const sph_u32 *salt, int rounds ) +{ + int i; + for ( i = 0; i < 8; i++ ) + sc->H[i] = _mm256_set1_epi32( iv[i] ); + for ( i = 0; i < 4; i++ ) + sc->S[i] = _mm256_set1_epi32( salt[i] ); + sc->T0 = sc->T1 = 0; + sc->ptr = 0; + sc->rounds = rounds; +} + +static void +blake32_8way( blake_8way_small_context *sc, const void *data, size_t len ) +{ + __m256i *vdata = (__m256i*)data; + __m256i *buf; + size_t ptr; + const int buf_size = 64; // number of elements, sizeof/4 + DECL_STATE32_8WAY + + buf = sc->buf; + ptr = sc->ptr; + if ( len < buf_size - ptr ) + { + memcpy_256( buf + (ptr>>2), vdata, len>>2 ); + ptr += len; + sc->ptr = ptr; + return; + } + + READ_STATE32_8WAY(sc); + while ( len > 0 ) + { + size_t clen; + + clen = buf_size - ptr; + if (clen > len) + clen = len; + memcpy_256( buf + (ptr>>2), vdata, clen>>2 ); + ptr += clen; + vdata += (clen>>2); + len -= clen; + if ( ptr == buf_size ) + { + if ( ( T0 = SPH_T32(T0 + 512) ) < 512 ) + T1 = SPH_T32(T1 + 1); + COMPRESS32_8WAY( sc->rounds ); + ptr = 0; + } + } + WRITE_STATE32_8WAY(sc); + sc->ptr = ptr; +} + +static void +blake32_8way_close( blake_8way_small_context *sc, unsigned ub, unsigned n, + void *dst, size_t out_size_w32 ) +{ + union { + __m256i buf[16]; + sph_u32 dummy; + } u; + size_t ptr, k; + unsigned bit_len; + sph_u32 th, tl; + __m256i *out; + + ptr = sc->ptr; + bit_len = ((unsigned)ptr << 3); + u.buf[ptr>>2] = _mm256_set1_epi32( 0x80 ); + tl = sc->T0 + bit_len; + th = sc->T1; + + if ( ptr == 0 ) + { + sc->T0 = SPH_C32(0xFFFFFE00UL); + sc->T1 = SPH_C32(0xFFFFFFFFUL); + } + else if ( sc->T0 == 0 ) + { + sc->T0 = SPH_C32(0xFFFFFE00UL) + bit_len; + sc->T1 = SPH_T32(sc->T1 - 1); + } + else + sc->T0 -= 512 - bit_len; + + if ( ptr <= 52 ) + { + memset_zero_256( u.buf + (ptr>>2) + 1, (52 - ptr) >> 2 ); + if (out_size_w32 == 8) + u.buf[52>>2] = _mm256_or_si256( u.buf[52>>2], + _mm256_set1_epi32( 0x01000000UL ) ); + *(u.buf+(56>>2)) = mm256_byteswap_32( _mm256_set1_epi32( th ) ); + *(u.buf+(60>>2)) = mm256_byteswap_32( _mm256_set1_epi32( tl ) ); + blake32_8way( sc, u.buf + (ptr>>2), 64 - ptr ); + } + else + { + memset_zero_256( u.buf + (ptr>>2) + 1, (60-ptr) >> 2 ); + blake32_8way( sc, u.buf + (ptr>>2), 64 - ptr ); + sc->T0 = SPH_C32(0xFFFFFE00UL); + sc->T1 = SPH_C32(0xFFFFFFFFUL); + memset_zero_256( u.buf, 56>>2 ); + if (out_size_w32 == 8) + u.buf[52>>2] = _mm256_set1_epi32( 0x01000000UL ); + *(u.buf+(56>>2)) = mm256_byteswap_32( _mm256_set1_epi32( th ) ); + *(u.buf+(60>>2)) = mm256_byteswap_32( _mm256_set1_epi32( tl ) ); + blake32_8way( sc, u.buf, 64 ); + } + out = (__m256i*)dst; + for ( k = 0; k < out_size_w32; k++ ) + out[k] = mm256_byteswap_32( sc->H[k] ); +} + +// Blake-512 4 way + static const sph_u64 salt_zero_big[4] = { 0, 0, 0, 0 }; static void @@ -1065,11 +1342,13 @@ blake64_4way_close( blake_4way_big_context *sc, #endif +// Blake-256 4 way & 8 way + // default 14 rounds, backward copatibility void blake256_4way_init(void *cc) { - blake32_4way_init( cc, IV256, salt_zero_small, 14 ); + blake32_4way_init( cc, IV256, salt_zero_4way_small, 14 ); } void @@ -1084,10 +1363,31 @@ blake256_4way_close(void *cc, void *dst) blake32_4way_close(cc, 0, 0, dst, 8); } -// 14 rounds blake, decred +#if defined(__AVX2__) +void +blake256_8way_init(void *cc) +{ + blake32_8way_init( cc, IV256, salt_zero_8way_small, 14 ); +} + +void +blake256_8way(void *cc, const void *data, size_t len) +{ + blake32_8way(cc, data, len); +} + +void +blake256_8way_close(void *cc, void *dst) +{ + blake32_8way_close(cc, 0, 0, dst, 8); +} + +#endif + +// 
14 rounds Blake, Decred void blake256r14_4way_init(void *cc) { - blake32_4way_init( cc, IV256, salt_zero_small, 14 ); + blake32_4way_init( cc, IV256, salt_zero_4way_small, 14 ); } void @@ -1102,10 +1402,31 @@ blake256r14_4way_close(void *cc, void *dst) blake32_4way_close(cc, 0, 0, dst, 8); } -// 8 rounds blakecoin, vanilla +#if defined(__AVX2__) + +void blake256r14_8way_init(void *cc) +{ + blake32_8way_init( cc, IV256, salt_zero_8way_small, 14 ); +} + +void +blake256r14_8way(void *cc, const void *data, size_t len) +{ + blake32_8way(cc, data, len); +} + +void +blake256r14_8way_close(void *cc, void *dst) +{ + blake32_8way_close(cc, 0, 0, dst, 8); +} + +#endif + +// 8 rounds Blakecoin, Vanilla void blake256r8_4way_init(void *cc) { - blake32_4way_init( cc, IV256, salt_zero_small, 8 ); + blake32_4way_init( cc, IV256, salt_zero_4way_small, 8 ); } void @@ -1122,6 +1443,29 @@ blake256r8_4way_close(void *cc, void *dst) #if defined (__AVX2__) +void blake256r8_8way_init(void *cc) +{ + blake32_8way_init( cc, IV256, salt_zero_8way_small, 8 ); +} + +void +blake256r8_8way(void *cc, const void *data, size_t len) +{ + blake32_8way(cc, data, len); +} + +void +blake256r8_8way_close(void *cc, void *dst) +{ + blake32_8way_close(cc, 0, 0, dst, 8); +} + +#endif + +// Blake-512 4 way + +#if defined (__AVX2__) + void blake512_4way_init(void *cc) { diff --git a/algo/blake/blake-hash-4way.h b/algo/blake/blake-hash-4way.h index 003b39ab..48ac65d7 100644 --- a/algo/blake/blake-hash-4way.h +++ b/algo/blake/blake-hash-4way.h @@ -51,6 +51,11 @@ extern "C"{ #define SPH_SIZE_blake512 512 +// With AVX only Blake-256 4 way is available. +// With AVX2 Blake-256 8way & Blake-512 4 way are also available. + +// Blake-256 4 way + typedef struct { __m128i buf[16] __attribute__ ((aligned (64))); __m128i H[8]; @@ -80,6 +85,37 @@ void blake256r8_4way_close(void *cc, void *dst); #ifdef __AVX2__ +// Blake-256 8 way + +typedef struct { + __m256i buf[16] __attribute__ ((aligned (64))); + __m256i H[8]; + __m256i S[4]; + size_t ptr; + sph_u32 T0, T1; + int rounds; // 14 for blake, 8 for blakecoin & vanilla +} blake_8way_small_context; + +// Default 14 rounds +typedef blake_8way_small_context blake256_8way_context; +void blake256_8way_init(void *cc); +void blake256_8way(void *cc, const void *data, size_t len); +void blake256_8way_close(void *cc, void *dst); + +// 14 rounds, blake, decred +typedef blake_8way_small_context blake256r14_8way_context; +void blake256r14_8way_init(void *cc); +void blake256r14_8way(void *cc, const void *data, size_t len); +void blake256r14_8way_close(void *cc, void *dst); + +// 8 rounds, blakecoin, vanilla +typedef blake_8way_small_context blake256r8_8way_context; +void blake256r8_8way_init(void *cc); +void blake256r8_8way(void *cc, const void *data, size_t len); +void blake256r8_8way_close(void *cc, void *dst); + +// Blake-512 4 way + typedef struct { __m256i buf[16] __attribute__ ((aligned (64))); __m256i H[8]; diff --git a/algo/blake/blake2s.c b/algo/blake/blake2s.c index 1a66be65..05cdc376 100644 --- a/algo/blake/blake2s.c +++ b/algo/blake/blake2s.c @@ -3,7 +3,7 @@ #include #include -#include "crypto/blake2s.h" +#include "sph-blake2s.h" static __thread blake2s_state s_midstate; static __thread blake2s_state s_ctx; diff --git a/algo/blake/blakecoin-4way.c b/algo/blake/blakecoin-4way.c index 0abd85f2..eb3930a9 100644 --- a/algo/blake/blakecoin-4way.c +++ b/algo/blake/blakecoin-4way.c @@ -1,6 +1,6 @@ #include "blakecoin-gate.h" -#if defined (__AVX__) +#if defined (BLAKECOIN_4WAY) #include "blake-hash-4way.h" 
#include diff --git a/algo/blake/blakecoin-gate.c b/algo/blake/blakecoin-gate.c index 9b83c58e..8ad514e2 100644 --- a/algo/blake/blakecoin-gate.c +++ b/algo/blake/blakecoin-gate.c @@ -15,13 +15,13 @@ void bc4w_get_new_work( struct work* work, struct work* g_work, int thr_id, uint32_t *end_nonce_ptr, bool clean_job ) { uint32_t *nonceptr = algo_gate.get_nonceptr( work->data ); -// + // if ( have_stratum && ( *nonceptr >= *end_nonce_ptr ) ) // algo_gate.stratum_gen_work( &stratum, g_work ); if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size ) || ( *nonceptr >= *end_nonce_ptr ) - || ( work->job_id != g_work->job_id ) && clean_job ) + || ( ( work->job_id != g_work->job_id ) && clean_job ) ) /* if ( memcmp( work->data, g_work->data, algo_gate.work_cmp_size ) && ( clean_job || ( *nonceptr >= *end_nonce_ptr ) @@ -47,7 +47,6 @@ bool register_vanilla_algo( algo_gate_t* gate ) { #if defined(BLAKECOIN_4WAY) // four_way_not_tested(); - gate->optimizations = FOUR_WAY_OPT; gate->scanhash = (void*)&scanhash_blakecoin_4way; gate->hash = (void*)&blakecoin_4way_hash; // gate->get_new_work = (void*)&bc4w_get_new_work; @@ -57,7 +56,7 @@ bool register_vanilla_algo( algo_gate_t* gate ) gate->hash = (void*)&blakecoinhash; // blakecoin_init( &blake_init_ctx ); #endif - gate->optimizations = AVX2_OPT | FOUR_WAY_OPT; + gate->optimizations = AVX2_OPT; gate->get_max64 = (void*)&blakecoin_get_max64; return true; } diff --git a/algo/blake/blakecoin-gate.h b/algo/blake/blakecoin-gate.h index f7c7b4f4..54b177a6 100644 --- a/algo/blake/blakecoin-gate.h +++ b/algo/blake/blakecoin-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(FOUR_WAY) && defined(__AVX__) +#if defined(__AVX2__) #define BLAKECOIN_4WAY #endif diff --git a/algo/blake/decred-gate.c b/algo/blake/decred-gate.c index acdd9449..4c79b41b 100644 --- a/algo/blake/decred-gate.c +++ b/algo/blake/decred-gate.c @@ -145,15 +145,13 @@ bool register_decred_algo( algo_gate_t* gate ) { #if defined(DECRED_4WAY) four_way_not_tested(); - gate->optimizations = FOUR_WAY_OPT; gate->scanhash = (void*)&scanhash_decred_4way; gate->hash = (void*)&decred_hash_4way; #else - gate->optimizations = SSE2_OPT; gate->scanhash = (void*)&scanhash_decred; gate->hash = (void*)&decred_hash; #endif - + gate->optimizations = AVX2_OPT; gate->get_nonceptr = (void*)&decred_get_nonceptr; gate->get_max64 = (void*)&get_max64_0x3fffffLL; gate->display_extra_data = (void*)&decred_decode_extradata; diff --git a/algo/blake/decred-gate.h b/algo/blake/decred-gate.h index c3ea349e..62b08c20 100644 --- a/algo/blake/decred-gate.h +++ b/algo/blake/decred-gate.h @@ -18,7 +18,7 @@ // uint64_t *hashes_done ); #endif -#if defined(FOUR_WAY) && defined(__AVX__) +#if defined(__AVX2__) #define DECRED_4WAY #endif diff --git a/algo/blake/pentablake-4way.c b/algo/blake/pentablake-4way.c index 05a52bd0..35a2c6a1 100644 --- a/algo/blake/pentablake-4way.c +++ b/algo/blake/pentablake-4way.c @@ -1,6 +1,6 @@ #include "pentablake-gate.h" -#ifdef __AVX2__ +#if defined (__AVX2__) #include #include diff --git a/algo/blake/pentablake-gate.c b/algo/blake/pentablake-gate.c index b09b2b9b..b1942063 100644 --- a/algo/blake/pentablake-gate.c +++ b/algo/blake/pentablake-gate.c @@ -9,7 +9,7 @@ bool register_pentablake_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_pentablake; gate->hash = (void*)&pentablakehash; #endif - gate->optimizations = FOUR_WAY_OPT; + gate->optimizations = AVX2_OPT; gate->get_max64 = (void*)&get_max64_0x3ffff; return true; }; diff --git a/algo/blake/pentablake-gate.h 
b/algo/blake/pentablake-gate.h index 08ac744a..04aa2b48 100644 --- a/algo/blake/pentablake-gate.h +++ b/algo/blake/pentablake-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(FOUR_WAY) && defined(__AVX2__) +#if defined(__AVX2__) #define PENTABLAKE_4WAY #endif diff --git a/crypto/blake2s.c b/algo/blake/sph-blake2s.c similarity index 99% rename from crypto/blake2s.c rename to algo/blake/sph-blake2s.c index a002db43..a732910d 100644 --- a/crypto/blake2s.c +++ b/algo/blake/sph-blake2s.c @@ -16,7 +16,7 @@ #include #include "algo/sha/sph_types.h" -#include "crypto/blake2s.h" +#include "sph-blake2s.h" static const uint32_t blake2s_IV[8] = { diff --git a/crypto/blake2s.h b/algo/blake/sph-blake2s.h similarity index 100% rename from crypto/blake2s.h rename to algo/blake/sph-blake2s.h diff --git a/algo/bmw/bmw-hash-4way.c b/algo/bmw/bmw-hash-4way.c index 2e35e70c..39da2ce2 100644 --- a/algo/bmw/bmw-hash-4way.c +++ b/algo/bmw/bmw-hash-4way.c @@ -49,6 +49,11 @@ extern "C"{ // BMW256 +// BMW small has a bug not present in big. Lanes 0 & 2 produce valid hash +// while lanes 1 & 3 produce invalid hash. The cause is not known. + + + static const sph_u32 IV256[] = { SPH_C32(0x40414243), SPH_C32(0x44454647), SPH_C32(0x48494A4B), SPH_C32(0x4C4D4E4F), @@ -116,14 +121,16 @@ static const sph_u64 IV512[] = { mm_rotl_32( M[ ( (j) + (off) ) & 0xF ] , \ ( ( (j) + (off) ) & 0xF ) + 1 ) +// The multiplication in this macro is a possible cause of the lane +// corruption but a vectorized mullo did not help. #define add_elt_s( M, H, j ) \ _mm_xor_si128( \ _mm_add_epi32( \ _mm_sub_epi32( _mm_add_epi32( rol_off_32( M, j, 0 ), \ rol_off_32( M, j, 3 ) ), \ rol_off_32( M, j, 10 ) ), \ - _mm_set1_epi32( ( (j) + 16 ) * 0x05555555UL ) ), \ - H[ ( (j)+7 ) & 0xF ] ) + _mm_set1_epi32( ( (j) + 16 ) * 0x05555555UL ) \ + ), H[ ( (j)+7 ) & 0xF ] ) #define expand1s( qt, M, H, i ) \ @@ -160,7 +167,7 @@ static const sph_u64 IV512[] = { _mm_add_epi32( \ _mm_add_epi32( qt[ (i)-16 ], rs1( qt[ (i)-15 ] ) ), \ _mm_add_epi32( qt[ (i)-14 ], rs2( qt[ (i)-13 ] ) ) ), \ - _mm_add_epi64( \ + _mm_add_epi32( \ _mm_add_epi32( qt[ (i)-12 ], rs3( qt[ (i)-11 ] ) ), \ _mm_add_epi32( qt[ (i)-10 ], rs4( qt[ (i)- 9 ] ) ) ) ), \ _mm_add_epi32( \ @@ -861,7 +868,27 @@ void compress_big( const __m256i *M, const __m256i H[16], __m256i dH[16] ) } // BMW256 - +/* +static const uint32_t final_s[16][4] = +{ + { 0xaaaaaaa0, 0xaaaaaaa0, 0xaaaaaaa0, 0xaaaaaaa0 }, + { 0xaaaaaaa1, 0xaaaaaaa1, 0xaaaaaaa1, 0xaaaaaaa1 }, + { 0xaaaaaaa2, 0xaaaaaaa2, 0xaaaaaaa2, 0xaaaaaaa2 }, + { 0xaaaaaaa3, 0xaaaaaaa3, 0xaaaaaaa3, 0xaaaaaaa3 }, + { 0xaaaaaaa4, 0xaaaaaaa4, 0xaaaaaaa4, 0xaaaaaaa4 }, + { 0xaaaaaaa5, 0xaaaaaaa5, 0xaaaaaaa5, 0xaaaaaaa5 }, + { 0xaaaaaaa6, 0xaaaaaaa6, 0xaaaaaaa6, 0xaaaaaaa6 }, + { 0xaaaaaaa7, 0xaaaaaaa7, 0xaaaaaaa7, 0xaaaaaaa7 }, + { 0xaaaaaaa8, 0xaaaaaaa8, 0xaaaaaaa8, 0xaaaaaaa8 }, + { 0xaaaaaaa9, 0xaaaaaaa9, 0xaaaaaaa9, 0xaaaaaaa9 }, + { 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa, 0xaaaaaaaa }, + { 0xaaaaaaab, 0xaaaaaaab, 0xaaaaaaab, 0xaaaaaaab }, + { 0xaaaaaaac, 0xaaaaaaac, 0xaaaaaaac, 0xaaaaaaac }, + { 0xaaaaaaad, 0xaaaaaaad, 0xaaaaaaad, 0xaaaaaaad }, + { 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae, 0xaaaaaaae }, + { 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf, 0xaaaaaaaf } +}; +*/ static const __m128i final_s[16] = { { 0xaaaaaaa0aaaaaaa0, 0xaaaaaaa0aaaaaaa0 }, @@ -901,11 +928,12 @@ bmw32_4way(bmw_4way_small_context *sc, const void *data, size_t len) size_t ptr; const int buf_size = 64; // bytes of one lane, compatible with len - sc->bit_count += (sph_u64)len << 3; 
+ sc->bit_count += (sph_u32)len << 3; buf = sc->buf; ptr = sc->ptr; h1 = sc->H; h2 = htmp; + while ( len > 0 ) { size_t clen; @@ -938,13 +966,11 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n, __m128i *buf; __m128i h1[16], h2[16], *h; size_t ptr, u, v; - unsigned z; const int buf_size = 64; // bytes of one lane, compatible with len buf = sc->buf; ptr = sc->ptr; - z = 0x80 >> n; - buf[ ptr>>2 ] = _mm_set1_epi32( z ); + buf[ ptr>>2 ] = _mm_set1_epi32( 0x80 ); ptr += 4; h = sc->H; @@ -956,12 +982,15 @@ bmw32_4way_close(bmw_4way_small_context *sc, unsigned ub, unsigned n, ptr = 0; h = h1; } - memset_zero_128( buf + (ptr>>2), (buf_size - 4 - ptr) >> 2 ); - buf[ (buf_size - 4) >> 2 ] = _mm_set1_epi32( sc->bit_count + n ); + memset_zero_128( buf + (ptr>>2), (buf_size - 8 - ptr) >> 2 ); + buf[ (buf_size - 8) >> 2 ] = _mm_set1_epi32( sc->bit_count + n ); + buf[ (buf_size - 4) >> 2 ] = mm_zero; compress_small( buf, h, h2 ); + for ( u = 0; u < 16; u ++ ) buf[u] = h2[u]; - compress_small( buf, final_s, h1 ); + compress_small( buf, (__m128i*)final_s, h1 ); + for (u = 0, v = 16 - out_size_w32; u < out_size_w32; u ++, v ++) casti_m128i( dst, u ) = h1[v]; } diff --git a/algo/cryptonight/cryptonight-aesni.c b/algo/cryptonight/cryptonight-aesni.c index 28937031..e28d2cfb 100644 --- a/algo/cryptonight/cryptonight-aesni.c +++ b/algo/cryptonight/cryptonight-aesni.c @@ -3,7 +3,8 @@ #include "cryptonight.h" #include "miner.h" #include "crypto/c_keccak.h" -#include "avxdefs.h" +#include +//#include "avxdefs.h" void aesni_parallel_noxor(uint8_t *long_state, uint8_t *text, uint8_t *ExpandedKey); void aesni_parallel_xor(uint8_t *text, uint8_t *ExpandedKey, uint8_t *long_state); diff --git a/algo/cubehash/sse2/cubehash_sse2.c b/algo/cubehash/sse2/cubehash_sse2.c index 360934f4..ab36bff0 100644 --- a/algo/cubehash/sse2/cubehash_sse2.c +++ b/algo/cubehash/sse2/cubehash_sse2.c @@ -10,6 +10,10 @@ #endif #include "cubehash_sse2.h" #include "algo/sha/sha3-defs.h" +#include +#include +#include +#include "avxdefs.h" static void transform( cubehashParam *sp ) { @@ -125,6 +129,18 @@ static void transform( cubehashParam *sp ) #endif } // transform +// Cubehash context initialization is very expensive. +// Cache the initial value for faster reinitializing. +cubehashParam cube_ctx_cache __attribute__ ((aligned (64))); + +int cubehashReinit( cubehashParam *sp ) +{ + memcpy( sp, &cube_ctx_cache, sizeof(cubehashParam) ); + return SUCCESS; + +} + +// Initialize the cache then copy to sp. 
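+// Illustrative usage sketch (the parameter values below are examples only, +// not mandated by this change): pay the expensive init once, then reuse the +// cached state for every subsequent hash. +// cubehashParam ctx; +// cubehashInit( &ctx, 512, 16, 32 ); // runs the 10 setup transforms, fills cube_ctx_cache +// cubehashReinit( &ctx ); // later hashes: a single memcpy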
int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes) { int i; @@ -135,24 +151,26 @@ int cubehashInit(cubehashParam *sp, int hashbitlen, int rounds, int blockbytes) /* Sanity checks */ if ( rounds <= 0 || rounds > 32 ) - rounds = CUBEHASH_ROUNDS; + rounds = CUBEHASH_ROUNDS; if ( blockbytes <= 0 || blockbytes >= 256) - blockbytes = CUBEHASH_BLOCKBYTES; + blockbytes = CUBEHASH_BLOCKBYTES; // all sizes of __m128i - sp->hashlen = hashbitlen/128; - sp->blocksize = blockbytes/16; - sp->rounds = rounds; - sp->pos = 0; + cube_ctx_cache.hashlen = hashbitlen/128; + cube_ctx_cache.blocksize = blockbytes/16; + cube_ctx_cache.rounds = rounds; + cube_ctx_cache.pos = 0; for ( i = 0; i < 8; ++i ) - sp->x[i] = _mm_set_epi32(0, 0, 0, 0); + cube_ctx_cache.x[i] = _mm_setzero_si128();; - sp->x[0] = _mm_set_epi32( 0, rounds, blockbytes, hashbitlen / 8 ); + cube_ctx_cache.x[0] = _mm_set_epi32( 0, rounds, blockbytes, + hashbitlen / 8 ); for ( i = 0; i < 10; ++i ) - transform(sp); -// sp->pos = 0; + transform( &cube_ctx_cache ); + + memcpy( sp, &cube_ctx_cache, sizeof(cubehashParam) ); return SUCCESS; } diff --git a/algo/cubehash/sse2/cubehash_sse2.h b/algo/cubehash/sse2/cubehash_sse2.h index b672e28e..4e1eaa39 100644 --- a/algo/cubehash/sse2/cubehash_sse2.h +++ b/algo/cubehash/sse2/cubehash_sse2.h @@ -29,6 +29,8 @@ extern "C" { #endif int cubehashInit(cubehashParam* sp, int hashbitlen, int rounds, int blockbytes); +// reinitialize context with same parameters, much faster. +int cubehashReinit( cubehashParam* sp ); int cubehashUpdate(cubehashParam* sp, const byte *data, size_t size); diff --git a/algo/hamsi/hamsi-hash-4way.c b/algo/hamsi/hamsi-hash-4way.c new file mode 100644 index 00000000..412fcf30 --- /dev/null +++ b/algo/hamsi/hamsi-hash-4way.c @@ -0,0 +1,510 @@ +/* $Id: hamsi.c 251 2010-10-19 14:31:51Z tp $ */ +/* + * Hamsi implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include + +#include "hamsi-hash-4way.h" + +#if defined(__AVX__) + +#ifdef __cplusplus +extern "C"{ +#endif + +/* + * The SPH_HAMSI_EXPAND_* define how many input bits we handle in one + * table lookup during message expansion (1 to 8, inclusive). 
If we note + * w the number of bits per message word (w=32 for Hamsi-224/256, w=64 + * for Hamsi-384/512), r the size of a "row" in 32-bit words (r=8 for + * Hamsi-224/256, r=16 for Hamsi-384/512), and n the expansion level, + * then we will get t tables (where t=ceil(w/n)) of individual size + * 2^n*r*4 (in bytes). The last table may be shorter (e.g. with w=32 and + * n=5, there are 7 tables, but the last one uses only two bits on + * input, not five). + * + * Also, we read t rows of r words from RAM. Words in a given row are + * concatenated in RAM in that order, so most of the cost is about + * reading the first row word; comparatively, cache misses are thus + * less expensive with Hamsi-512 (r=16) than with Hamsi-256 (r=8). + * + * When n=1, tables are "special" in that we omit the first entry of + * each table (which always contains 0), so that total table size is + * halved. + * + * We thus have the following (size1 is the cumulative table size of + * Hamsi-224/256; size2 is for Hamsi-384/512; similarly, t1 and t2 + * are for Hamsi-224/256 and Hamsi-384/512, respectively). + * + * n size1 size2 t1 t2 + * --------------------------------------- + * 1 1024 4096 32 64 + * 2 2048 8192 16 32 + * 3 2688 10880 11 22 + * 4 4096 16384 8 16 + * 5 6272 25600 7 13 + * 6 10368 41984 6 11 + * 7 16896 73856 5 10 + * 8 32768 131072 4 8 + * + * So there is a trade-off: a lower n makes the tables fit better in + * L1 cache, but increases the number of memory accesses. The optimal + * value depends on the amount of available L1 cache and the relative + * impact of a cache miss. + * + * Experimentally, in ideal benchmark conditions (which are not necessarily + * realistic with regards to L1 cache contention), it seems that n=8 is + * the best value on "big" architectures (those with 32 kB or more of L1 + * cache), while n=4 is better on "small" architectures. This was tested + * on an Intel Core2 Q6600 (both 32-bit and 64-bit mode), a PowerPC G3 + * (32 kB L1 cache, hence "big"), and a MIPS-compatible Broadcom BCM3302 + * (8 kB L1 cache). + * + * Note: with n=1, the 32 tables (actually implemented as one big table) + * are read entirely and sequentially, regardless of the input data, + * thus avoiding any data-dependent table access pattern. 
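+ * + * Worked example from the formulas above: for Hamsi-384/512 (w=64, r=16) + * with n=8, t = ceil(64/8) = 8 tables of 2^8*16*4 = 16384 bytes each, + * for a total of 8*16384 = 131072 bytes, matching the t2 and size2 + * entries of the n=8 row.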
+ */ + +// Hard coded +//#define SPH_HAMSI_EXPAND_BIG 1 + +/* +#if !defined SPH_HAMSI_EXPAND_SMALL +#if SPH_SMALL_FOOTPRINT_HAMSI +#define SPH_HAMSI_EXPAND_SMALL 4 +#else +#define SPH_HAMSI_EXPAND_SMALL 8 +#endif +#endif + +#if !defined SPH_HAMSI_EXPAND_BIG +#define SPH_HAMSI_EXPAND_BIG 8 +#endif +*/ + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +#include "hamsi-helper-4way.c" + +static const sph_u32 IV512[] = { + SPH_C32(0x73746565), SPH_C32(0x6c706172), SPH_C32(0x6b204172), + SPH_C32(0x656e6265), SPH_C32(0x72672031), SPH_C32(0x302c2062), + SPH_C32(0x75732032), SPH_C32(0x3434362c), SPH_C32(0x20422d33), + SPH_C32(0x30303120), SPH_C32(0x4c657576), SPH_C32(0x656e2d48), + SPH_C32(0x65766572), SPH_C32(0x6c65652c), SPH_C32(0x2042656c), + SPH_C32(0x6769756d) +}; + +static const sph_u32 alpha_n[] = { + SPH_C32(0xff00f0f0), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0cccc), + SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), + SPH_C32(0xaaaacccc), SPH_C32(0xf0f0ff00), SPH_C32(0xf0f0cccc), + SPH_C32(0xaaaaff00), SPH_C32(0xccccff00), SPH_C32(0xaaaaf0f0), + SPH_C32(0xaaaaf0f0), SPH_C32(0xff00cccc), SPH_C32(0xccccf0f0), + SPH_C32(0xff00aaaa), SPH_C32(0xccccaaaa), SPH_C32(0xff00f0f0), + SPH_C32(0xff00aaaa), SPH_C32(0xf0f0cccc), SPH_C32(0xf0f0ff00), + SPH_C32(0xccccaaaa), SPH_C32(0xf0f0ff00), SPH_C32(0xaaaacccc), + SPH_C32(0xaaaaff00), SPH_C32(0xf0f0cccc), SPH_C32(0xaaaaf0f0), + SPH_C32(0xccccff00), SPH_C32(0xff00cccc), SPH_C32(0xaaaaf0f0), + SPH_C32(0xff00aaaa), SPH_C32(0xccccf0f0) +}; + +static const sph_u32 alpha_f[] = { + SPH_C32(0xcaf9639c), SPH_C32(0x0ff0f9c0), SPH_C32(0x639c0ff0), + SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), + SPH_C32(0xf9c00ff0), SPH_C32(0x639ccaf9), SPH_C32(0x639c0ff0), + SPH_C32(0xf9c0caf9), SPH_C32(0x0ff0caf9), SPH_C32(0xf9c0639c), + SPH_C32(0xf9c0639c), SPH_C32(0xcaf90ff0), SPH_C32(0x0ff0639c), + SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0f9c0), SPH_C32(0xcaf9639c), + SPH_C32(0xcaf9f9c0), SPH_C32(0x639c0ff0), SPH_C32(0x639ccaf9), + SPH_C32(0x0ff0f9c0), SPH_C32(0x639ccaf9), SPH_C32(0xf9c00ff0), + SPH_C32(0xf9c0caf9), SPH_C32(0x639c0ff0), SPH_C32(0xf9c0639c), + SPH_C32(0x0ff0caf9), SPH_C32(0xcaf90ff0), SPH_C32(0xf9c0639c), + SPH_C32(0xcaf9f9c0), SPH_C32(0x0ff0639c) +}; + +/* +#define s0 m0 +#define s1 m1 +#define s2 c0 +#define s3 c1 +#define s4 c2 +#define s5 c3 +#define s6 m2 +#define s7 m3 +#define s8 m4 +#define s9 m5 +#define sA c4 +#define sB c5 +#define sC c6 +#define sD c7 +#define sE m6 +#define sF m7 +*/ + +#define SBOX( a, b, c, d ) \ +do { \ + __m128i t; \ + t = a; \ + a = _mm_xor_si128( d, _mm_and_si128( a, c ) ); \ + c = _mm_xor_si128( a, _mm_xor_si128( c, b ) ); \ + d = _mm_xor_si128( b, _mm_or_si128( d, t ) ); \ + t = _mm_xor_si128( t, c ); \ + b = d; \ + d = _mm_xor_si128( a, _mm_or_si128( d, t ) ); \ + a = _mm_and_si128( a, b ); \ + t = _mm_xor_si128( t, a ); \ + b = _mm_xor_si128( t, _mm_xor_si128( b, d ) ); \ + a = c; \ + c = b; \ + b = d; \ + d = mm_not( t ); \ +} while (0) + +#define L( a, b, c, d ) \ +do { \ + a = mm_rotl_32( a, 13 ); \ + c = mm_rotl_32( c, 3 ); \ + b = _mm_xor_si128( b, _mm_xor_si128( a, c ) ); \ + d = _mm_xor_si128( d, _mm_xor_si128( c, _mm_slli_epi32( a, 3 ) ) ); \ + b = mm_rotl_32( b, 1 ); \ + d = mm_rotl_32( d, 7 ); \ + a = _mm_xor_si128( a, _mm_xor_si128( b, d ) ); \ + c = _mm_xor_si128( c, _mm_xor_si128( d, _mm_slli_epi32( b, 7 ) ) ); \ + a = mm_rotl_32( a, 5 ); \ + c = mm_rotl_32( c, 22 ); \ +} while (0) + +#define DECL_STATE_BIG \ + __m128i c0, c1, c2, c3, c4, c5, c6, c7; \ + __m128i c8, c9, cA, 
cB, cC, cD, cE, cF; + +#define READ_STATE_BIG(sc) do { \ + c0 = sc->h[0x0]; \ + c1 = sc->h[0x1]; \ + c2 = sc->h[0x2]; \ + c3 = sc->h[0x3]; \ + c4 = sc->h[0x4]; \ + c5 = sc->h[0x5]; \ + c6 = sc->h[0x6]; \ + c7 = sc->h[0x7]; \ + c8 = sc->h[0x8]; \ + c9 = sc->h[0x9]; \ + cA = sc->h[0xA]; \ + cB = sc->h[0xB]; \ + cC = sc->h[0xC]; \ + cD = sc->h[0xD]; \ + cE = sc->h[0xE]; \ + cF = sc->h[0xF]; \ + } while (0) + +#define WRITE_STATE_BIG(sc) do { \ + sc->h[0x0] = c0; \ + sc->h[0x1] = c1; \ + sc->h[0x2] = c2; \ + sc->h[0x3] = c3; \ + sc->h[0x4] = c4; \ + sc->h[0x5] = c5; \ + sc->h[0x6] = c6; \ + sc->h[0x7] = c7; \ + sc->h[0x8] = c8; \ + sc->h[0x9] = c9; \ + sc->h[0xA] = cA; \ + sc->h[0xB] = cB; \ + sc->h[0xC] = cC; \ + sc->h[0xD] = cD; \ + sc->h[0xE] = cE; \ + sc->h[0xF] = cF; \ + } while (0) + +#define s00 m0 +#define s01 m1 +#define s02 c0 +#define s03 c1 +#define s04 m2 +#define s05 m3 +#define s06 c2 +#define s07 c3 +#define s08 c4 +#define s09 c5 +#define s0A m4 +#define s0B m5 +#define s0C c6 +#define s0D c7 +#define s0E m6 +#define s0F m7 +#define s10 m8 +#define s11 m9 +#define s12 c8 +#define s13 c9 +#define s14 mA +#define s15 mB +#define s16 cA +#define s17 cB +#define s18 cC +#define s19 cD +#define s1A mC +#define s1B mD +#define s1C cE +#define s1D cF +#define s1E mE +#define s1F mF + +#define ROUND_BIG(rc, alpha) \ +do { \ + s00 = _mm_xor_si128( s00, _mm_set1_epi32( alpha[ 0x00 ] ) ); \ + s01 = _mm_xor_si128( s01, _mm_xor_si128( _mm_set1_epi32( alpha[ 0x01 ] ), \ + _mm_set1_epi32( rc ) ) ); \ + s02 = _mm_xor_si128( s02, _mm_set1_epi32( alpha[ 0x02 ] ) ); \ + s03 = _mm_xor_si128( s03, _mm_set1_epi32( alpha[ 0x03 ] ) ); \ + s04 = _mm_xor_si128( s04, _mm_set1_epi32( alpha[ 0x04 ] ) ); \ + s05 = _mm_xor_si128( s05, _mm_set1_epi32( alpha[ 0x05 ] ) ); \ + s06 = _mm_xor_si128( s06, _mm_set1_epi32( alpha[ 0x06 ] ) ); \ + s07 = _mm_xor_si128( s07, _mm_set1_epi32( alpha[ 0x07 ] ) ); \ + s08 = _mm_xor_si128( s08, _mm_set1_epi32( alpha[ 0x08 ] ) ); \ + s09 = _mm_xor_si128( s09, _mm_set1_epi32( alpha[ 0x09 ] ) ); \ + s0A = _mm_xor_si128( s0A, _mm_set1_epi32( alpha[ 0x0A ] ) ); \ + s0B = _mm_xor_si128( s0B, _mm_set1_epi32( alpha[ 0x0B ] ) ); \ + s0C = _mm_xor_si128( s0C, _mm_set1_epi32( alpha[ 0x0C ] ) ); \ + s0D = _mm_xor_si128( s0D, _mm_set1_epi32( alpha[ 0x0D ] ) ); \ + s0E = _mm_xor_si128( s0E, _mm_set1_epi32( alpha[ 0x0E ] ) ); \ + s0F = _mm_xor_si128( s0F, _mm_set1_epi32( alpha[ 0x0F ] ) ); \ + s10 = _mm_xor_si128( s10, _mm_set1_epi32( alpha[ 0x10 ] ) ); \ + s11 = _mm_xor_si128( s11, _mm_set1_epi32( alpha[ 0x11 ] ) ); \ + s12 = _mm_xor_si128( s12, _mm_set1_epi32( alpha[ 0x12 ] ) ); \ + s13 = _mm_xor_si128( s13, _mm_set1_epi32( alpha[ 0x13 ] ) ); \ + s14 = _mm_xor_si128( s14, _mm_set1_epi32( alpha[ 0x14 ] ) ); \ + s15 = _mm_xor_si128( s15, _mm_set1_epi32( alpha[ 0x15 ] ) ); \ + s16 = _mm_xor_si128( s16, _mm_set1_epi32( alpha[ 0x16 ] ) ); \ + s17 = _mm_xor_si128( s17, _mm_set1_epi32( alpha[ 0x17 ] ) ); \ + s18 = _mm_xor_si128( s18, _mm_set1_epi32( alpha[ 0x18 ] ) ); \ + s19 = _mm_xor_si128( s19, _mm_set1_epi32( alpha[ 0x19 ] ) ); \ + s1A = _mm_xor_si128( s1A, _mm_set1_epi32( alpha[ 0x1A ] ) ); \ + s1B = _mm_xor_si128( s1B, _mm_set1_epi32( alpha[ 0x1B ] ) ); \ + s1C = _mm_xor_si128( s1C, _mm_set1_epi32( alpha[ 0x1C ] ) ); \ + s1D = _mm_xor_si128( s1D, _mm_set1_epi32( alpha[ 0x1D ] ) ); \ + s1E = _mm_xor_si128( s1E, _mm_set1_epi32( alpha[ 0x1E ] ) ); \ + s1F = _mm_xor_si128( s1F, _mm_set1_epi32( alpha[ 0x1F ] ) ); \ + SBOX( s00, s08, s10, s18); \ + SBOX( s01, s09, s11, s19); \ + SBOX( s02, s0A, 
s12, s1A); \ + SBOX( s03, s0B, s13, s1B); \ + SBOX( s04, s0C, s14, s1C); \ + SBOX( s05, s0D, s15, s1D); \ + SBOX( s06, s0E, s16, s1E); \ + SBOX( s07, s0F, s17, s1F); \ + L( s00, s09, s12, s1B ); \ + L( s01, s0A, s13, s1C ); \ + L( s02, s0B, s14, s1D ); \ + L( s03, s0C, s15, s1E ); \ + L( s04, s0D, s16, s1F ); \ + L( s05, s0E, s17, s18 ); \ + L( s06, s0F, s10, s19 ); \ + L( s07, s08, s11, s1A ); \ + L( s00, s02, s05, s07 ); \ + L( s10, s13, s15, s16 ); \ + L( s09, s0B, s0C, s0E ); \ + L( s19, s1A, s1C, s1F ); \ +} while (0) + +#define P_BIG do { \ + ROUND_BIG(0, alpha_n); \ + ROUND_BIG(1, alpha_n); \ + ROUND_BIG(2, alpha_n); \ + ROUND_BIG(3, alpha_n); \ + ROUND_BIG(4, alpha_n); \ + ROUND_BIG(5, alpha_n); \ + } while (0) + +#define PF_BIG do { \ + ROUND_BIG(0, alpha_f); \ + ROUND_BIG(1, alpha_f); \ + ROUND_BIG(2, alpha_f); \ + ROUND_BIG(3, alpha_f); \ + ROUND_BIG(4, alpha_f); \ + ROUND_BIG(5, alpha_f); \ + ROUND_BIG(6, alpha_f); \ + ROUND_BIG(7, alpha_f); \ + ROUND_BIG(8, alpha_f); \ + ROUND_BIG(9, alpha_f); \ + ROUND_BIG(10, alpha_f); \ + ROUND_BIG(11, alpha_f); \ + } while (0) + +#define T_BIG \ +do { /* order is important */ \ + cF = _mm_xor_si128( sc->h[ 0xF ], s17 ); \ + cE = _mm_xor_si128( sc->h[ 0xE ], s16 ); \ + cD = _mm_xor_si128( sc->h[ 0xD ], s15 ); \ + cC = _mm_xor_si128( sc->h[ 0xC ], s14 ); \ + cB = _mm_xor_si128( sc->h[ 0xB ], s13 ); \ + cA = _mm_xor_si128( sc->h[ 0xA ], s12 ); \ + c9 = _mm_xor_si128( sc->h[ 0x9 ], s11 ); \ + c8 = _mm_xor_si128( sc->h[ 0x8 ], s10 ); \ + c7 = _mm_xor_si128( sc->h[ 0x7 ], s07 ); \ + c6 = _mm_xor_si128( sc->h[ 0x6 ], s06 ); \ + c5 = _mm_xor_si128( sc->h[ 0x5 ], s05 ); \ + c4 = _mm_xor_si128( sc->h[ 0x4 ], s04 ); \ + c3 = _mm_xor_si128( sc->h[ 0x3 ], s03 ); \ + c2 = _mm_xor_si128( sc->h[ 0x2 ], s02 ); \ + c1 = _mm_xor_si128( sc->h[ 0x1 ], s01 ); \ + c0 = _mm_xor_si128( sc->h[ 0x0 ], s00 ); \ +} while (0) + +void hamsi_big( hamsi_4way_big_context *sc, __m128i *buf, size_t num ) +{ + DECL_STATE_BIG + sph_u32 tmp; + + tmp = SPH_T32( (sph_u32)num << 6 ); + sc->count_low = SPH_T32( sc->count_low + tmp ); + sc->count_high += (sph_u32)( (num >> 13) >> 13 ); + if ( sc->count_low < tmp ) + sc->count_high++; + + READ_STATE_BIG( sc ); + + while ( num-- > 0 ) + { + __m128i m0, m1, m2, m3, m4, m5, m6, m7; + __m128i m8, m9, mA, mB, mC, mD, mE, mF; + + INPUT_BIG; + P_BIG; + T_BIG; + +// Strange kluge. Without the following WRITE_STATE the hash is bad. +// SPH doesn't do it. 
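+// A plausible explanation: this T_BIG reads sc->h[] directly, and sc->h[] +// is only updated by WRITE_STATE_BIG, so the new chaining value must be +// stored back before the next block's T_BIG uses it.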
+ WRITE_STATE_BIG( sc ); + buf += 2; + } + WRITE_STATE_BIG( sc ); +} + +void hamsi_big_final( hamsi_4way_big_context *sc, __m128i *buf ) +{ + __m128i m0, m1, m2, m3, m4, m5, m6, m7; + __m128i m8, m9, mA, mB, mC, mD, mE, mF; + DECL_STATE_BIG + + READ_STATE_BIG( sc ); + INPUT_BIG; + PF_BIG; + T_BIG; + WRITE_STATE_BIG( sc ); +} + +void hamsi_big_init( hamsi_4way_big_context *sc, const sph_u32 *iv ) +{ + sc->partial_len = 0; + sc->count_high = sc->count_low = 0; + for ( int i = 0; i < 16; i ++ ) + sc->h[i] = _mm_set1_epi32( iv[i] ); +} + +void hamsi_big_core( hamsi_4way_big_context *sc, const void *data, size_t len ) +{ + __m128i *vdata = (__m128i*)data; + + if ( sc->partial_len != 0 ) + { + size_t mlen; + + mlen = 8 - sc->partial_len; + if ( len < mlen ) + { + memcpy_128( sc->partial + (sc->partial_len >> 2), data, len>>2 ); + sc->partial_len += len; + return; + } + else + { + memcpy_128( sc->partial + (sc->partial_len >> 2), data, mlen>>2 ); + len -= mlen; + vdata += mlen>>2; + hamsi_big( sc, sc->partial, 1 ); + sc->partial_len = 0; + } + } + + hamsi_big( sc, vdata, len>>3 ); + vdata += ( (len& ~(size_t)7) >> 2 ); + len &= (size_t)7; + memcpy_128( sc->partial, vdata, len>>2 ); +} + +void hamsi_big_close( hamsi_4way_big_context *sc, void *dst, + size_t out_size_w32 ) +{ + __m128i pad[2]; + size_t ptr, u; + __m128i *out = (__m128i*)dst; + + ptr = sc->partial_len; + + pad[0] = mm_byteswap_32( _mm_set1_epi32( sc->count_high ) ); + pad[1] = mm_byteswap_32( _mm_set1_epi32( sc->count_low + (ptr << 3) ) ); + + sc->partial[ ptr>>2 ] = _mm_set1_epi32( 0x80UL ); + + if ( ptr < 8 ) + memset_zero_128( sc->partial + (ptr>>2) + 1, (8-ptr) >> 2 ); + + hamsi_big( sc, sc->partial, 1 ); + hamsi_big_final( sc, pad ); + + for ( u = 0; u < 16; u ++ ) + out[u] = mm_byteswap_32( sc->h[u] ); +} + +void hamsi512_4way_init( void *cc ) +{ + hamsi_big_init( cc, IV512 ); +} + +void hamsi512_4way( void *cc, const void *data, size_t len ) +{ + hamsi_big_core( cc, data, len ); +} + +void hamsi512_4way_close( void *cc, void *dst ) +{ + hamsi_big_close( cc, dst, 16 ); +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/algo/hamsi/hamsi-hash-4way.h b/algo/hamsi/hamsi-hash-4way.h new file mode 100644 index 00000000..10d0fbef --- /dev/null +++ b/algo/hamsi/hamsi-hash-4way.h @@ -0,0 +1,72 @@ +/* $Id: sph_hamsi.h 216 2010-06-08 09:46:57Z tp $ */ +/** + * Hamsi interface. This code implements Hamsi with the recommended + * parameters for SHA-3, with outputs of 224, 256, 384 and 512 bits. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_hamsi.h + * @author Thomas Pornin + */ + +#ifndef HAMSI_4WAY_H__ +#define HAMSI_4WAY_H__ + +#include +#include "algo/sha/sph_types.h" + +#if defined (__AVX__) + +#include "avxdefs.h" + +#ifdef __cplusplus extern "C"{ +#endif + +#define SPH_SIZE_hamsi512 512 + +typedef struct { + __m128i h[16]; + __m128i partial[2]; + size_t partial_len; + sph_u32 count_high, count_low; +} hamsi_4way_big_context; + +typedef hamsi_4way_big_context hamsi512_4way_context; + +void hamsi512_4way_init(void *cc); + +void hamsi512_4way(void *cc, const void *data, size_t len); + +void hamsi512_4way_close(void *cc, void *dst); + +#ifdef __cplusplus } +#endif + +#endif + +#endif diff --git a/algo/hamsi/hamsi-helper-4way.c b/algo/hamsi/hamsi-helper-4way.c new file mode 100644 index 00000000..309f3c53 --- /dev/null +++ b/algo/hamsi/hamsi-helper-4way.c @@ -0,0 +1,482 @@ +/* $Id: hamsi_helper.c 202 2010-05-31 15:46:48Z tp $ */ +/* + * Helper code for Hamsi (input block expansion). This code is + * automatically generated and includes precomputed tables for + * expansion code which handles 2 to 8 bits at a time. + * + * This file is included from hamsi.c, and is not meant to be compiled + * independently. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#ifdef __cplusplus extern "C"{ +#endif + +/* Note: this table lists bits within each byte from least + significant to most significant. 
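The expansion is linear over GF(2): T512 holds one precomputed 16-word row per input bit of the 8-byte message block (64 rows), and the expanded value for a block is the XOR of the rows selected by its set bits. 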
*/ +static const sph_u32 T512[64][16] = { + { SPH_C32(0xef0b0270), SPH_C32(0x3afd0000), SPH_C32(0x5dae0000), + SPH_C32(0x69490000), SPH_C32(0x9b0f3c06), SPH_C32(0x4405b5f9), + SPH_C32(0x66140a51), SPH_C32(0x924f5d0a), SPH_C32(0xc96b0030), + SPH_C32(0xe7250000), SPH_C32(0x2f840000), SPH_C32(0x264f0000), + SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), SPH_C32(0x509f6984), + SPH_C32(0x9e69af68) }, + { SPH_C32(0xc96b0030), SPH_C32(0xe7250000), SPH_C32(0x2f840000), + SPH_C32(0x264f0000), SPH_C32(0x08695bf9), SPH_C32(0x6dfcf137), + SPH_C32(0x509f6984), SPH_C32(0x9e69af68), SPH_C32(0x26600240), + SPH_C32(0xddd80000), SPH_C32(0x722a0000), SPH_C32(0x4f060000), + SPH_C32(0x936667ff), SPH_C32(0x29f944ce), SPH_C32(0x368b63d5), + SPH_C32(0x0c26f262) }, + { SPH_C32(0x145a3c00), SPH_C32(0xb9e90000), SPH_C32(0x61270000), + SPH_C32(0xf1610000), SPH_C32(0xce613d6c), SPH_C32(0xb0493d78), + SPH_C32(0x47a96720), SPH_C32(0xe18e24c5), SPH_C32(0x23671400), + SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), SPH_C32(0xfb750000), + SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), SPH_C32(0x02c40a3f), + SPH_C32(0xdc24e61f) }, + { SPH_C32(0x23671400), SPH_C32(0xc8b90000), SPH_C32(0xf4c70000), + SPH_C32(0xfb750000), SPH_C32(0x73cd2465), SPH_C32(0xf8a6a549), + SPH_C32(0x02c40a3f), SPH_C32(0xdc24e61f), SPH_C32(0x373d2800), + SPH_C32(0x71500000), SPH_C32(0x95e00000), SPH_C32(0x0a140000), + SPH_C32(0xbdac1909), SPH_C32(0x48ef9831), SPH_C32(0x456d6d1f), + SPH_C32(0x3daac2da) }, + { SPH_C32(0x54285c00), SPH_C32(0xeaed0000), SPH_C32(0xc5d60000), + SPH_C32(0xa1c50000), SPH_C32(0xb3a26770), SPH_C32(0x94a5c4e1), + SPH_C32(0x6bb0419d), SPH_C32(0x551b3782), SPH_C32(0x9cbb1800), + SPH_C32(0xb0d30000), SPH_C32(0x92510000), SPH_C32(0xed930000), + SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), SPH_C32(0x430633da), + SPH_C32(0x78cace29) }, + { SPH_C32(0x9cbb1800), SPH_C32(0xb0d30000), SPH_C32(0x92510000), + SPH_C32(0xed930000), SPH_C32(0x593a4345), SPH_C32(0xe114d5f4), + SPH_C32(0x430633da), SPH_C32(0x78cace29), SPH_C32(0xc8934400), + SPH_C32(0x5a3e0000), SPH_C32(0x57870000), SPH_C32(0x4c560000), + SPH_C32(0xea982435), SPH_C32(0x75b11115), SPH_C32(0x28b67247), + SPH_C32(0x2dd1f9ab) }, + { SPH_C32(0x29449c00), SPH_C32(0x64e70000), SPH_C32(0xf24b0000), + SPH_C32(0xc2f30000), SPH_C32(0x0ede4e8f), SPH_C32(0x56c23745), + SPH_C32(0xf3e04259), SPH_C32(0x8d0d9ec4), SPH_C32(0x466d0c00), + SPH_C32(0x08620000), SPH_C32(0xdd5d0000), SPH_C32(0xbadd0000), + SPH_C32(0x6a927942), SPH_C32(0x441f2b93), SPH_C32(0x218ace6f), + SPH_C32(0xbf2c0be2) }, + { SPH_C32(0x466d0c00), SPH_C32(0x08620000), SPH_C32(0xdd5d0000), + SPH_C32(0xbadd0000), SPH_C32(0x6a927942), SPH_C32(0x441f2b93), + SPH_C32(0x218ace6f), SPH_C32(0xbf2c0be2), SPH_C32(0x6f299000), + SPH_C32(0x6c850000), SPH_C32(0x2f160000), SPH_C32(0x782e0000), + SPH_C32(0x644c37cd), SPH_C32(0x12dd1cd6), SPH_C32(0xd26a8c36), + SPH_C32(0x32219526) }, + { SPH_C32(0xf6800005), SPH_C32(0x3443c000), SPH_C32(0x24070000), + SPH_C32(0x8f3d0000), SPH_C32(0x21373bfb), SPH_C32(0x0ab8d5ae), + SPH_C32(0xcdc58b19), SPH_C32(0xd795ba31), SPH_C32(0xa67f0001), + SPH_C32(0x71378000), SPH_C32(0x19fc0000), SPH_C32(0x96db0000), + SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), SPH_C32(0x2c6d478f), + SPH_C32(0xac8e6c88) }, + { SPH_C32(0xa67f0001), SPH_C32(0x71378000), SPH_C32(0x19fc0000), + SPH_C32(0x96db0000), SPH_C32(0x3a8b6dfd), SPH_C32(0xebcaaef3), + SPH_C32(0x2c6d478f), SPH_C32(0xac8e6c88), SPH_C32(0x50ff0004), + SPH_C32(0x45744000), SPH_C32(0x3dfb0000), SPH_C32(0x19e60000), + SPH_C32(0x1bbc5606), SPH_C32(0xe1727b5d), SPH_C32(0xe1a8cc96), + 
SPH_C32(0x7b1bd6b9) }, + { SPH_C32(0xf7750009), SPH_C32(0xcf3cc000), SPH_C32(0xc3d60000), + SPH_C32(0x04920000), SPH_C32(0x029519a9), SPH_C32(0xf8e836ba), + SPH_C32(0x7a87f14e), SPH_C32(0x9e16981a), SPH_C32(0xd46a0000), + SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), SPH_C32(0x4a290000), + SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), SPH_C32(0x98369604), + SPH_C32(0xf746c320) }, + { SPH_C32(0xd46a0000), SPH_C32(0x8dc8c000), SPH_C32(0xa5af0000), + SPH_C32(0x4a290000), SPH_C32(0xfc4e427a), SPH_C32(0xc9b4866c), + SPH_C32(0x98369604), SPH_C32(0xf746c320), SPH_C32(0x231f0009), + SPH_C32(0x42f40000), SPH_C32(0x66790000), SPH_C32(0x4ebb0000), + SPH_C32(0xfedb5bd3), SPH_C32(0x315cb0d6), SPH_C32(0xe2b1674a), + SPH_C32(0x69505b3a) }, + { SPH_C32(0x774400f0), SPH_C32(0xf15a0000), SPH_C32(0xf5b20000), + SPH_C32(0x34140000), SPH_C32(0x89377e8c), SPH_C32(0x5a8bec25), + SPH_C32(0x0bc3cd1e), SPH_C32(0xcf3775cb), SPH_C32(0xf46c0050), + SPH_C32(0x96180000), SPH_C32(0x14a50000), SPH_C32(0x031f0000), + SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), SPH_C32(0x9ca470d2), + SPH_C32(0x8a341574) }, + { SPH_C32(0xf46c0050), SPH_C32(0x96180000), SPH_C32(0x14a50000), + SPH_C32(0x031f0000), SPH_C32(0x42947eb8), SPH_C32(0x66bf7e19), + SPH_C32(0x9ca470d2), SPH_C32(0x8a341574), SPH_C32(0x832800a0), + SPH_C32(0x67420000), SPH_C32(0xe1170000), SPH_C32(0x370b0000), + SPH_C32(0xcba30034), SPH_C32(0x3c34923c), SPH_C32(0x9767bdcc), + SPH_C32(0x450360bf) }, + { SPH_C32(0xe8870170), SPH_C32(0x9d720000), SPH_C32(0x12db0000), + SPH_C32(0xd4220000), SPH_C32(0xf2886b27), SPH_C32(0xa921e543), + SPH_C32(0x4ef8b518), SPH_C32(0x618813b1), SPH_C32(0xb4370060), + SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), SPH_C32(0x5cae0000), + SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), SPH_C32(0x1b365f3d), + SPH_C32(0xf3d45758) }, + { SPH_C32(0xb4370060), SPH_C32(0x0c4c0000), SPH_C32(0x56c20000), + SPH_C32(0x5cae0000), SPH_C32(0x94541f3f), SPH_C32(0x3b3ef825), + SPH_C32(0x1b365f3d), SPH_C32(0xf3d45758), SPH_C32(0x5cb00110), + SPH_C32(0x913e0000), SPH_C32(0x44190000), SPH_C32(0x888c0000), + SPH_C32(0x66dc7418), SPH_C32(0x921f1d66), SPH_C32(0x55ceea25), + SPH_C32(0x925c44e9) }, + { SPH_C32(0x0c720000), SPH_C32(0x49e50f00), SPH_C32(0x42790000), + SPH_C32(0x5cea0000), SPH_C32(0x33aa301a), SPH_C32(0x15822514), + SPH_C32(0x95a34b7b), SPH_C32(0xb44b0090), SPH_C32(0xfe220000), + SPH_C32(0xa7580500), SPH_C32(0x25d10000), SPH_C32(0xf7600000), + SPH_C32(0x893178da), SPH_C32(0x1fd4f860), SPH_C32(0x4ed0a315), + SPH_C32(0xa123ff9f) }, + { SPH_C32(0xfe220000), SPH_C32(0xa7580500), SPH_C32(0x25d10000), + SPH_C32(0xf7600000), SPH_C32(0x893178da), SPH_C32(0x1fd4f860), + SPH_C32(0x4ed0a315), SPH_C32(0xa123ff9f), SPH_C32(0xf2500000), + SPH_C32(0xeebd0a00), SPH_C32(0x67a80000), SPH_C32(0xab8a0000), + SPH_C32(0xba9b48c0), SPH_C32(0x0a56dd74), SPH_C32(0xdb73e86e), + SPH_C32(0x1568ff0f) }, + { SPH_C32(0x45180000), SPH_C32(0xa5b51700), SPH_C32(0xf96a0000), + SPH_C32(0x3b480000), SPH_C32(0x1ecc142c), SPH_C32(0x231395d6), + SPH_C32(0x16bca6b0), SPH_C32(0xdf33f4df), SPH_C32(0xb83d0000), + SPH_C32(0x16710600), SPH_C32(0x379a0000), SPH_C32(0xf5b10000), + SPH_C32(0x228161ac), SPH_C32(0xae48f145), SPH_C32(0x66241616), + SPH_C32(0xc5c1eb3e) }, + { SPH_C32(0xb83d0000), SPH_C32(0x16710600), SPH_C32(0x379a0000), + SPH_C32(0xf5b10000), SPH_C32(0x228161ac), SPH_C32(0xae48f145), + SPH_C32(0x66241616), SPH_C32(0xc5c1eb3e), SPH_C32(0xfd250000), + SPH_C32(0xb3c41100), SPH_C32(0xcef00000), SPH_C32(0xcef90000), + SPH_C32(0x3c4d7580), SPH_C32(0x8d5b6493), SPH_C32(0x7098b0a6), + SPH_C32(0x1af21fe1) }, + { 
SPH_C32(0x75a40000), SPH_C32(0xc28b2700), SPH_C32(0x94a40000), + SPH_C32(0x90f50000), SPH_C32(0xfb7857e0), SPH_C32(0x49ce0bae), + SPH_C32(0x1767c483), SPH_C32(0xaedf667e), SPH_C32(0xd1660000), + SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), SPH_C32(0xf6940000), + SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), SPH_C32(0xb4431b17), + SPH_C32(0x857f3c2b) }, + { SPH_C32(0xd1660000), SPH_C32(0x1bbc0300), SPH_C32(0x9eec0000), + SPH_C32(0xf6940000), SPH_C32(0x03024527), SPH_C32(0xcf70fcf2), + SPH_C32(0xb4431b17), SPH_C32(0x857f3c2b), SPH_C32(0xa4c20000), + SPH_C32(0xd9372400), SPH_C32(0x0a480000), SPH_C32(0x66610000), + SPH_C32(0xf87a12c7), SPH_C32(0x86bef75c), SPH_C32(0xa324df94), + SPH_C32(0x2ba05a55) }, + { SPH_C32(0x75c90003), SPH_C32(0x0e10c000), SPH_C32(0xd1200000), + SPH_C32(0xbaea0000), SPH_C32(0x8bc42f3e), SPH_C32(0x8758b757), + SPH_C32(0xbb28761d), SPH_C32(0x00b72e2b), SPH_C32(0xeecf0001), + SPH_C32(0x6f564000), SPH_C32(0xf33e0000), SPH_C32(0xa79e0000), + SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), SPH_C32(0x4a3b40ba), + SPH_C32(0xfeabf254) }, + { SPH_C32(0xeecf0001), SPH_C32(0x6f564000), SPH_C32(0xf33e0000), + SPH_C32(0xa79e0000), SPH_C32(0xbdb57219), SPH_C32(0xb711ebc5), + SPH_C32(0x4a3b40ba), SPH_C32(0xfeabf254), SPH_C32(0x9b060002), + SPH_C32(0x61468000), SPH_C32(0x221e0000), SPH_C32(0x1d740000), + SPH_C32(0x36715d27), SPH_C32(0x30495c92), SPH_C32(0xf11336a7), + SPH_C32(0xfe1cdc7f) }, + { SPH_C32(0x86790000), SPH_C32(0x3f390002), SPH_C32(0xe19ae000), + SPH_C32(0x98560000), SPH_C32(0x9565670e), SPH_C32(0x4e88c8ea), + SPH_C32(0xd3dd4944), SPH_C32(0x161ddab9), SPH_C32(0x30b70000), + SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), SPH_C32(0x42c40000), + SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), SPH_C32(0x21afa1ea), + SPH_C32(0xb0a51834) }, + { SPH_C32(0x30b70000), SPH_C32(0xe5d00000), SPH_C32(0xf4f46000), + SPH_C32(0x42c40000), SPH_C32(0x63b83d6a), SPH_C32(0x78ba9460), + SPH_C32(0x21afa1ea), SPH_C32(0xb0a51834), SPH_C32(0xb6ce0000), + SPH_C32(0xdae90002), SPH_C32(0x156e8000), SPH_C32(0xda920000), + SPH_C32(0xf6dd5a64), SPH_C32(0x36325c8a), SPH_C32(0xf272e8ae), + SPH_C32(0xa6b8c28d) }, + { SPH_C32(0x14190000), SPH_C32(0x23ca003c), SPH_C32(0x50df0000), + SPH_C32(0x44b60000), SPH_C32(0x1b6c67b0), SPH_C32(0x3cf3ac75), + SPH_C32(0x61e610b0), SPH_C32(0xdbcadb80), SPH_C32(0xe3430000), + SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), SPH_C32(0xaa4e0000), + SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), SPH_C32(0x123db156), + SPH_C32(0x3a4e99d7) }, + { SPH_C32(0xe3430000), SPH_C32(0x3a4e0014), SPH_C32(0xf2c60000), + SPH_C32(0xaa4e0000), SPH_C32(0xdb1e42a6), SPH_C32(0x256bbe15), + SPH_C32(0x123db156), SPH_C32(0x3a4e99d7), SPH_C32(0xf75a0000), + SPH_C32(0x19840028), SPH_C32(0xa2190000), SPH_C32(0xeef80000), + SPH_C32(0xc0722516), SPH_C32(0x19981260), SPH_C32(0x73dba1e6), + SPH_C32(0xe1844257) }, + { SPH_C32(0x54500000), SPH_C32(0x0671005c), SPH_C32(0x25ae0000), + SPH_C32(0x6a1e0000), SPH_C32(0x2ea54edf), SPH_C32(0x664e8512), + SPH_C32(0xbfba18c3), SPH_C32(0x7e715d17), SPH_C32(0xbc8d0000), + SPH_C32(0xfc3b0018), SPH_C32(0x19830000), SPH_C32(0xd10b0000), + SPH_C32(0xae1878c4), SPH_C32(0x42a69856), SPH_C32(0x0012da37), + SPH_C32(0x2c3b504e) }, + { SPH_C32(0xbc8d0000), SPH_C32(0xfc3b0018), SPH_C32(0x19830000), + SPH_C32(0xd10b0000), SPH_C32(0xae1878c4), SPH_C32(0x42a69856), + SPH_C32(0x0012da37), SPH_C32(0x2c3b504e), SPH_C32(0xe8dd0000), + SPH_C32(0xfa4a0044), SPH_C32(0x3c2d0000), SPH_C32(0xbb150000), + SPH_C32(0x80bd361b), SPH_C32(0x24e81d44), SPH_C32(0xbfa8c2f4), + SPH_C32(0x524a0d59) }, + { SPH_C32(0x69510000), 
SPH_C32(0xd4e1009c), SPH_C32(0xc3230000), + SPH_C32(0xac2f0000), SPH_C32(0xe4950bae), SPH_C32(0xcea415dc), + SPH_C32(0x87ec287c), SPH_C32(0xbce1a3ce), SPH_C32(0xc6730000), + SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), SPH_C32(0x218d0000), + SPH_C32(0x23111587), SPH_C32(0x7913512f), SPH_C32(0x1d28ac88), + SPH_C32(0x378dd173) }, + { SPH_C32(0xc6730000), SPH_C32(0xaf8d000c), SPH_C32(0xa4c10000), + SPH_C32(0x218d0000), SPH_C32(0x23111587), SPH_C32(0x7913512f), + SPH_C32(0x1d28ac88), SPH_C32(0x378dd173), SPH_C32(0xaf220000), + SPH_C32(0x7b6c0090), SPH_C32(0x67e20000), SPH_C32(0x8da20000), + SPH_C32(0xc7841e29), SPH_C32(0xb7b744f3), SPH_C32(0x9ac484f4), + SPH_C32(0x8b6c72bd) }, + { SPH_C32(0xcc140000), SPH_C32(0xa5630000), SPH_C32(0x5ab90780), + SPH_C32(0x3b500000), SPH_C32(0x4bd013ff), SPH_C32(0x879b3418), + SPH_C32(0x694348c1), SPH_C32(0xca5a87fe), SPH_C32(0x819e0000), + SPH_C32(0xec570000), SPH_C32(0x66320280), SPH_C32(0x95f30000), + SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), SPH_C32(0xe65aa22d), + SPH_C32(0x8e67b7fa) }, + { SPH_C32(0x819e0000), SPH_C32(0xec570000), SPH_C32(0x66320280), + SPH_C32(0x95f30000), SPH_C32(0x5da92802), SPH_C32(0x48f43cbc), + SPH_C32(0xe65aa22d), SPH_C32(0x8e67b7fa), SPH_C32(0x4d8a0000), + SPH_C32(0x49340000), SPH_C32(0x3c8b0500), SPH_C32(0xaea30000), + SPH_C32(0x16793bfd), SPH_C32(0xcf6f08a4), SPH_C32(0x8f19eaec), + SPH_C32(0x443d3004) }, + { SPH_C32(0x78230000), SPH_C32(0x12fc0000), SPH_C32(0xa93a0b80), + SPH_C32(0x90a50000), SPH_C32(0x713e2879), SPH_C32(0x7ee98924), + SPH_C32(0xf08ca062), SPH_C32(0x636f8bab), SPH_C32(0x02af0000), + SPH_C32(0xb7280000), SPH_C32(0xba1c0300), SPH_C32(0x56980000), + SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), SPH_C32(0xa95c149a), + SPH_C32(0xf4f6ea7b) }, + { SPH_C32(0x02af0000), SPH_C32(0xb7280000), SPH_C32(0xba1c0300), + SPH_C32(0x56980000), SPH_C32(0xba8d45d3), SPH_C32(0x8048c667), + SPH_C32(0xa95c149a), SPH_C32(0xf4f6ea7b), SPH_C32(0x7a8c0000), + SPH_C32(0xa5d40000), SPH_C32(0x13260880), SPH_C32(0xc63d0000), + SPH_C32(0xcbb36daa), SPH_C32(0xfea14f43), SPH_C32(0x59d0b4f8), + SPH_C32(0x979961d0) }, + { SPH_C32(0xac480000), SPH_C32(0x1ba60000), SPH_C32(0x45fb1380), + SPH_C32(0x03430000), SPH_C32(0x5a85316a), SPH_C32(0x1fb250b6), + SPH_C32(0xfe72c7fe), SPH_C32(0x91e478f6), SPH_C32(0x1e4e0000), + SPH_C32(0xdecf0000), SPH_C32(0x6df80180), SPH_C32(0x77240000), + SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), SPH_C32(0xcda31812), + SPH_C32(0x98aa496e) }, + { SPH_C32(0x1e4e0000), SPH_C32(0xdecf0000), SPH_C32(0x6df80180), + SPH_C32(0x77240000), SPH_C32(0xec47079e), SPH_C32(0xf4a0694e), + SPH_C32(0xcda31812), SPH_C32(0x98aa496e), SPH_C32(0xb2060000), + SPH_C32(0xc5690000), SPH_C32(0x28031200), SPH_C32(0x74670000), + SPH_C32(0xb6c236f4), SPH_C32(0xeb1239f8), SPH_C32(0x33d1dfec), + SPH_C32(0x094e3198) }, + { SPH_C32(0xaec30000), SPH_C32(0x9c4f0001), SPH_C32(0x79d1e000), + SPH_C32(0x2c150000), SPH_C32(0x45cc75b3), SPH_C32(0x6650b736), + SPH_C32(0xab92f78f), SPH_C32(0xa312567b), SPH_C32(0xdb250000), + SPH_C32(0x09290000), SPH_C32(0x49aac000), SPH_C32(0x81e10000), + SPH_C32(0xcafe6b59), SPH_C32(0x42793431), SPH_C32(0x43566b76), + SPH_C32(0xe86cba2e) }, + { SPH_C32(0xdb250000), SPH_C32(0x09290000), SPH_C32(0x49aac000), + SPH_C32(0x81e10000), SPH_C32(0xcafe6b59), SPH_C32(0x42793431), + SPH_C32(0x43566b76), SPH_C32(0xe86cba2e), SPH_C32(0x75e60000), + SPH_C32(0x95660001), SPH_C32(0x307b2000), SPH_C32(0xadf40000), + SPH_C32(0x8f321eea), SPH_C32(0x24298307), SPH_C32(0xe8c49cf9), + SPH_C32(0x4b7eec55) }, + { SPH_C32(0x58430000), SPH_C32(0x807e0000), 
SPH_C32(0x78330001), + SPH_C32(0xc66b3800), SPH_C32(0xe7375cdc), SPH_C32(0x79ad3fdd), + SPH_C32(0xac73fe6f), SPH_C32(0x3a4479b1), SPH_C32(0x1d5a0000), + SPH_C32(0x2b720000), SPH_C32(0x488d0000), SPH_C32(0xaf611800), + SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), SPH_C32(0x81a20429), + SPH_C32(0x1e7536a6) }, + { SPH_C32(0x1d5a0000), SPH_C32(0x2b720000), SPH_C32(0x488d0000), + SPH_C32(0xaf611800), SPH_C32(0x25cb2ec5), SPH_C32(0xc879bfd0), + SPH_C32(0x81a20429), SPH_C32(0x1e7536a6), SPH_C32(0x45190000), + SPH_C32(0xab0c0000), SPH_C32(0x30be0001), SPH_C32(0x690a2000), + SPH_C32(0xc2fc7219), SPH_C32(0xb1d4800d), SPH_C32(0x2dd1fa46), + SPH_C32(0x24314f17) }, + { SPH_C32(0xa53b0000), SPH_C32(0x14260000), SPH_C32(0x4e30001e), + SPH_C32(0x7cae0000), SPH_C32(0x8f9e0dd5), SPH_C32(0x78dfaa3d), + SPH_C32(0xf73168d8), SPH_C32(0x0b1b4946), SPH_C32(0x07ed0000), + SPH_C32(0xb2500000), SPH_C32(0x8774000a), SPH_C32(0x970d0000), + SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), SPH_C32(0xf4786222), + SPH_C32(0x9075b1ce) }, + { SPH_C32(0x07ed0000), SPH_C32(0xb2500000), SPH_C32(0x8774000a), + SPH_C32(0x970d0000), SPH_C32(0x437223ae), SPH_C32(0x48c76ea4), + SPH_C32(0xf4786222), SPH_C32(0x9075b1ce), SPH_C32(0xa2d60000), + SPH_C32(0xa6760000), SPH_C32(0xc9440014), SPH_C32(0xeba30000), + SPH_C32(0xccec2e7b), SPH_C32(0x3018c499), SPH_C32(0x03490afa), + SPH_C32(0x9b6ef888) }, + { SPH_C32(0x88980000), SPH_C32(0x1f940000), SPH_C32(0x7fcf002e), + SPH_C32(0xfb4e0000), SPH_C32(0xf158079a), SPH_C32(0x61ae9167), + SPH_C32(0xa895706c), SPH_C32(0xe6107494), SPH_C32(0x0bc20000), + SPH_C32(0xdb630000), SPH_C32(0x7e88000c), SPH_C32(0x15860000), + SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), SPH_C32(0xf460449e), + SPH_C32(0xd8b61463) }, + { SPH_C32(0x0bc20000), SPH_C32(0xdb630000), SPH_C32(0x7e88000c), + SPH_C32(0x15860000), SPH_C32(0x91fd48f3), SPH_C32(0x7581bb43), + SPH_C32(0xf460449e), SPH_C32(0xd8b61463), SPH_C32(0x835a0000), + SPH_C32(0xc4f70000), SPH_C32(0x01470022), SPH_C32(0xeec80000), + SPH_C32(0x60a54f69), SPH_C32(0x142f2a24), SPH_C32(0x5cf534f2), + SPH_C32(0x3ea660f7) }, + { SPH_C32(0x52500000), SPH_C32(0x29540000), SPH_C32(0x6a61004e), + SPH_C32(0xf0ff0000), SPH_C32(0x9a317eec), SPH_C32(0x452341ce), + SPH_C32(0xcf568fe5), SPH_C32(0x5303130f), SPH_C32(0x538d0000), + SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), SPH_C32(0x56ff0000), + SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), SPH_C32(0xa9444018), + SPH_C32(0x7f975691) }, + { SPH_C32(0x538d0000), SPH_C32(0xa9fc0000), SPH_C32(0x9ef70006), + SPH_C32(0x56ff0000), SPH_C32(0x0ae4004e), SPH_C32(0x92c5cdf9), + SPH_C32(0xa9444018), SPH_C32(0x7f975691), SPH_C32(0x01dd0000), + SPH_C32(0x80a80000), SPH_C32(0xf4960048), SPH_C32(0xa6000000), + SPH_C32(0x90d57ea2), SPH_C32(0xd7e68c37), SPH_C32(0x6612cffd), + SPH_C32(0x2c94459e) }, + { SPH_C32(0xe6280000), SPH_C32(0x4c4b0000), SPH_C32(0xa8550000), + SPH_C32(0xd3d002e0), SPH_C32(0xd86130b8), SPH_C32(0x98a7b0da), + SPH_C32(0x289506b4), SPH_C32(0xd75a4897), SPH_C32(0xf0c50000), + SPH_C32(0x59230000), SPH_C32(0x45820000), SPH_C32(0xe18d00c0), + SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), SPH_C32(0xcbe0fe1c), + SPH_C32(0x56a7b19f) }, + { SPH_C32(0xf0c50000), SPH_C32(0x59230000), SPH_C32(0x45820000), + SPH_C32(0xe18d00c0), SPH_C32(0x3b6d0631), SPH_C32(0xc2ed5699), + SPH_C32(0xcbe0fe1c), SPH_C32(0x56a7b19f), SPH_C32(0x16ed0000), + SPH_C32(0x15680000), SPH_C32(0xedd70000), SPH_C32(0x325d0220), + SPH_C32(0xe30c3689), SPH_C32(0x5a4ae643), SPH_C32(0xe375f8a8), + SPH_C32(0x81fdf908) }, + { SPH_C32(0xb4310000), SPH_C32(0x77330000), SPH_C32(0xb15d0000), + 
SPH_C32(0x7fd004e0), SPH_C32(0x78a26138), SPH_C32(0xd116c35d), + SPH_C32(0xd256d489), SPH_C32(0x4e6f74de), SPH_C32(0xe3060000), + SPH_C32(0xbdc10000), SPH_C32(0x87130000), SPH_C32(0xbff20060), + SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), SPH_C32(0x73c5ab06), + SPH_C32(0x5bd61539) }, + { SPH_C32(0xe3060000), SPH_C32(0xbdc10000), SPH_C32(0x87130000), + SPH_C32(0xbff20060), SPH_C32(0x2eba0a1a), SPH_C32(0x8db53751), + SPH_C32(0x73c5ab06), SPH_C32(0x5bd61539), SPH_C32(0x57370000), + SPH_C32(0xcaf20000), SPH_C32(0x364e0000), SPH_C32(0xc0220480), + SPH_C32(0x56186b22), SPH_C32(0x5ca3f40c), SPH_C32(0xa1937f8f), + SPH_C32(0x15b961e7) }, + { SPH_C32(0x02f20000), SPH_C32(0xa2810000), SPH_C32(0x873f0000), + SPH_C32(0xe36c7800), SPH_C32(0x1e1d74ef), SPH_C32(0x073d2bd6), + SPH_C32(0xc4c23237), SPH_C32(0x7f32259e), SPH_C32(0xbadd0000), + SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), SPH_C32(0xf7282800), + SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), SPH_C32(0xea5a8d14), + SPH_C32(0x2a2c18f0) }, + { SPH_C32(0xbadd0000), SPH_C32(0x13ad0000), SPH_C32(0xb7e70000), + SPH_C32(0xf7282800), SPH_C32(0xdf45144d), SPH_C32(0x361ac33a), + SPH_C32(0xea5a8d14), SPH_C32(0x2a2c18f0), SPH_C32(0xb82f0000), + SPH_C32(0xb12c0000), SPH_C32(0x30d80000), SPH_C32(0x14445000), + SPH_C32(0xc15860a2), SPH_C32(0x3127e8ec), SPH_C32(0x2e98bf23), + SPH_C32(0x551e3d6e) }, + { SPH_C32(0x1e6c0000), SPH_C32(0xc4420000), SPH_C32(0x8a2e0000), + SPH_C32(0xbcb6b800), SPH_C32(0x2c4413b6), SPH_C32(0x8bfdd3da), + SPH_C32(0x6a0c1bc8), SPH_C32(0xb99dc2eb), SPH_C32(0x92560000), + SPH_C32(0x1eda0000), SPH_C32(0xea510000), SPH_C32(0xe8b13000), + SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), SPH_C32(0xb15c2254), + SPH_C32(0x33c5244f) }, + { SPH_C32(0x92560000), SPH_C32(0x1eda0000), SPH_C32(0xea510000), + SPH_C32(0xe8b13000), SPH_C32(0xa93556a5), SPH_C32(0xebfb6199), + SPH_C32(0xb15c2254), SPH_C32(0x33c5244f), SPH_C32(0x8c3a0000), + SPH_C32(0xda980000), SPH_C32(0x607f0000), SPH_C32(0x54078800), + SPH_C32(0x85714513), SPH_C32(0x6006b243), SPH_C32(0xdb50399c), + SPH_C32(0x8a58e6a4) }, + { SPH_C32(0x033d0000), SPH_C32(0x08b30000), SPH_C32(0xf33a0000), + SPH_C32(0x3ac20007), SPH_C32(0x51298a50), SPH_C32(0x6b6e661f), + SPH_C32(0x0ea5cfe3), SPH_C32(0xe6da7ffe), SPH_C32(0xa8da0000), + SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), SPH_C32(0x07da0002), + SPH_C32(0x7d669583), SPH_C32(0x1f98708a), SPH_C32(0xbb668808), + SPH_C32(0xda878000) }, + { SPH_C32(0xa8da0000), SPH_C32(0x96be0000), SPH_C32(0x5c1d0000), + SPH_C32(0x07da0002), SPH_C32(0x7d669583), SPH_C32(0x1f98708a), + SPH_C32(0xbb668808), SPH_C32(0xda878000), SPH_C32(0xabe70000), + SPH_C32(0x9e0d0000), SPH_C32(0xaf270000), SPH_C32(0x3d180005), + SPH_C32(0x2c4f1fd3), SPH_C32(0x74f61695), SPH_C32(0xb5c347eb), + SPH_C32(0x3c5dfffe) }, + { SPH_C32(0x01930000), SPH_C32(0xe7820000), SPH_C32(0xedfb0000), + SPH_C32(0xcf0c000b), SPH_C32(0x8dd08d58), SPH_C32(0xbca3b42e), + SPH_C32(0x063661e1), SPH_C32(0x536f9e7b), SPH_C32(0x92280000), + SPH_C32(0xdc850000), SPH_C32(0x57fa0000), SPH_C32(0x56dc0003), + SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), SPH_C32(0x90cef752), + SPH_C32(0x7b1675d7) }, + { SPH_C32(0x92280000), SPH_C32(0xdc850000), SPH_C32(0x57fa0000), + SPH_C32(0x56dc0003), SPH_C32(0xbae92316), SPH_C32(0x5aefa30c), + SPH_C32(0x90cef752), SPH_C32(0x7b1675d7), SPH_C32(0x93bb0000), + SPH_C32(0x3b070000), SPH_C32(0xba010000), SPH_C32(0x99d00008), + SPH_C32(0x3739ae4e), SPH_C32(0xe64c1722), SPH_C32(0x96f896b3), + SPH_C32(0x2879ebac) }, + { SPH_C32(0x5fa80000), SPH_C32(0x56030000), SPH_C32(0x43ae0000), + SPH_C32(0x64f30013), 
SPH_C32(0x257e86bf), SPH_C32(0x1311944e), + SPH_C32(0x541e95bf), SPH_C32(0x8ea4db69), SPH_C32(0x00440000), + SPH_C32(0x7f480000), SPH_C32(0xda7c0000), SPH_C32(0x2a230001), + SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), SPH_C32(0x030a9e60), + SPH_C32(0xbe0a679e) }, + { SPH_C32(0x00440000), SPH_C32(0x7f480000), SPH_C32(0xda7c0000), + SPH_C32(0x2a230001), SPH_C32(0x3badc9cc), SPH_C32(0xa9b69c87), + SPH_C32(0x030a9e60), SPH_C32(0xbe0a679e), SPH_C32(0x5fec0000), + SPH_C32(0x294b0000), SPH_C32(0x99d20000), SPH_C32(0x4ed00012), + SPH_C32(0x1ed34f73), SPH_C32(0xbaa708c9), SPH_C32(0x57140bdf), + SPH_C32(0x30aebcf7) }, + { SPH_C32(0xee930000), SPH_C32(0xd6070000), SPH_C32(0x92c10000), + SPH_C32(0x2b9801e0), SPH_C32(0x9451287c), SPH_C32(0x3b6cfb57), + SPH_C32(0x45312374), SPH_C32(0x201f6a64), SPH_C32(0x7b280000), + SPH_C32(0x57420000), SPH_C32(0xa9e50000), SPH_C32(0x634300a0), + SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), SPH_C32(0x27f83b03), + SPH_C32(0xc7ff60f0) }, + { SPH_C32(0x7b280000), SPH_C32(0x57420000), SPH_C32(0xa9e50000), + SPH_C32(0x634300a0), SPH_C32(0x9edb442f), SPH_C32(0x6d9995bb), + SPH_C32(0x27f83b03), SPH_C32(0xc7ff60f0), SPH_C32(0x95bb0000), + SPH_C32(0x81450000), SPH_C32(0x3b240000), SPH_C32(0x48db0140), + SPH_C32(0x0a8a6c53), SPH_C32(0x56f56eec), SPH_C32(0x62c91877), + SPH_C32(0xe7e00a94) } +}; + +#define U_BIG( n ) \ +do { \ + __m128i db = buf[n]; \ + for ( int u = 0; u < 32; u++ ) \ + { \ + __m128i dm = mm_negate_32( _mm_and_si128( db, mm_one_32 ) ); \ + m0 = _mm_xor_si128( m0, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ + m1 = _mm_xor_si128( m1, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ + m2 = _mm_xor_si128( m2, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ + m3 = _mm_xor_si128( m3, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ + m4 = _mm_xor_si128( m4, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ + m5 = _mm_xor_si128( m5, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ + m6 = _mm_xor_si128( m6, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ + m7 = _mm_xor_si128( m7, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ + m8 = _mm_xor_si128( m8, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ + m9 = _mm_xor_si128( m9, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ + mA = _mm_xor_si128( mA, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ + mB = _mm_xor_si128( mB, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ + mC = _mm_xor_si128( mC, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ + mD = _mm_xor_si128( mD, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ + mE = _mm_xor_si128( mE, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ + mF = _mm_xor_si128( mF, _mm_and_si128( dm, _mm_set1_epi32( *tp++ ) ) ); \ + db = _mm_srli_epi32( db, 1 ); \ + } \ +} while (0); + +#define INPUT_BIG \ +do { \ + const sph_u32 *tp = &T512[0][0]; \ + m0 = mm_zero; \ + m1 = mm_zero; \ + m2 = mm_zero; \ + m3 = mm_zero; \ + m4 = mm_zero; \ + m5 = mm_zero; \ + m6 = mm_zero; \ + m7 = mm_zero; \ + m8 = mm_zero; \ + m9 = mm_zero; \ + mA = mm_zero; \ + mB = mm_zero; \ + mC = mm_zero; \ + mD = mm_zero; \ + mE = mm_zero; \ + mF = mm_zero; \ + U_BIG( 0 ); \ + U_BIG( 1 ); \ +} while (0) + +#ifdef __cplusplus +} +#endif diff --git a/algo/haval/haval-4way-helper.c b/algo/haval/haval-4way-helper.c new file mode 100644 index 00000000..87de1dec --- /dev/null +++ b/algo/haval/haval-4way-helper.c @@ -0,0 +1,115 @@ +/* $Id: haval_helper.c 218 2010-06-08 17:06:34Z tp $ */ +/* + * Helper code, included (three times !) by HAVAL implementation. 
+ * + * TODO: try to merge this with md_helper.c. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#undef SPH_XCAT +#define SPH_XCAT(a, b) SPH_XCAT_(a, b) +#undef SPH_XCAT_ +#define SPH_XCAT_(a, b) a ## b + +static void +SPH_XCAT(SPH_XCAT(haval, PASSES), _4way) +( haval_4way_context *sc, const void *data, size_t len ) +{ + __m128i *vdata = (__m128i*)data; + unsigned current; + + current = (unsigned)sc->count_low & 127U; + while ( len > 0 ) + { + unsigned clen; + sph_u32 clow, clow2; + + clen = 128U - current; + if ( clen > len ) + clen = len; + memcpy_128( sc->buf + (current>>2), vdata, clen>>2 ); + vdata += clen>>2; + current += clen; + len -= clen; + if ( current == 128U ) + { + DSTATE; + IN_PREPARE(sc->buf); + RSTATE; + SPH_XCAT(CORE, PASSES)(INW); + WSTATE; + current = 0; + } + clow = sc->count_low; + clow2 = SPH_T32(clow + clen); + sc->count_low = clow2; + if ( clow2 < clow ) + sc->count_high ++; + } +} + +static void +SPH_XCAT(SPH_XCAT(haval, PASSES), _4way_close)( haval_4way_context *sc, + void *dst) +{ + unsigned current; + DSTATE; + + current = (unsigned)sc->count_low & 127UL; + + sc->buf[ current>>2 ] = mm_one_32; + current += 4; + RSTATE; + if ( current > 116UL ) + { + memset_zero_128( sc->buf + ( current>>2 ), (128UL-current) >> 2 ); + do + { + IN_PREPARE(sc->buf); + SPH_XCAT(CORE, PASSES)(INW); + } while (0); + current = 0; + } + + uint32_t t1, t2; + memset_zero_128( sc->buf + ( current>>2 ), (116UL-current) >> 2 ); + t1 = 0x01 | (PASSES << 3); + t2 = sc->olen << 3; + sc->buf[ 116>>2 ] = _mm_set1_epi32( ( t1 << 16 ) | ( t2 << 24 ) ); + sc->buf[ 120>>2 ] = _mm_set1_epi32( sc->count_low << 3 ); + sc->buf[ 124>>2 ] = _mm_set1_epi32( (sc->count_high << 3) + | (sc->count_low >> 29) ); + do + { + IN_PREPARE(sc->buf); + SPH_XCAT(CORE, PASSES)(INW); + } while (0); + WSTATE; + haval_4way_out( sc, dst ); +} diff --git a/algo/haval/haval-hash-4way.c b/algo/haval/haval-hash-4way.c new file mode 100644 index 00000000..64f2a447 --- /dev/null +++ b/algo/haval/haval-hash-4way.c @@ -0,0 +1,522 @@ +/* $Id: haval.c 227 2010-06-16 17:28:38Z tp $ */ +/* + * HAVAL implementation. 
+ * + * The HAVAL reference paper is of questionable clarity with regards to + * some details such as endianness of bits within a byte, bytes within + * a 32-bit word, or the actual ordering of words within a stream of + * words. This implementation has been made compatible with the reference + * implementation available on: http://labs.calyptix.com/haval.php + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include +#include "haval-hash-4way.h" + +#if defined (__AVX__) + +#ifdef __cplusplus +extern "C"{ +#endif + +//#if SPH_SMALL_FOOTPRINT && !defined SPH_SMALL_FOOTPRINT_HAVAL +#define SPH_SMALL_FOOTPRINT_HAVAL 1 +//#endif + +#define F1(x6, x5, x4, x3, x2, x1, x0) \ + _mm_xor_si128( x0, \ + _mm_xor_si128( _mm_and_si128(_mm_xor_si128( x0, x4 ), x1 ), \ + _mm_xor_si128( _mm_and_si128( x2, x5 ), \ + _mm_and_si128( x3, x6 ) ) ) ) \ + +#define F2(x6, x5, x4, x3, x2, x1, x0) \ + _mm_xor_si128( \ + _mm_and_si128( x2, \ + _mm_xor_si128( _mm_andnot_si128( x3, x1 ), \ + _mm_xor_si128( _mm_and_si128( x4, x5 ), \ + _mm_xor_si128( x6, x0 ) ) ) ), \ + _mm_xor_si128( \ + _mm_and_si128( x4, _mm_xor_si128( x1, x5 ) ), \ + _mm_xor_si128( _mm_and_si128( x3, x5 ), x0 ) ) ) \ + +#define F3(x6, x5, x4, x3, x2, x1, x0) \ + _mm_xor_si128( \ + _mm_and_si128( x3, \ + _mm_xor_si128( _mm_and_si128( x1, x2 ), \ + _mm_xor_si128( x6, x0 ) ) ), \ + _mm_xor_si128( _mm_xor_si128(_mm_and_si128( x1, x4 ), \ + _mm_and_si128( x2, x5 ) ), x0 ) ) + +#define F4(x6, x5, x4, x3, x2, x1, x0) \ + _mm_xor_si128( \ + _mm_xor_si128( \ + _mm_and_si128( x3, \ + _mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x2 ), \ + _mm_or_si128( x4, x6 ) ), x5 ) ), \ + _mm_and_si128( x4, \ + _mm_xor_si128( _mm_xor_si128( _mm_and_si128( mm_not(x2), x5 ), \ + _mm_xor_si128( x1, x6 ) ), x0 ) ) ), \ + _mm_xor_si128( _mm_and_si128( x2, x6 ), x0 ) ) + + +#define F5(x6, x5, x4, x3, x2, x1, x0) \ + _mm_xor_si128( \ + _mm_and_si128( x0, \ + mm_not( _mm_xor_si128( \ + _mm_and_si128( _mm_and_si128( x1, x2 ), x3 ), x5 ) ) ), \ + _mm_xor_si128( _mm_xor_si128( _mm_and_si128( x1, x4 ), \ + _mm_and_si128( x2, x5 ) ), \ + _mm_and_si128( x3, x6 ) ) ) + +/* + * The macros below integrate the phi() permutations, depending on the + * pass and the total number of passes. 
+ */ + +#define FP3_1(x6, x5, x4, x3, x2, x1, x0) \ + F1(x1, x0, x3, x5, x6, x2, x4) +#define FP3_2(x6, x5, x4, x3, x2, x1, x0) \ + F2(x4, x2, x1, x0, x5, x3, x6) +#define FP3_3(x6, x5, x4, x3, x2, x1, x0) \ + F3(x6, x1, x2, x3, x4, x5, x0) + +#define FP4_1(x6, x5, x4, x3, x2, x1, x0) \ + F1(x2, x6, x1, x4, x5, x3, x0) +#define FP4_2(x6, x5, x4, x3, x2, x1, x0) \ + F2(x3, x5, x2, x0, x1, x6, x4) +#define FP4_3(x6, x5, x4, x3, x2, x1, x0) \ + F3(x1, x4, x3, x6, x0, x2, x5) +#define FP4_4(x6, x5, x4, x3, x2, x1, x0) \ + F4(x6, x4, x0, x5, x2, x1, x3) + +#define FP5_1(x6, x5, x4, x3, x2, x1, x0) \ + F1(x3, x4, x1, x0, x5, x2, x6) +#define FP5_2(x6, x5, x4, x3, x2, x1, x0) \ + F2(x6, x2, x1, x0, x3, x4, x5) +#define FP5_3(x6, x5, x4, x3, x2, x1, x0) \ + F3(x2, x6, x0, x4, x3, x1, x5) +#define FP5_4(x6, x5, x4, x3, x2, x1, x0) \ + F4(x1, x5, x3, x2, x0, x4, x6) +#define FP5_5(x6, x5, x4, x3, x2, x1, x0) \ + F5(x2, x5, x0, x6, x4, x3, x1) + +/* + * One step, for "n" passes, pass number "p" (1 <= p <= n), using + * input word number "w" and step constant "c". + */ +#define STEP(n, p, x7, x6, x5, x4, x3, x2, x1, x0, w, c) \ +do { \ + __m128i t = FP ## n ## _ ## p(x6, x5, x4, x3, x2, x1, x0); \ + x7 = _mm_add_epi32( _mm_add_epi32( mm_rotr_32( t, 7 ), \ + mm_rotr_32( x7, 11 ) ), \ + _mm_add_epi32( w, _mm_set1_epi32( c ) ) ); \ +} while (0) + +/* + * PASSy(n, in) computes pass number "y", for a total of "n", using the + * one-argument macro "in" to access input words. Current state is assumed + * to be held in variables "s0" to "s7". + */ + +//#if SPH_SMALL_FOOTPRINT_HAVAL + +#define PASS1(n, in) do { \ + unsigned pass_count; \ + for (pass_count = 0; pass_count < 32; pass_count += 8) { \ + STEP(n, 1, s7, s6, s5, s4, s3, s2, s1, s0, \ + in(pass_count + 0), SPH_C32(0x00000000)); \ + STEP(n, 1, s6, s5, s4, s3, s2, s1, s0, s7, \ + in(pass_count + 1), SPH_C32(0x00000000)); \ + STEP(n, 1, s5, s4, s3, s2, s1, s0, s7, s6, \ + in(pass_count + 2), SPH_C32(0x00000000)); \ + STEP(n, 1, s4, s3, s2, s1, s0, s7, s6, s5, \ + in(pass_count + 3), SPH_C32(0x00000000)); \ + STEP(n, 1, s3, s2, s1, s0, s7, s6, s5, s4, \ + in(pass_count + 4), SPH_C32(0x00000000)); \ + STEP(n, 1, s2, s1, s0, s7, s6, s5, s4, s3, \ + in(pass_count + 5), SPH_C32(0x00000000)); \ + STEP(n, 1, s1, s0, s7, s6, s5, s4, s3, s2, \ + in(pass_count + 6), SPH_C32(0x00000000)); \ + STEP(n, 1, s0, s7, s6, s5, s4, s3, s2, s1, \ + in(pass_count + 7), SPH_C32(0x00000000)); \ + } \ + } while (0) + +#define PASSG(p, n, in) do { \ + unsigned pass_count; \ + for (pass_count = 0; pass_count < 32; pass_count += 8) { \ + STEP(n, p, s7, s6, s5, s4, s3, s2, s1, s0, \ + in(MP ## p[pass_count + 0]), \ + RK ## p[pass_count + 0]); \ + STEP(n, p, s6, s5, s4, s3, s2, s1, s0, s7, \ + in(MP ## p[pass_count + 1]), \ + RK ## p[pass_count + 1]); \ + STEP(n, p, s5, s4, s3, s2, s1, s0, s7, s6, \ + in(MP ## p[pass_count + 2]), \ + RK ## p[pass_count + 2]); \ + STEP(n, p, s4, s3, s2, s1, s0, s7, s6, s5, \ + in(MP ## p[pass_count + 3]), \ + RK ## p[pass_count + 3]); \ + STEP(n, p, s3, s2, s1, s0, s7, s6, s5, s4, \ + in(MP ## p[pass_count + 4]), \ + RK ## p[pass_count + 4]); \ + STEP(n, p, s2, s1, s0, s7, s6, s5, s4, s3, \ + in(MP ## p[pass_count + 5]), \ + RK ## p[pass_count + 5]); \ + STEP(n, p, s1, s0, s7, s6, s5, s4, s3, s2, \ + in(MP ## p[pass_count + 6]), \ + RK ## p[pass_count + 6]); \ + STEP(n, p, s0, s7, s6, s5, s4, s3, s2, s1, \ + in(MP ## p[pass_count + 7]), \ + RK ## p[pass_count + 7]); \ + } \ + } while (0) + +#define PASS2(n, in) PASSG(2, n, in) +#define PASS3(n, in) 
PASSG(3, n, in) +#define PASS4(n, in) PASSG(4, n, in) +#define PASS5(n, in) PASSG(5, n, in) + +static const unsigned MP2[32] = { + 5, 14, 26, 18, 11, 28, 7, 16, + 0, 23, 20, 22, 1, 10, 4, 8, + 30, 3, 21, 9, 17, 24, 29, 6, + 19, 12, 15, 13, 2, 25, 31, 27 +}; + +static const unsigned MP3[32] = { + 19, 9, 4, 20, 28, 17, 8, 22, + 29, 14, 25, 12, 24, 30, 16, 26, + 31, 15, 7, 3, 1, 0, 18, 27, + 13, 6, 21, 10, 23, 11, 5, 2 +}; + +static const unsigned MP4[32] = { + 24, 4, 0, 14, 2, 7, 28, 23, + 26, 6, 30, 20, 18, 25, 19, 3, + 22, 11, 31, 21, 8, 27, 12, 9, + 1, 29, 5, 15, 17, 10, 16, 13 +}; + +static const unsigned MP5[32] = { + 27, 3, 21, 26, 17, 11, 20, 29, + 19, 0, 12, 7, 13, 8, 31, 10, + 5, 9, 14, 30, 18, 6, 28, 24, + 2, 23, 16, 22, 4, 1, 25, 15 +}; + +static const sph_u32 RK2[32] = { + SPH_C32(0x452821E6), SPH_C32(0x38D01377), + SPH_C32(0xBE5466CF), SPH_C32(0x34E90C6C), + SPH_C32(0xC0AC29B7), SPH_C32(0xC97C50DD), + SPH_C32(0x3F84D5B5), SPH_C32(0xB5470917), + SPH_C32(0x9216D5D9), SPH_C32(0x8979FB1B), + SPH_C32(0xD1310BA6), SPH_C32(0x98DFB5AC), + SPH_C32(0x2FFD72DB), SPH_C32(0xD01ADFB7), + SPH_C32(0xB8E1AFED), SPH_C32(0x6A267E96), + SPH_C32(0xBA7C9045), SPH_C32(0xF12C7F99), + SPH_C32(0x24A19947), SPH_C32(0xB3916CF7), + SPH_C32(0x0801F2E2), SPH_C32(0x858EFC16), + SPH_C32(0x636920D8), SPH_C32(0x71574E69), + SPH_C32(0xA458FEA3), SPH_C32(0xF4933D7E), + SPH_C32(0x0D95748F), SPH_C32(0x728EB658), + SPH_C32(0x718BCD58), SPH_C32(0x82154AEE), + SPH_C32(0x7B54A41D), SPH_C32(0xC25A59B5) +}; + +static const sph_u32 RK3[32] = { + SPH_C32(0x9C30D539), SPH_C32(0x2AF26013), + SPH_C32(0xC5D1B023), SPH_C32(0x286085F0), + SPH_C32(0xCA417918), SPH_C32(0xB8DB38EF), + SPH_C32(0x8E79DCB0), SPH_C32(0x603A180E), + SPH_C32(0x6C9E0E8B), SPH_C32(0xB01E8A3E), + SPH_C32(0xD71577C1), SPH_C32(0xBD314B27), + SPH_C32(0x78AF2FDA), SPH_C32(0x55605C60), + SPH_C32(0xE65525F3), SPH_C32(0xAA55AB94), + SPH_C32(0x57489862), SPH_C32(0x63E81440), + SPH_C32(0x55CA396A), SPH_C32(0x2AAB10B6), + SPH_C32(0xB4CC5C34), SPH_C32(0x1141E8CE), + SPH_C32(0xA15486AF), SPH_C32(0x7C72E993), + SPH_C32(0xB3EE1411), SPH_C32(0x636FBC2A), + SPH_C32(0x2BA9C55D), SPH_C32(0x741831F6), + SPH_C32(0xCE5C3E16), SPH_C32(0x9B87931E), + SPH_C32(0xAFD6BA33), SPH_C32(0x6C24CF5C) +}; + +static const sph_u32 RK4[32] = { + SPH_C32(0x7A325381), SPH_C32(0x28958677), + SPH_C32(0x3B8F4898), SPH_C32(0x6B4BB9AF), + SPH_C32(0xC4BFE81B), SPH_C32(0x66282193), + SPH_C32(0x61D809CC), SPH_C32(0xFB21A991), + SPH_C32(0x487CAC60), SPH_C32(0x5DEC8032), + SPH_C32(0xEF845D5D), SPH_C32(0xE98575B1), + SPH_C32(0xDC262302), SPH_C32(0xEB651B88), + SPH_C32(0x23893E81), SPH_C32(0xD396ACC5), + SPH_C32(0x0F6D6FF3), SPH_C32(0x83F44239), + SPH_C32(0x2E0B4482), SPH_C32(0xA4842004), + SPH_C32(0x69C8F04A), SPH_C32(0x9E1F9B5E), + SPH_C32(0x21C66842), SPH_C32(0xF6E96C9A), + SPH_C32(0x670C9C61), SPH_C32(0xABD388F0), + SPH_C32(0x6A51A0D2), SPH_C32(0xD8542F68), + SPH_C32(0x960FA728), SPH_C32(0xAB5133A3), + SPH_C32(0x6EEF0B6C), SPH_C32(0x137A3BE4) +}; + +static const sph_u32 RK5[32] = { + SPH_C32(0xBA3BF050), SPH_C32(0x7EFB2A98), + SPH_C32(0xA1F1651D), SPH_C32(0x39AF0176), + SPH_C32(0x66CA593E), SPH_C32(0x82430E88), + SPH_C32(0x8CEE8619), SPH_C32(0x456F9FB4), + SPH_C32(0x7D84A5C3), SPH_C32(0x3B8B5EBE), + SPH_C32(0xE06F75D8), SPH_C32(0x85C12073), + SPH_C32(0x401A449F), SPH_C32(0x56C16AA6), + SPH_C32(0x4ED3AA62), SPH_C32(0x363F7706), + SPH_C32(0x1BFEDF72), SPH_C32(0x429B023D), + SPH_C32(0x37D0D724), SPH_C32(0xD00A1248), + SPH_C32(0xDB0FEAD3), SPH_C32(0x49F1C09B), + SPH_C32(0x075372C9), SPH_C32(0x80991B7B), + 
SPH_C32(0x25D479D8), SPH_C32(0xF6E8DEF7), + SPH_C32(0xE3FE501A), SPH_C32(0xB6794C3B), + SPH_C32(0x976CE0BD), SPH_C32(0x04C006BA), + SPH_C32(0xC1A94FB6), SPH_C32(0x409F60C4) +}; + +#define SAVE_STATE \ + __m128i u0, u1, u2, u3, u4, u5, u6, u7; \ + do { \ + u0 = s0; \ + u1 = s1; \ + u2 = s2; \ + u3 = s3; \ + u4 = s4; \ + u5 = s5; \ + u6 = s6; \ + u7 = s7; \ + } while (0) + +#define UPDATE_STATE \ +do { \ + s0 = _mm_add_epi32( s0, u0 ); \ + s1 = _mm_add_epi32( s1, u1 ); \ + s2 = _mm_add_epi32( s2, u2 ); \ + s3 = _mm_add_epi32( s3, u3 ); \ + s4 = _mm_add_epi32( s4, u4 ); \ + s5 = _mm_add_epi32( s5, u5 ); \ + s6 = _mm_add_epi32( s6, u6 ); \ + s7 = _mm_add_epi32( s7, u7 ); \ +} while (0) + +/* + * COREn(in) performs the core HAVAL computation for "n" passes, using + * the one-argument macro "in" to access the input words. Running state + * is held in variable "s0" to "s7". + */ +/* +#define CORE3(in) do { \ + SAVE_STATE; \ + PASS1(3, in); \ + PASS2(3, in); \ + PASS3(3, in); \ + UPDATE_STATE; \ + } while (0) + +#define CORE4(in) do { \ + SAVE_STATE; \ + PASS1(4, in); \ + PASS2(4, in); \ + PASS3(4, in); \ + PASS4(4, in); \ + UPDATE_STATE; \ + } while (0) +*/ +#define CORE5(in) do { \ + SAVE_STATE; \ + PASS1(5, in); \ + PASS2(5, in); \ + PASS3(5, in); \ + PASS4(5, in); \ + PASS5(5, in); \ + UPDATE_STATE; \ + } while (0) + +/* + * DSTATE declares the state variables "s0" to "s7". + */ +#define DSTATE __m128i s0, s1, s2, s3, s4, s5, s6, s7 + +/* + * RSTATE fills the state variables from the context "sc". + */ +#define RSTATE \ +do { \ + s0 = sc->s0; \ + s1 = sc->s1; \ + s2 = sc->s2; \ + s3 = sc->s3; \ + s4 = sc->s4; \ + s5 = sc->s5; \ + s6 = sc->s6; \ + s7 = sc->s7; \ +} while (0) + +/* + * WSTATE updates the context "sc" from the state variables. + */ +#define WSTATE \ +do { \ + sc->s0 = s0; \ + sc->s1 = s1; \ + sc->s2 = s2; \ + sc->s3 = s3; \ + sc->s4 = s4; \ + sc->s5 = s5; \ + sc->s6 = s6; \ + sc->s7 = s7; \ +} while (0) + +/* + * Initialize a context. "olen" is the output length, in 32-bit words + * (between 4 and 8, inclusive). "passes" is the number of passes + * (3, 4 or 5). + */ +static void +haval_4way_init( haval_4way_context *sc, unsigned olen, unsigned passes ) +{ + sc->s0 = _mm_set1_epi32( 0x243F6A88UL ); + sc->s1 = _mm_set1_epi32( 0x85A308D3UL ); + sc->s2 = _mm_set1_epi32( 0x13198A2EUL ); + sc->s3 = _mm_set1_epi32( 0x03707344UL ); + sc->s4 = _mm_set1_epi32( 0xA4093822UL ); + sc->s5 = _mm_set1_epi32( 0x299F31D0UL ); + sc->s6 = _mm_set1_epi32( 0x082EFA98UL ); + sc->s7 = _mm_set1_epi32( 0xEC4E6C89UL ); + sc->olen = olen; + sc->passes = passes; + sc->count_high = 0; + sc->count_low = 0; + +} + +#define IN_PREPARE(indata) const __m128i *const load_ptr = (indata) + +#define INW(i) load_ptr[ i ] + +/* + * Write out HAVAL output. The output length is tailored to the requested + * length. + */ +static void +haval_4way_out( haval_4way_context *sc, void *dst ) +{ + __m128i *buf = (__m128i*)dst; + DSTATE; + RSTATE; + + buf[0] = s0; + buf[1] = s1; + buf[2] = s2; + buf[3] = s3; + buf[4] = s4; + buf[5] = s5; + buf[6] = s6; + buf[7] = s7; +} + +/* + * The main core functions inline the code with the COREx() macros. We + * use a helper file, included three times, which avoids code copying. 
+ */ +/* +#undef PASSES +#define PASSES 3 +#include "haval-helper.c" + +#undef PASSES +#define PASSES 4 +#include "haval-helper.c" +*/ + +#undef PASSES +#define PASSES 5 +#include "haval-4way-helper.c" + +/* ====================================================================== */ + +#define API(xxx, y) \ +void \ +haval ## xxx ## _ ## y ## _4way_init(void *cc) \ +{ \ + haval_4way_init(cc, xxx >> 5, y); \ +} \ + \ +void \ +haval ## xxx ## _ ## y ## _4way (void *cc, const void *data, size_t len) \ +{ \ + haval ## y ## _4way(cc, data, len); \ +} \ + \ +void \ +haval ## xxx ## _ ## y ## _4way_close(void *cc, void *dst) \ +{ \ + haval ## y ## _4way_close(cc, dst); \ +} \ + +API(256, 5) + +#define RVAL \ +do { \ + s0 = val[0]; \ + s1 = val[1]; \ + s2 = val[2]; \ + s3 = val[3]; \ + s4 = val[4]; \ + s5 = val[5]; \ + s6 = val[6]; \ + s7 = val[7]; \ +} while (0) + +#define WVAL \ +do { \ + val[0] = s0; \ + val[1] = s1; \ + val[2] = s2; \ + val[3] = s3; \ + val[4] = s4; \ + val[5] = s5; \ + val[6] = s6; \ + val[7] = s7; \ +} while (0) + +#define INMSG(i) msg[i] + +#ifdef __cplusplus +} +#endif +#endif diff --git a/algo/haval/haval-hash-4way.h b/algo/haval/haval-hash-4way.h new file mode 100644 index 00000000..8cc30697 --- /dev/null +++ b/algo/haval/haval-hash-4way.h @@ -0,0 +1,95 @@ +/* $Id: sph_haval.h 218 2010-06-08 17:06:34Z tp $ */ +/** + * HAVAL interface. + * + * HAVAL is actually a family of 15 hash functions, depending on whether + * the internal computation uses 3, 4 or 5 passes, and on the output + * length, which is 128, 160, 192, 224 or 256 bits. This implementation + * provides interface functions for all 15, which internally map to + * three cores (depending on the number of passes). Note that output + * lengths other than 256 bits are not obtained by a simple truncation + * of a longer result; the requested length is encoded within the + * padding data. + * + * HAVAL was published in: Yuliang Zheng, Josef Pieprzyk and Jennifer + * Seberry: "HAVAL -- a one-way hashing algorithm with variable length + * of output", Advances in Cryptology -- AUSCRYPT'92, Lecture Notes in + * Computer Science, Vol.718, pp.83-104, Springer-Verlag, 1993. + * + * This paper, and a reference implementation, are available on the + * Calyptix web site: http://labs.calyptix.com/haval.php + * + * The HAVAL reference paper is quite unclear on the data encoding + * details, i.e. endianness (both byte order within a 32-bit word, and + * word order within a message block). This implementation has been + * made compatible with the reference implementation referenced above. + * + * @warning A collision for HAVAL-128/3 (HAVAL with three passes and + * 128-bit output) has been published; this function is thus considered + * as cryptographically broken. The status for other variants is unclear; + * use only with care. 
+ * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_haval.h + * @author Thomas Pornin + */ + +#ifndef HAVAL_HASH_4WAY_H__ +#define HAVAL_HASH_4WAY_H__ + +#if defined(__AVX__) + +#ifdef __cplusplus +extern "C"{ +#endif + +#include +#include "algo/sha/sph_types.h" +#include "avxdefs.h" + +#define SPH_SIZE_haval256_5 256 + +typedef struct { + __m128i buf[32]; + __m128i s0, s1, s2, s3, s4, s5, s6, s7; + unsigned olen, passes; + sph_u32 count_high, count_low; +} haval_4way_context; + +typedef haval_4way_context haval256_5_4way_context; + +void haval256_5_4way_init( void *cc ); + +void haval256_5_4way( void *cc, const void *data, size_t len ); + +void haval256_5_4way_close( void *cc, void *dst ); + +#ifdef __cplusplus +} +#endif +#endif +#endif diff --git a/algo/jh/jha-gate.c b/algo/jh/jha-gate.c index 05d6fbdd..ca3d4fab 100644 --- a/algo/jh/jha-gate.c +++ b/algo/jh/jha-gate.c @@ -5,14 +5,13 @@ bool register_jha_algo( algo_gate_t* gate ) { #if defined (JHA_4WAY) four_way_not_tested(); - gate->optimizations = SSE2_OPT | AES_OPT | FOUR_WAY_OPT; gate->scanhash = (void*)&scanhash_jha_4way; gate->hash = (void*)&jha_hash_4way; #else - gate->optimizations = SSE2_OPT | AES_OPT | FOUR_WAY_OPT; gate->scanhash = (void*)&scanhash_jha; gate->hash = (void*)&jha_hash; #endif + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; gate->set_target = (void*)&scrypt_set_target; return true; }; diff --git a/algo/jh/jha-gate.h b/algo/jh/jha-gate.h index cb563b69..f772d8dd 100644 --- a/algo/jh/jha-gate.h +++ b/algo/jh/jha-gate.h @@ -5,7 +5,7 @@ #include -#if defined(FOUR_WAY) && defined(__AVX2__) && !defined(NO_AES_NI) +#if defined(__AVX2__) && defined(__AES__) #define JHA_4WAY #endif diff --git a/algo/keccak/keccak-gate.c b/algo/keccak/keccak-gate.c index 8773014c..215b0e9a 100644 --- a/algo/keccak/keccak-gate.c +++ b/algo/keccak/keccak-gate.c @@ -9,7 +9,7 @@ int64_t keccak_get_max64() { return 0x7ffffLL; } bool register_keccak_algo( algo_gate_t* gate ) { - gate->optimizations = FOUR_WAY_OPT; + gate->optimizations = AVX2_OPT; gate->gen_merkle_root = (void*)&SHA256_gen_merkle_root; gate->set_target = (void*)&keccak_set_target; gate->get_max64 = (void*)&keccak_get_max64; @@ -30,7 +30,7 @@ void keccakc_set_target( struct work* work, double job_diff ) bool 
register_keccakc_algo( algo_gate_t* gate ) { - gate->optimizations = FOUR_WAY_OPT; + gate->optimizations = AVX2_OPT; gate->gen_merkle_root = (void*)&sha256d_gen_merkle_root; gate->set_target = (void*)&keccakc_set_target; gate->get_max64 = (void*)&keccak_get_max64; diff --git a/algo/keccak/keccak-gate.h b/algo/keccak/keccak-gate.h index f49a4204..bdc4164a 100644 --- a/algo/keccak/keccak-gate.h +++ b/algo/keccak/keccak-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(FOUR_WAY) && defined(__AVX2__) +#if defined(__AVX2__) #define KECCAK_4WAY #endif diff --git a/algo/lyra2/lyra2h-gate.c b/algo/lyra2/lyra2h-gate.c index 4aaca82a..0d280e8f 100644 --- a/algo/lyra2/lyra2h-gate.c +++ b/algo/lyra2/lyra2h-gate.c @@ -17,7 +17,7 @@ bool register_lyra2h_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_lyra2h; gate->hash = (void*)&lyra2h_hash; #endif - gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT; + gate->optimizations = AVX_OPT | AVX2_OPT; gate->get_max64 = (void*)&get_max64_0xffffLL; gate->set_target = (void*)&lyra2h_set_target; return true; diff --git a/algo/lyra2/lyra2h-gate.h b/algo/lyra2/lyra2h-gate.h index f51c3bf7..984b4bcb 100644 --- a/algo/lyra2/lyra2h-gate.h +++ b/algo/lyra2/lyra2h-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(HASH_4WAY) +#if defined(__AVX2__) #define LYRA2H_4WAY #endif diff --git a/algo/lyra2/lyra2rev2-4way.c b/algo/lyra2/lyra2rev2-4way.c index 95113e7b..5e116b43 100644 --- a/algo/lyra2/lyra2rev2-4way.c +++ b/algo/lyra2/lyra2rev2-4way.c @@ -1,7 +1,7 @@ #include "lyra2rev2-gate.h" #include -#ifdef __AVX2__ +#if defined (__AVX2__) #include "algo/blake/blake-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" @@ -9,7 +9,7 @@ #include "algo/bmw/bmw-hash-4way.h" #include "algo/cubehash/sph_cubehash.h" -#include "algo/bmw/sph_bmw.h" +//#include "algo/bmw/sph_bmw.h" #include "algo/cubehash/sse2/cubehash_sse2.h" typedef struct { @@ -17,8 +17,8 @@ typedef struct { keccak256_4way_context keccak; cubehashParam cube; skein256_4way_context skein; - sph_bmw256_context bmw; - + bmw256_4way_context bmw; +// sph_bmw256_context bmw; } lyra2v2_4way_ctx_holder; static lyra2v2_4way_ctx_holder l2v2_4way_ctx; @@ -29,7 +29,8 @@ void init_lyra2rev2_4way_ctx() keccak256_4way_init( &l2v2_4way_ctx.keccak ); cubehashInit( &l2v2_4way_ctx.cube, 256, 16, 32 ); skein256_4way_init( &l2v2_4way_ctx.skein ); - sph_bmw256_init( &l2v2_4way_ctx.bmw ); + bmw256_4way_init( &l2v2_4way_ctx.bmw ); +// sph_bmw256_init( &l2v2_4way_ctx.bmw ); } void lyra2rev2_4way_hash( void *state, const void *input ) @@ -80,23 +81,26 @@ void lyra2rev2_4way_hash( void *state, const void *input ) cubehashUpdateDigest( &ctx.cube, (byte*) hash3, (const byte*) hash3, 32 ); - sph_bmw256( &ctx.bmw, hash0, 32 ); - sph_bmw256_close( &ctx.bmw, hash0 ); - memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw ); - sph_bmw256( &ctx.bmw, hash1, 32 ); - sph_bmw256_close( &ctx.bmw, hash1 ); - memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw ); - sph_bmw256( &ctx.bmw, hash2, 32 ); - sph_bmw256_close( &ctx.bmw, hash2 ); - memcpy( &ctx.bmw, &l2v2_4way_ctx.bmw, sizeof ctx.bmw ); - sph_bmw256( &ctx.bmw, hash3, 32 ); - sph_bmw256_close( &ctx.bmw, hash3 ); - - - memcpy( state, hash0, 32 ); - memcpy( state+32, hash1, 32 ); - memcpy( state+64, hash2, 32 ); - memcpy( state+96, hash3, 32 ); + // BMW256 4way has a lane corruption problem, only lanes 0 & 2 produce + // good hash. 
As a result this ugly workaround of running bmw256-4way + // twice with data shuffled to get all 4 lanes of good hash. + // The hash is then shuffled back into the appropriate lanes for output. + // Not as fast but still faster than using sph serially. + + // shift lane 1 data to lane 2. + mm_interleave_4x32( vhash, hash0, hash0, hash1, hash1, 256 ); + bmw256_4way( &ctx.bmw, vhash, 32 ); + bmw256_4way_close( &ctx.bmw, vhash ); + uint32_t trash[8] __attribute__ ((aligned (32))); + // extract lane 0 as usual and lane2 containing lane 1 hash + mm_deinterleave_4x32( state, trash, state+32, trash, vhash, 256 ); + // shift lane2 data to lane 0 and lane 3 data to lane 2 + mm_interleave_4x32( vhash, hash2, hash2, hash3, hash3, 256 ); + bmw256_4way_init( &ctx.bmw ); + bmw256_4way( &ctx.bmw, vhash, 32 ); + bmw256_4way_close( &ctx.bmw, vhash ); + // extract lane 2 hash from lane 0 and lane 3 hash from lane 2. + mm_deinterleave_4x32( state+64, trash, state+96, trash, vhash, 256 ); } int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce, @@ -140,6 +144,7 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce, if ( hash[7] <= Htarg && fulltest( hash, ptarget ) ) { +//printf("found0\n"); found[0] = true; num_found++; nonces[0] = pdata[19] = n; @@ -147,6 +152,7 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce, } if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) ) { +//printf("found1\n"); found[1] = true; num_found++; nonces[1] = n+1; @@ -154,6 +160,7 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce, } if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) ) { +//printf("found2\n"); found[2] = true; num_found++; nonces[2] = n+2; @@ -161,6 +168,7 @@ int scanhash_lyra2rev2_4way( int thr_id, struct work *work, uint32_t max_nonce, } if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) ) { +//printf("found3\n"); found[3] = true; num_found++; nonces[3] = n+3; diff --git a/algo/lyra2/lyra2rev2-gate.c b/algo/lyra2/lyra2rev2-gate.c index a07b7211..5174c259 100644 --- a/algo/lyra2/lyra2rev2-gate.c +++ b/algo/lyra2/lyra2rev2-gate.c @@ -29,7 +29,7 @@ bool register_lyra2rev2_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_lyra2rev2; gate->hash = (void*)&lyra2rev2_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; gate->miner_thread_init = (void*)&lyra2rev2_thread_init; gate->set_target = (void*)&lyra2rev2_set_target; return true; diff --git a/algo/lyra2/lyra2rev2-gate.h b/algo/lyra2/lyra2rev2-gate.h index 8af82909..fb7251c2 100644 --- a/algo/lyra2/lyra2rev2-gate.h +++ b/algo/lyra2/lyra2rev2-gate.h @@ -5,7 +5,7 @@ #include #include "lyra2.h" -#if defined(HASH_4WAY) +#if defined(__AVX2__) #define LYRA2REV2_4WAY #endif diff --git a/algo/lyra2/lyra2z-gate.c b/algo/lyra2/lyra2z-gate.c index 67a09518..58bc4003 100644 --- a/algo/lyra2/lyra2z-gate.c +++ b/algo/lyra2/lyra2z-gate.c @@ -17,7 +17,7 @@ bool register_lyra2z_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_lyra2z; gate->hash = (void*)&lyra2z_hash; #endif - gate->optimizations = AVX_OPT | AVX2_OPT | FOUR_WAY_OPT; + gate->optimizations = AVX_OPT | AVX2_OPT; gate->get_max64 = (void*)&get_max64_0xffffLL; gate->set_target = (void*)&lyra2z_set_target; return true; diff --git a/algo/lyra2/lyra2z-gate.h b/algo/lyra2/lyra2z-gate.h index be9d4b86..68357f3c 100644 --- a/algo/lyra2/lyra2z-gate.h +++ 
b/algo/lyra2/lyra2z-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(HASH_4WAY) +#if defined(__AVX2__) #define LYRA2Z_4WAY #endif diff --git a/algo/nist5/nist5-gate.c b/algo/nist5/nist5-gate.c index 433a934b..7cc69f64 100644 --- a/algo/nist5/nist5-gate.c +++ b/algo/nist5/nist5-gate.c @@ -2,7 +2,7 @@ bool register_nist5_algo( algo_gate_t* gate ) { - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; #if defined (NIST5_4WAY) gate->scanhash = (void*)&scanhash_nist5_4way; gate->hash = (void*)&nist5hash_4way; diff --git a/algo/nist5/nist5-gate.h b/algo/nist5/nist5-gate.h index 4cf5741d..a314da33 100644 --- a/algo/nist5/nist5-gate.h +++ b/algo/nist5/nist5-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(HASH_4WAY) && defined(__AES__) +#if defined(__AVX2__) && defined(__AES__) #define NIST5_4WAY #endif diff --git a/algo/quark/anime-4way.c b/algo/quark/anime-4way.c new file mode 100644 index 00000000..e6678bc2 --- /dev/null +++ b/algo/quark/anime-4way.c @@ -0,0 +1,231 @@ +#include "cpuminer-config.h" +#include "anime-gate.h" + +#if defined (ANIME_4WAY) + +#include +#include +#include + +#include "algo/blake/blake-hash-4way.h" +#include "algo/bmw/bmw-hash-4way.h" +#include "algo/skein/skein-hash-4way.h" +#include "algo/jh/jh-hash-4way.h" +#include "algo/keccak/keccak-hash-4way.h" +#include "algo/groestl/aes_ni/hash-groestl.h" + +typedef struct { + blake512_4way_context blake; + bmw512_4way_context bmw; + hashState_groestl groestl; + jh512_4way_context jh; + skein512_4way_context skein; + keccak512_4way_context keccak; +} anime_4way_ctx_holder; + +anime_4way_ctx_holder anime_4way_ctx __attribute__ ((aligned (64))); + +void init_anime_4way_ctx() +{ + blake512_4way_init( &anime_4way_ctx.blake ); + bmw512_4way_init( &anime_4way_ctx.bmw ); + init_groestl( &anime_4way_ctx.groestl, 64 ); + skein512_4way_init( &anime_4way_ctx.skein ); + jh512_4way_init( &anime_4way_ctx.jh ); + keccak512_4way_init( &anime_4way_ctx.keccak ); +} + +void anime_4way_hash( void *state, const void *input ) +{ + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + uint64_t vhash[8*4] __attribute__ ((aligned (64))); + uint64_t vhashA[8*4] __attribute__ ((aligned (64))); + uint64_t vhashB[8*4] __attribute__ ((aligned (64))); + __m256i* vh = (__m256i*)vhash; + __m256i* vhA = (__m256i*)vhashA; + __m256i* vhB = (__m256i*)vhashB; + __m256i vh_mask; + __m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 ); + int i; + anime_4way_ctx_holder ctx; + memcpy( &ctx, &anime_4way_ctx, sizeof(anime_4way_ctx) ); + + bmw512_4way( &ctx.bmw, vhash, 80 ); + bmw512_4way_close( &ctx.bmw, vhash ); + + blake512_4way( &ctx.blake, input, 64 ); + blake512_4way_close( &ctx.blake, vhash ); + + vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), + mm256_zero ); + + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, + (char*)hash0, 512 ); + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, + (char*)hash1, 512 ); + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, + (char*)hash2, 512 ); + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, + (char*)hash3, 512 ); + mm256_interleave_4x64( vhashA, 
hash0, hash1, hash2, hash3, 512 ); + + skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_close( &ctx.skein, vhashB ); + + for ( i = 0; i < 8; i++ ) + vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask ); + + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash0, (char*)hash0, 512 ); + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash1, (char*)hash1, 512 ); + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash2, (char*)hash2, 512 ); + reinit_groestl( &ctx.groestl ); + update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); + mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + + jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_close( &ctx.jh, vhash ); + + vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), + mm256_zero ); + + blake512_4way_init( &ctx.blake ); + blake512_4way( &ctx.blake, vhash, 64 ); + blake512_4way_close( &ctx.blake, vhashA ); + + bmw512_4way_init( &ctx.bmw ); + bmw512_4way( &ctx.bmw, vhash, 64 ); + bmw512_4way_close( &ctx.bmw, vhashB ); + + for ( i = 0; i < 8; i++ ) + vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask ); + + keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_close( &ctx.keccak, vhash ); + + skein512_4way_init( &ctx.skein ); + skein512_4way( &ctx.skein, vhash, 64 ); + skein512_4way_close( &ctx.skein, vhash ); + + vh_mask = _mm256_cmpeq_epi64( _mm256_and_si256( vh[0], bit3_mask ), + mm256_zero ); + + keccak512_4way_init( &ctx.keccak ); + keccak512_4way( &ctx.keccak, vhash, 64 ); + keccak512_4way_close( &ctx.keccak, vhashA ); + + jh512_4way_init( &ctx.jh ); + jh512_4way( &ctx.jh, vhash, 64 ); + jh512_4way_close( &ctx.jh, vhashB ); + + for ( i = 0; i < 8; i++ ) + vh[i] = _mm256_blendv_epi8( vhA[i], vhB[i], vh_mask ); + + mm256_deinterleave_4x64( state, state+32, state+64, state+96, vhash, 256 ); +} + +int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done) +{ + uint32_t hash[4*8] __attribute__ ((aligned (64))); + uint32_t vdata[24*4] __attribute__ ((aligned (64))); + uint32_t endiandata[20] __attribute__((aligned(64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + uint32_t *nonces = work->nonces; + bool *found = work->nfound; + int num_found = 0; + uint32_t *noncep0 = vdata + 73; // 9*8 + 1 + uint32_t *noncep1 = vdata + 75; + uint32_t *noncep2 = vdata + 77; + uint32_t *noncep3 = vdata + 79; + const uint32_t Htarg = ptarget[7]; + uint64_t htmax[] = { + 0, + 0xF, + 0xFF, + 0xFFF, + 0xFFFF, + 0x10000000 + }; + uint32_t masks[] = { + 0xFFFFFFFF, + 0xFFFFFFF0, + 0xFFFFFF00, + 0xFFFFF000, + 0xFFFF0000, + 0 + }; + + swab32_array( endiandata, pdata, 20 ); + + uint64_t *edata = (uint64_t*)endiandata; + mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 ); + + for (int m=0; m < 6; m++) + if (Htarg <= htmax[m]) + { + uint32_t mask = masks[m]; + + do + { + found[0] = found[1] = found[2] = found[3] = false; + be32enc( noncep0, n ); + be32enc( noncep1, n+1 ); + be32enc( noncep2, n+2 ); + be32enc( noncep3, n+3 ); + + anime_4way_hash( hash, vdata ); + pdata[19] = n; + + if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) ) + { + found[0] = true; + num_found++; + nonces[0] = n; + work_set_target_ratio( work, hash ); + } + if ( ( (hash+8)[7] & mask ) == 0 && fulltest( hash+8, ptarget 
) ) + { + found[1] = true; + num_found++; + nonces[1] = n+1; + work_set_target_ratio( work, hash ); + } + if ( ( (hash+16)[7] & mask ) == 0 && fulltest( hash+16, ptarget ) ) + { + found[2] = true; + num_found++; + nonces[2] = n+2; + work_set_target_ratio( work, hash ); + } + if ( ( (hash+24)[7] & mask ) == 0 && fulltest( hash+24, ptarget ) ) + { + found[3] = true; + num_found++; + nonces[3] = n+3; + work_set_target_ratio( work, hash ); + } + n += 4; + } while ( ( num_found == 0 ) && ( n < max_nonce ) + && !work_restart[thr_id].restart ); + break; + } + + *hashes_done = n - first_nonce + 1; + return num_found; +} + +#endif diff --git a/algo/quark/anime-gate.c b/algo/quark/anime-gate.c new file mode 100644 index 00000000..53a06e1d --- /dev/null +++ b/algo/quark/anime-gate.c @@ -0,0 +1,17 @@ +#include "anime-gate.h" + +bool register_anime_algo( algo_gate_t* gate ) +{ +#if defined (ANIME_4WAY) + init_anime_4way_ctx(); + gate->scanhash = (void*)&scanhash_anime_4way; + gate->hash = (void*)&anime_4way_hash; +#else + init_anime_ctx(); + gate->scanhash = (void*)&scanhash_anime; + gate->hash = (void*)&anime_hash; +#endif + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; + return true; +}; + diff --git a/algo/quark/anime-gate.h b/algo/quark/anime-gate.h new file mode 100644 index 00000000..1e6ac07f --- /dev/null +++ b/algo/quark/anime-gate.h @@ -0,0 +1,32 @@ +#ifndef ANIME_GATE_H__ +#define ANIME_GATE_H__ 1 + +#include "algo-gate-api.h" +#include + +#if defined(__AVX2__) && defined(__AES__) + #define ANIME_4WAY +#endif + +bool register_anime_algo( algo_gate_t* gate ); + +#if defined(ANIME_4WAY) + +void anime_4way_hash( void *state, const void *input ); + +int scanhash_anime_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_anime_4way_ctx(); + +#endif + +void anime_hash( void *state, const void *input ); + +int scanhash_anime( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_anime_ctx(); + +#endif + diff --git a/algo/quark/anime.c b/algo/quark/anime.c new file mode 100644 index 00000000..eebb7c2c --- /dev/null +++ b/algo/quark/anime.c @@ -0,0 +1,189 @@ +#include "cpuminer-config.h" +#include "anime-gate.h" +#include +#include +#include +#include "algo/blake/sph_blake.h" +#include "algo/bmw/sph_bmw.h" +#include "algo/skein/sph_skein.h" +#include "algo/jh/sph_jh.h" +#include "algo/keccak/sph_keccak.h" +#ifdef __AES__ + #include "algo/groestl/aes_ni/hash-groestl.h" +#else + #include "algo/groestl/sph_groestl.h" +#endif + +typedef struct { + sph_blake512_context blake; + sph_bmw512_context bmw; +#ifdef __AES__ + hashState_groestl groestl; +#else + sph_groestl512_context groestl; +#endif + sph_jh512_context jh; + sph_skein512_context skein; + sph_keccak512_context keccak; +} anime_ctx_holder; + +anime_ctx_holder anime_ctx __attribute__ ((aligned (64))); + +void init_anime_ctx() +{ + sph_blake512_init( &anime_ctx.blake ); + sph_bmw512_init( &anime_ctx.bmw ); +#ifdef __AES__ + init_groestl( &anime_ctx.groestl, 64 ); +#else + sph_groestl512_init( &anime_ctx.groestl ); +#endif + sph_skein512_init( &anime_ctx.skein ); + sph_jh512_init( &anime_ctx.jh ); + sph_keccak512_init( &anime_ctx.keccak ); +} + +void anime_hash( void *state, const void *input ) +{ + unsigned char hash[128] __attribute__ ((aligned (32))); +/* + uint64_t hash0[8] __attribute__ ((aligned (64))); + uint64_t hash1[8] __attribute__ ((aligned (64))); + uint64_t hash2[8] __attribute__ ((aligned (64))); + uint64_t hash3[8] __attribute__ ((aligned (64))); + 
uint64_t vhash[8*4] __attribute__ ((aligned (64))); + uint64_t vhashA[8*4] __attribute__ ((aligned (64))); + uint64_t vhashB[8*4] __attribute__ ((aligned (64))); + __m256i* vh = (__m256i*)vhash; + __m256i* vhA = (__m256i*)vhashA; + __m256i* vhB = (__m256i*)vhashB; + __m256i vh_mask; + __m256i bit3_mask; bit3_mask = _mm256_set1_epi64x( 8 ); +*/ + uint32_t mask = 8; + anime_ctx_holder ctx; + memcpy( &ctx, &anime_ctx, sizeof(anime_ctx) ); + + sph_bmw512( &ctx.bmw, input, 80 ); + sph_bmw512_close( &ctx.bmw, hash ); + + sph_blake512( &ctx.blake, hash, 64 ); + sph_blake512_close( &ctx.blake, hash ); + + if ( ( hash[0] & mask ) != 0 ) + { +#ifdef __AES__ + update_and_final_groestl( &ctx.groestl, (char*)hash, (char*)hash, 512 ); + reinit_groestl( &ctx.groestl ); +#else + sph_groestl512 ( &ctx.groestl, hash, 64 ); + sph_groestl512_close( &ctx.groestl, hash ); + sph_groestl512_init( &ctx.groestl ); +#endif + } + else + { + sph_skein512( &ctx.skein, hash, 64 ); + sph_skein512_close( &ctx.skein, hash ); + sph_skein512_init( &ctx.skein ); + } + +#ifdef __AES__ + update_and_final_groestl( &ctx.groestl, (char*)hash, (char*)hash, 512 ); +#else + sph_groestl512 ( &ctx.groestl, hash, 64 ); + sph_groestl512_close( &ctx.groestl, hash ); +#endif + + sph_jh512( &ctx.jh, hash, 64 ); + sph_jh512_close( &ctx.jh, hash ); + + if ( ( hash[0] & mask ) != 0 ) + { + sph_blake512_init( &ctx.blake ); + sph_blake512( &ctx.blake, hash, 64 ); + sph_blake512_close( &ctx.blake, hash ); + } + else + { + sph_bmw512_init( &ctx.bmw ); + sph_bmw512( &ctx.bmw, hash, 64 ); + sph_bmw512_close( &ctx.bmw, hash ); + } + + sph_keccak512( &ctx.keccak, hash, 64 ); + sph_keccak512_close( &ctx.keccak, hash ); + + sph_skein512( &ctx.skein, hash, 64 ); + sph_skein512_close( &ctx.skein, hash ); + + if ( ( hash[0] & mask ) != 0 ) + { + sph_keccak512_init( &ctx.keccak ); + sph_keccak512( &ctx.keccak, hash, 64 ); + sph_keccak512_close( &ctx.keccak, hash ); + } + else + { + sph_jh512_init( &ctx.jh ); + sph_jh512( &ctx.jh, hash, 64 ); + sph_jh512_close( &ctx.jh, hash ); + } + + memcpy( state, hash, 32 ); +} + +int scanhash_anime( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done) +{ + uint32_t hash[8] __attribute__ ((aligned (64))); + uint32_t endiandata[20] __attribute__((aligned(64))); + uint32_t *pdata = work->data; + uint32_t *ptarget = work->target; + uint32_t n = pdata[19]; + const uint32_t first_nonce = pdata[19]; + const uint32_t Htarg = ptarget[7]; + uint64_t htmax[] = { + 0, + 0xF, + 0xFF, + 0xFFF, + 0xFFFF, + 0x10000000 + }; + uint32_t masks[] = { + 0xFFFFFFFF, + 0xFFFFFFF0, + 0xFFFFFF00, + 0xFFFFF000, + 0xFFFF0000, + 0 + }; + + swab32_array( endiandata, pdata, 20 ); + + for (int m=0; m < 6; m++) + if (Htarg <= htmax[m]) + { + uint32_t mask = masks[m]; + do + { + be32enc( &endiandata[19], n ); + anime_hash( hash, endiandata ); + pdata[19] = n; + + if ( ( hash[7] & mask ) == 0 && fulltest( hash, ptarget ) ) + { + work_set_target_ratio( work, hash ); + *hashes_done = n - first_nonce + 1; + return true; + } + n++; + } while ( ( n < max_nonce ) && !work_restart[thr_id].restart ); + break; + } + + pdata[19] = n; + return 0; +} + diff --git a/algo/quark/quark-4way.c b/algo/quark/quark-4way.c index 1a92e98b..1e6aecce 100644 --- a/algo/quark/quark-4way.c +++ b/algo/quark/quark-4way.c @@ -1,7 +1,7 @@ #include "cpuminer-config.h" #include "quark-gate.h" -#if defined (__AVX2__) && defined (__AES__) +#if defined (QUARK_4WAY) #include #include diff --git a/algo/quark/quark-gate.c b/algo/quark/quark-gate.c index 
9356cdb4..4d7018ae 100644 --- a/algo/quark/quark-gate.c +++ b/algo/quark/quark-gate.c @@ -11,7 +11,7 @@ bool register_quark_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_quark; gate->hash = (void*)&quark_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; return true; }; diff --git a/algo/quark/quark-gate.h b/algo/quark/quark-gate.h index 676c6b5a..1191d469 100644 --- a/algo/quark/quark-gate.h +++ b/algo/quark/quark-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(HASH_4WAY) && defined(__AES__) +#if defined(__AVX2__) && defined(__AES__) #define QUARK_4WAY #endif diff --git a/algo/sha/md-helper-4way.c b/algo/sha/md-helper-4way.c new file mode 100644 index 00000000..8ffac8e3 --- /dev/null +++ b/algo/sha/md-helper-4way.c @@ -0,0 +1,270 @@ +/* $Id: md_helper.c 216 2010-06-08 09:46:57Z tp $ */ +/* + * This file contains some functions which implement the external data + * handling and padding for Merkle-Damgard hash functions which follow + * the conventions set out by MD4 (little-endian) or SHA-1 (big-endian). + * + * API: this file is meant to be included, not compiled as a stand-alone + * file. Some macros must be defined: + * RFUN name for the round function + * HASH "short name" for the hash function + * BE32 defined for big-endian, 32-bit based (e.g. SHA-1) + * LE32 defined for little-endian, 32-bit based (e.g. MD5) + * BE64 defined for big-endian, 64-bit based (e.g. SHA-512) + * LE64 defined for little-endian, 64-bit based (no example yet) + * PW01 if defined, append 0x01 instead of 0x80 (for Tiger) + * BLEN if defined, length of a message block (in bytes) + * PLW1 if defined, length is defined on one 64-bit word only (for Tiger) + * PLW4 if defined, length is defined on four 64-bit words (for WHIRLPOOL) + * SVAL if defined, reference to the context state information + * + * BLEN is used when a message block is not 16 (32-bit or 64-bit) words: + * this is used for instance for Tiger, which works on 64-bit words but + * uses 512-bit message blocks (eight 64-bit words). PLW1 and PLW4 are + * ignored if 32-bit words are used; if 64-bit words are used and PLW1 is + * set, then only one word (64 bits) will be used to encode the input + * message length (in bits), otherwise two words will be used (as in + * SHA-384 and SHA-512). If 64-bit words are used and PLW4 is defined (but + * not PLW1), four 64-bit words will be used to encode the message length + * (in bits). Note that regardless of those settings, only 64-bit message + * lengths are supported (in bits): messages longer than 2 Exabytes will be + * improperly hashed (this is unlikely to happen soon: 2 Exabytes is about + * 2 million terabytes, which is huge). + * + * If CLOSE_ONLY is defined, then this file defines only the sph_XXX_close() + * function. This is used for Tiger2, which is identical to Tiger except + * when it comes to the padding (Tiger2 uses the standard 0x80 byte instead + * of the 0x01 from original Tiger). + * + * The RFUN function is invoked with two arguments, the first pointing to + * aligned data (as a "const void *"), the second being state information + * from the context structure. By default, this state information is the + * "val" field from the context, and this field is assumed to be an array + * of words ("sph_u32" or "sph_u64", depending on BE32/LE32/BE64/LE64).
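+ * For example, sha2-big-4way.c later in this patch instantiates the
+ * same macro API (through the scalar md_helper.c) with:
+ *
+ *    #define RFUN sha3_round
+ *    #define HASH sha384
+ *    #define BE64 1
+ *    #include "md_helper.c"
+ *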
The "val" field can have any type, except + * for the output encoding which assumes that it is an array of "sph_u32" + * values. By defining NO_OUTPUT, this last step is deactivated; the + * includer code is then responsible for writing out the hash result. When + * NO_OUTPUT is defined, the third parameter to the "close()" function is + * ignored. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#ifdef _MSC_VER +#pragma warning (disable: 4146) +#endif + +#undef SPH_XCAT +#define SPH_XCAT(a, b) SPH_XCAT_(a, b) +#undef SPH_XCAT_ +#define SPH_XCAT_(a, b) a ## b + +#undef SPH_BLEN +#undef SPH_WLEN +#if defined BE64 || defined LE64 +#define SPH_BLEN 128U +#define SPH_WLEN 8U +#else +#define SPH_BLEN 64U +#define SPH_WLEN 4U +#endif + +#ifdef BLEN +#undef SPH_BLEN +#define SPH_BLEN BLEN +#endif + +#undef SPH_MAXPAD +#if defined PLW1 +#define SPH_MAXPAD (SPH_BLEN - SPH_WLEN) +#elif defined PLW4 +#define SPH_MAXPAD (SPH_BLEN - (SPH_WLEN << 2)) +#else +#define SPH_MAXPAD (SPH_BLEN - (SPH_WLEN << 1)) +#endif + +#undef SPH_VAL +#undef SPH_NO_OUTPUT +#ifdef SVAL +#define SPH_VAL SVAL +#define SPH_NO_OUTPUT 1 +#else +#define SPH_VAL sc->val +#endif + +#ifndef CLOSE_ONLY + +#ifdef SPH_UPTR +static void +SPH_XCAT(HASH, _short)( void *cc, const void *data, size_t len ) +#else +void +HASH ( void *cc, const void *data, size_t len ) +#endif +{ + SPH_XCAT( HASH, _context ) *sc; + __m256i *vdata = (__m256i*)data; + size_t ptr; + + sc = cc; + ptr = (unsigned)sc->count & (SPH_BLEN - 1U); + while ( len > 0 ) + { + size_t clen; + clen = SPH_BLEN - ptr; + if ( clen > len ) + clen = len; + memcpy_256( sc->buf + (ptr>>3), vdata, clen>>3 ); + vdata = vdata + (clen>>3); + ptr += clen; + len -= clen; + if ( ptr == SPH_BLEN ) + { + RFUN( sc->buf, SPH_VAL ); + ptr = 0; + } + sc->count += clen; + } +} + +#ifdef SPH_UPTR +void +HASH (void *cc, const void *data, size_t len) +{ + SPH_XCAT(HASH, _context) *sc; + __m256i *vdata = (__m256i*)data; + unsigned ptr; + + if ( len < (2 * SPH_BLEN) ) + { + SPH_XCAT(HASH, _short)(cc, data, len); + return; + } + sc = cc; + ptr = (unsigned)sc->count & (SPH_BLEN - 1U); + if ( ptr > 0 ) + { + unsigned t; + t = SPH_BLEN - ptr; + SPH_XCAT( HASH, _short )( cc, data, t ); + vdata = vdata + (t>>3); + len -= t; + } + SPH_XCAT( 
HASH, _short )( cc, vdata, len ); +} +#endif + +#endif + +/* + * Perform padding and produce result. The context is NOT reinitialized + * by this function. + */ +static void +SPH_XCAT( HASH, _addbits_and_close )(void *cc, unsigned ub, unsigned n, + void *dst, unsigned rnum ) +{ + SPH_XCAT(HASH, _context) *sc; + unsigned ptr, u; + sc = cc; + ptr = (unsigned)sc->count & (SPH_BLEN - 1U); + +#ifdef PW01 + sc->buf[ptr>>3] = _mm256_set1_epi64x( 0x100 >> 8 ); +#else + sc->buf[ptr>>3] = _mm256_set1_epi64x( 0x80 ); +#endif + ptr += 8; + + if ( ptr > SPH_MAXPAD ) + { + memset_zero_256( sc->buf + (ptr>>3), (SPH_BLEN - ptr) >> 3 ); + RFUN( sc->buf, SPH_VAL ); + memset_zero_256( sc->buf, SPH_MAXPAD >> 3 ); + } + else + { + memset_zero_256( sc->buf + (ptr>>3), (SPH_MAXPAD - ptr) >> 3 ); + } +#if defined BE64 +#if defined PLW1 + sc->buf[ SPH_MAXPAD>>3 ] = + mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); +#elif defined PLW4 + memset_zero_256( sc->buf + (SPH_MAXPAD>>3), ( 2 * SPH_WLEN ) >> 3 ); + sc->buf[ (SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ] = + mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); + sc->buf[ (SPH_MAXPAD + 3 * SPH_WLEN ) >> 3 ] = + mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); +#else + sc->buf[ SPH_MAXPAD >> 3 ] = + mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); + sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] = + mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); +#endif // PLW +#else // LE64 +#if defined PLW1 + sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 ); +#elif defined PLW4 + sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 ); + sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] = + _mm256_set1_epi64x( sc->count >> 61 ); + memset_zero_256( sc->buf + ( ( SPH_MAXPAD + 2 * SPH_WLEN ) >> 3 ), + ( 2 * SPH_WLEN ) >> 3 ); +#else + sc->buf[ SPH_MAXPAD >> 3 ] = _mm256_set1_epi64x( sc->count << 3 ); + sc->buf[ ( SPH_MAXPAD + SPH_WLEN ) >> 3 ] = + _mm256_set1_epi64x( sc->count >> 61 ); +#endif // PLW + +#endif // LE64 + + RFUN( sc->buf, SPH_VAL ); + +#ifdef SPH_NO_OUTPUT + (void)dst; + (void)rnum; + (void)u; +#else + for ( u = 0; u < rnum; u ++ ) + { +#if defined BE64 + ((__m256i*)dst)[u] = mm256_byteswap_64( sc->val[u] ); +#else // LE64 + ((__m256i*)dst)[u] = sc->val[u]; +#endif + } +#endif +} + +static void +SPH_XCAT( HASH, _mdclose )( void *cc, void *dst, unsigned rnum ) +{ + SPH_XCAT( HASH, _addbits_and_close )( cc, 0, 0, dst, rnum ); +} diff --git a/algo/sha/sha2-big-4way.c b/algo/sha/sha2-big-4way.c new file mode 100644 index 00000000..8ea292f6 --- /dev/null +++ b/algo/sha/sha2-big-4way.c @@ -0,0 +1,247 @@ +/* $Id: sha2big.c 216 2010-06-08 09:46:57Z tp $ */ +/* + * SHA-384 / SHA-512 implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include + +#include "sph_sha2.h" + +#if SPH_64 + +#define CH(X, Y, Z) ((((Y) ^ (Z)) & (X)) ^ (Z)) +#define MAJ(X, Y, Z) (((X) & (Y)) | (((X) | (Y)) & (Z))) + +#define ROTR64 SPH_ROTR64 + +#define BSG5_0(x) (ROTR64(x, 28) ^ ROTR64(x, 34) ^ ROTR64(x, 39)) +#define BSG5_1(x) (ROTR64(x, 14) ^ ROTR64(x, 18) ^ ROTR64(x, 41)) +#define SSG5_0(x) (ROTR64(x, 1) ^ ROTR64(x, 8) ^ SPH_T64((x) >> 7)) +#define SSG5_1(x) (ROTR64(x, 19) ^ ROTR64(x, 61) ^ SPH_T64((x) >> 6)) + +static const sph_u64 K512[80] = { + SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD), + SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC), + SPH_C64(0x3956C25BF348B538), SPH_C64(0x59F111F1B605D019), + SPH_C64(0x923F82A4AF194F9B), SPH_C64(0xAB1C5ED5DA6D8118), + SPH_C64(0xD807AA98A3030242), SPH_C64(0x12835B0145706FBE), + SPH_C64(0x243185BE4EE4B28C), SPH_C64(0x550C7DC3D5FFB4E2), + SPH_C64(0x72BE5D74F27B896F), SPH_C64(0x80DEB1FE3B1696B1), + SPH_C64(0x9BDC06A725C71235), SPH_C64(0xC19BF174CF692694), + SPH_C64(0xE49B69C19EF14AD2), SPH_C64(0xEFBE4786384F25E3), + SPH_C64(0x0FC19DC68B8CD5B5), SPH_C64(0x240CA1CC77AC9C65), + SPH_C64(0x2DE92C6F592B0275), SPH_C64(0x4A7484AA6EA6E483), + SPH_C64(0x5CB0A9DCBD41FBD4), SPH_C64(0x76F988DA831153B5), + SPH_C64(0x983E5152EE66DFAB), SPH_C64(0xA831C66D2DB43210), + SPH_C64(0xB00327C898FB213F), SPH_C64(0xBF597FC7BEEF0EE4), + SPH_C64(0xC6E00BF33DA88FC2), SPH_C64(0xD5A79147930AA725), + SPH_C64(0x06CA6351E003826F), SPH_C64(0x142929670A0E6E70), + SPH_C64(0x27B70A8546D22FFC), SPH_C64(0x2E1B21385C26C926), + SPH_C64(0x4D2C6DFC5AC42AED), SPH_C64(0x53380D139D95B3DF), + SPH_C64(0x650A73548BAF63DE), SPH_C64(0x766A0ABB3C77B2A8), + SPH_C64(0x81C2C92E47EDAEE6), SPH_C64(0x92722C851482353B), + SPH_C64(0xA2BFE8A14CF10364), SPH_C64(0xA81A664BBC423001), + SPH_C64(0xC24B8B70D0F89791), SPH_C64(0xC76C51A30654BE30), + SPH_C64(0xD192E819D6EF5218), SPH_C64(0xD69906245565A910), + SPH_C64(0xF40E35855771202A), SPH_C64(0x106AA07032BBD1B8), + SPH_C64(0x19A4C116B8D2D0C8), SPH_C64(0x1E376C085141AB53), + SPH_C64(0x2748774CDF8EEB99), SPH_C64(0x34B0BCB5E19B48A8), + SPH_C64(0x391C0CB3C5C95A63), SPH_C64(0x4ED8AA4AE3418ACB), + SPH_C64(0x5B9CCA4F7763E373), SPH_C64(0x682E6FF3D6B2B8A3), + SPH_C64(0x748F82EE5DEFB2FC), SPH_C64(0x78A5636F43172F60), + SPH_C64(0x84C87814A1F0AB72), SPH_C64(0x8CC702081A6439EC), + SPH_C64(0x90BEFFFA23631E28), SPH_C64(0xA4506CEBDE82BDE9), + SPH_C64(0xBEF9A3F7B2C67915), SPH_C64(0xC67178F2E372532B), + SPH_C64(0xCA273ECEEA26619C), SPH_C64(0xD186B8C721C0C207), + SPH_C64(0xEADA7DD6CDE0EB1E), SPH_C64(0xF57D4F7FEE6ED178), + SPH_C64(0x06F067AA72176FBA), SPH_C64(0x0A637DC5A2C898A6), + SPH_C64(0x113F9804BEF90DAE), SPH_C64(0x1B710B35131C471B), + SPH_C64(0x28DB77F523047D84), SPH_C64(0x32CAAB7B40C72493), + SPH_C64(0x3C9EBE0A15C9BEBC), SPH_C64(0x431D67C49C100D4C), + SPH_C64(0x4CC5D4BECB3E42B6), SPH_C64(0x597F299CFC657E2A), + SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817) +}; + +static const sph_u64 H384[8] = 
{ + SPH_C64(0xCBBB9D5DC1059ED8), SPH_C64(0x629A292A367CD507), + SPH_C64(0x9159015A3070DD17), SPH_C64(0x152FECD8F70E5939), + SPH_C64(0x67332667FFC00B31), SPH_C64(0x8EB44A8768581511), + SPH_C64(0xDB0C2E0D64F98FA7), SPH_C64(0x47B5481DBEFA4FA4) +}; + +static const sph_u64 H512[8] = { + SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B), + SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1), + SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F), + SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179) +}; + +/* + * This macro defines the body for a SHA-384 / SHA-512 compression function + * implementation. The "in" parameter should evaluate, when applied to a + * numerical input parameter from 0 to 15, to an expression which yields + * the corresponding input block. The "r" parameter should evaluate to + * an array or pointer expression designating the array of 8 words which + * contains the input and output of the compression function. + * + * SHA-512 is hard for the compiler. If the loop is completely unrolled, + * then the code will be quite huge (possibly more than 100 kB), and the + * performance will be degraded due to cache misses on the code. We + * unroll only eight steps, which avoids all needless copies when + * 64-bit registers are swapped. + */ + +#define SHA3_STEP(A, B, C, D, E, F, G, H, i) do { \ + sph_u64 T1, T2; \ + T1 = SPH_T64(H + BSG5_1(E) + CH(E, F, G) + K512[i] + W[i]); \ + T2 = SPH_T64(BSG5_0(A) + MAJ(A, B, C)); \ + D = SPH_T64(D + T1); \ + H = SPH_T64(T1 + T2); \ + } while (0) + +#define SHA3_ROUND_BODY(in, r) do { \ + int i; \ + sph_u64 A, B, C, D, E, F, G, H; \ + sph_u64 W[80]; \ + \ + for (i = 0; i < 16; i ++) \ + W[i] = in(i); \ + for (i = 16; i < 80; i ++) \ + W[i] = SPH_T64(SSG5_1(W[i - 2]) + W[i - 7] \ + + SSG5_0(W[i - 15]) + W[i - 16]); \ + A = (r)[0]; \ + B = (r)[1]; \ + C = (r)[2]; \ + D = (r)[3]; \ + E = (r)[4]; \ + F = (r)[5]; \ + G = (r)[6]; \ + H = (r)[7]; \ + for (i = 0; i < 80; i += 8) { \ + SHA3_STEP(A, B, C, D, E, F, G, H, i + 0); \ + SHA3_STEP(H, A, B, C, D, E, F, G, i + 1); \ + SHA3_STEP(G, H, A, B, C, D, E, F, i + 2); \ + SHA3_STEP(F, G, H, A, B, C, D, E, i + 3); \ + SHA3_STEP(E, F, G, H, A, B, C, D, i + 4); \ + SHA3_STEP(D, E, F, G, H, A, B, C, i + 5); \ + SHA3_STEP(C, D, E, F, G, H, A, B, i + 6); \ + SHA3_STEP(B, C, D, E, F, G, H, A, i + 7); \ + } \ + (r)[0] = SPH_T64((r)[0] + A); \ + (r)[1] = SPH_T64((r)[1] + B); \ + (r)[2] = SPH_T64((r)[2] + C); \ + (r)[3] = SPH_T64((r)[3] + D); \ + (r)[4] = SPH_T64((r)[4] + E); \ + (r)[5] = SPH_T64((r)[5] + F); \ + (r)[6] = SPH_T64((r)[6] + G); \ + (r)[7] = SPH_T64((r)[7] + H); \ + } while (0) + +/* + * One round of SHA-384 / SHA-512. The data must be aligned for 64-bit access. 
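+ * Note: despite the sph-inherited "sha3_" naming used below, this is the
+ * SHA-2 (SHA-512 family) compression function, not Keccak/SHA-3.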
+ */ +static void +sha3_round(const unsigned char *data, sph_u64 r[8]) +{ +#define SHA3_IN(x) sph_dec64be_aligned(data + (8 * (x))) + SHA3_ROUND_BODY(SHA3_IN, r); +#undef SHA3_IN +} + +/* see sph_sha3.h */ +void +sph_sha384_init(void *cc) +{ + sph_sha384_context *sc; + + sc = cc; + memcpy(sc->val, H384, sizeof H384); + sc->count = 0; +} + +/* see sph_sha3.h */ +void +sph_sha512_init(void *cc) +{ + sph_sha512_context *sc; + + sc = cc; + memcpy(sc->val, H512, sizeof H512); + sc->count = 0; +} + +#define RFUN sha3_round +#define HASH sha384 +#define BE64 1 +#include "md_helper.c" + +/* see sph_sha3.h */ +void +sph_sha384_close(void *cc, void *dst) +{ + sha384_close(cc, dst, 6); +// sph_sha384_init(cc); +} + +/* see sph_sha3.h */ +void +sph_sha384_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + sha384_addbits_and_close(cc, ub, n, dst, 6); +// sph_sha384_init(cc); +} + +/* see sph_sha3.h */ +void +sph_sha512_close(void *cc, void *dst) +{ + sha384_close(cc, dst, 8); +// sph_sha512_init(cc); +} + +/* see sph_sha3.h */ +void +sph_sha512_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst) +{ + sha384_addbits_and_close(cc, ub, n, dst, 8); +// sph_sha512_init(cc); +} + +/* see sph_sha3.h */ +void +sph_sha384_comp(const sph_u64 msg[16], sph_u64 val[8]) +{ +#define SHA3_IN(x) msg[x] + SHA3_ROUND_BODY(SHA3_IN, val); +#undef SHA3_IN +} + +#endif diff --git a/algo/sha/sha2-hash-4way.c b/algo/sha/sha2-hash-4way.c new file mode 100644 index 00000000..c23bb9f6 --- /dev/null +++ b/algo/sha/sha2-hash-4way.c @@ -0,0 +1,236 @@ +/* $Id: sha2big.c 216 2010-06-08 09:46:57Z tp $ */ +/* + * SHA-384 / SHA-512 implementation. + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + * ===========================(LICENSE END)============================= + * + * @author Thomas Pornin + */ + +#include +#include + +#include "sha2-hash-4way.h" + +#if defined(__AVX2__) + +static const sph_u64 K512[80] = { + SPH_C64(0x428A2F98D728AE22), SPH_C64(0x7137449123EF65CD), + SPH_C64(0xB5C0FBCFEC4D3B2F), SPH_C64(0xE9B5DBA58189DBBC), + SPH_C64(0x3956C25BF348B538), SPH_C64(0x59F111F1B605D019), + SPH_C64(0x923F82A4AF194F9B), SPH_C64(0xAB1C5ED5DA6D8118), + SPH_C64(0xD807AA98A3030242), SPH_C64(0x12835B0145706FBE), + SPH_C64(0x243185BE4EE4B28C), SPH_C64(0x550C7DC3D5FFB4E2), + SPH_C64(0x72BE5D74F27B896F), SPH_C64(0x80DEB1FE3B1696B1), + SPH_C64(0x9BDC06A725C71235), SPH_C64(0xC19BF174CF692694), + SPH_C64(0xE49B69C19EF14AD2), SPH_C64(0xEFBE4786384F25E3), + SPH_C64(0x0FC19DC68B8CD5B5), SPH_C64(0x240CA1CC77AC9C65), + SPH_C64(0x2DE92C6F592B0275), SPH_C64(0x4A7484AA6EA6E483), + SPH_C64(0x5CB0A9DCBD41FBD4), SPH_C64(0x76F988DA831153B5), + SPH_C64(0x983E5152EE66DFAB), SPH_C64(0xA831C66D2DB43210), + SPH_C64(0xB00327C898FB213F), SPH_C64(0xBF597FC7BEEF0EE4), + SPH_C64(0xC6E00BF33DA88FC2), SPH_C64(0xD5A79147930AA725), + SPH_C64(0x06CA6351E003826F), SPH_C64(0x142929670A0E6E70), + SPH_C64(0x27B70A8546D22FFC), SPH_C64(0x2E1B21385C26C926), + SPH_C64(0x4D2C6DFC5AC42AED), SPH_C64(0x53380D139D95B3DF), + SPH_C64(0x650A73548BAF63DE), SPH_C64(0x766A0ABB3C77B2A8), + SPH_C64(0x81C2C92E47EDAEE6), SPH_C64(0x92722C851482353B), + SPH_C64(0xA2BFE8A14CF10364), SPH_C64(0xA81A664BBC423001), + SPH_C64(0xC24B8B70D0F89791), SPH_C64(0xC76C51A30654BE30), + SPH_C64(0xD192E819D6EF5218), SPH_C64(0xD69906245565A910), + SPH_C64(0xF40E35855771202A), SPH_C64(0x106AA07032BBD1B8), + SPH_C64(0x19A4C116B8D2D0C8), SPH_C64(0x1E376C085141AB53), + SPH_C64(0x2748774CDF8EEB99), SPH_C64(0x34B0BCB5E19B48A8), + SPH_C64(0x391C0CB3C5C95A63), SPH_C64(0x4ED8AA4AE3418ACB), + SPH_C64(0x5B9CCA4F7763E373), SPH_C64(0x682E6FF3D6B2B8A3), + SPH_C64(0x748F82EE5DEFB2FC), SPH_C64(0x78A5636F43172F60), + SPH_C64(0x84C87814A1F0AB72), SPH_C64(0x8CC702081A6439EC), + SPH_C64(0x90BEFFFA23631E28), SPH_C64(0xA4506CEBDE82BDE9), + SPH_C64(0xBEF9A3F7B2C67915), SPH_C64(0xC67178F2E372532B), + SPH_C64(0xCA273ECEEA26619C), SPH_C64(0xD186B8C721C0C207), + SPH_C64(0xEADA7DD6CDE0EB1E), SPH_C64(0xF57D4F7FEE6ED178), + SPH_C64(0x06F067AA72176FBA), SPH_C64(0x0A637DC5A2C898A6), + SPH_C64(0x113F9804BEF90DAE), SPH_C64(0x1B710B35131C471B), + SPH_C64(0x28DB77F523047D84), SPH_C64(0x32CAAB7B40C72493), + SPH_C64(0x3C9EBE0A15C9BEBC), SPH_C64(0x431D67C49C100D4C), + SPH_C64(0x4CC5D4BECB3E42B6), SPH_C64(0x597F299CFC657E2A), + SPH_C64(0x5FCB6FAB3AD6FAEC), SPH_C64(0x6C44198C4A475817) +}; + +static const sph_u64 H512[8] = { + SPH_C64(0x6A09E667F3BCC908), SPH_C64(0xBB67AE8584CAA73B), + SPH_C64(0x3C6EF372FE94F82B), SPH_C64(0xA54FF53A5F1D36F1), + SPH_C64(0x510E527FADE682D1), SPH_C64(0x9B05688C2B3E6C1F), + SPH_C64(0x1F83D9ABFB41BD6B), SPH_C64(0x5BE0CD19137E2179) +}; + +#define CH(X, Y, Z) \ + _mm256_xor_si256( _mm256_and_si256( _mm256_xor_si256( Y, Z ), X ), Z ) + +#define MAJ(X, Y, Z) \ + _mm256_or_si256( _mm256_and_si256( X, Y ), \ + _mm256_and_si256( _mm256_or_si256( X, Y ), Z ) ) + +#define BSG5_0(x) \ + _mm256_xor_si256( _mm256_xor_si256( \ + mm256_rotr_64(x, 28), mm256_rotr_64(x, 34) ), mm256_rotr_64(x, 39) ) + +#define BSG5_1(x) \ + _mm256_xor_si256( _mm256_xor_si256( \ + mm256_rotr_64(x, 14), mm256_rotr_64(x, 18) ), mm256_rotr_64(x, 41) ) + +#define SSG5_0(x) \ + _mm256_xor_si256( _mm256_xor_si256( \ + mm256_rotr_64(x, 1), mm256_rotr_64(x, 8) ), _mm256_srli_epi64(x, 7) ) + +#define 
SSG5_1(x) \ + _mm256_xor_si256( _mm256_xor_si256( \ + mm256_rotr_64(x, 19), mm256_rotr_64(x, 61) ), _mm256_srli_epi64(x, 6) ) + +#define SHA3_4WAY_STEP(A, B, C, D, E, F, G, H, i) \ +do { \ + __m256i T1, T2; \ + T1 = _mm256_add_epi64( _mm256_add_epi64( _mm256_add_epi64( \ + _mm256_add_epi64( H, BSG5_1(E) ), CH(E, F, G) ), \ + _mm256_set1_epi64x( K512[i] ) ), W[i] ); \ + T2 = _mm256_add_epi64( BSG5_0(A), MAJ(A, B, C) ); \ + D = _mm256_add_epi64( D, T1 ); \ + H = _mm256_add_epi64( T1, T2 ); \ +} while (0) + +static void +sha512_4way_round( __m256i *in, __m256i r[8] ) +{ + int i; + __m256i A, B, C, D, E, F, G, H; + __m256i W[80]; + + for ( i = 0; i < 16; i++ ) + W[i] = mm256_byteswap_64( in[i] ); + for ( i = 16; i < 80; i++ ) + W[i] = _mm256_add_epi64( _mm256_add_epi64( _mm256_add_epi64( + SSG5_1( W[ i-2 ] ), W[ i-7 ] ), SSG5_0( W[ i-15 ] ) ), W[ i-16 ] ); + + A = r[0]; + B = r[1]; + C = r[2]; + D = r[3]; + E = r[4]; + F = r[5]; + G = r[6]; + H = r[7]; + + for ( i = 0; i < 80; i += 8 ) + { + SHA3_4WAY_STEP( A, B, C, D, E, F, G, H, i + 0 ); + SHA3_4WAY_STEP( H, A, B, C, D, E, F, G, i + 1 ); + SHA3_4WAY_STEP( G, H, A, B, C, D, E, F, i + 2 ); + SHA3_4WAY_STEP( F, G, H, A, B, C, D, E, i + 3 ); + SHA3_4WAY_STEP( E, F, G, H, A, B, C, D, i + 4 ); + SHA3_4WAY_STEP( D, E, F, G, H, A, B, C, i + 5 ); + SHA3_4WAY_STEP( C, D, E, F, G, H, A, B, i + 6 ); + SHA3_4WAY_STEP( B, C, D, E, F, G, H, A, i + 7 ); + } + + r[0] = _mm256_add_epi64( r[0], A ); + r[1] = _mm256_add_epi64( r[1], B ); + r[2] = _mm256_add_epi64( r[2], C ); + r[3] = _mm256_add_epi64( r[3], D ); + r[4] = _mm256_add_epi64( r[4], E ); + r[5] = _mm256_add_epi64( r[5], F ); + r[6] = _mm256_add_epi64( r[6], G ); + r[7] = _mm256_add_epi64( r[7], H ); +} + +void sha512_4way_init( sha512_4way_context *sc ) +{ + sc->count = 0; + sc->val[0] = _mm256_set1_epi64x( H512[0] ); + sc->val[1] = _mm256_set1_epi64x( H512[1] ); + sc->val[2] = _mm256_set1_epi64x( H512[2] ); + sc->val[3] = _mm256_set1_epi64x( H512[3] ); + sc->val[4] = _mm256_set1_epi64x( H512[4] ); + sc->val[5] = _mm256_set1_epi64x( H512[5] ); + sc->val[6] = _mm256_set1_epi64x( H512[6] ); + sc->val[7] = _mm256_set1_epi64x( H512[7] ); +} + +void sha512_4way( sha512_4way_context *sc, const void *data, size_t len ) +{ + __m256i *vdata = (__m256i*)data; + size_t ptr; + int buf_size = 128; + + ptr = (unsigned)sc->count & (buf_size - 1U); + while ( len > 0 ) + { + size_t clen; + clen = buf_size - ptr; + if ( clen > len ) + clen = len; + memcpy_256( sc->buf + (ptr>>3), vdata, clen>>3 ); + vdata = vdata + (clen>>3); + ptr += clen; + len -= clen; + if ( ptr == buf_size ) + { + sha512_4way_round( sc->buf, sc->val ); + ptr = 0; + } + sc->count += clen; + } +} + +void sha512_4way_close( sha512_4way_context *sc, void *dst ) +{ + unsigned ptr, u; + int buf_size = 128; + int pad = buf_size - 16; + + ptr = (unsigned)sc->count & (buf_size - 1U); + sc->buf[ ptr>>3 ] = _mm256_set1_epi64x( 0x80 ); + ptr += 8; + + if ( ptr > pad ) + { + memset_zero_256( sc->buf + (ptr>>3), (buf_size - ptr) >> 3 ); + sha512_4way_round( sc->buf, sc->val ); + memset_zero_256( sc->buf, pad >> 3 ); + } + else + memset_zero_256( sc->buf + (ptr>>3), (pad - ptr) >> 3 ); + + sc->buf[ pad >> 3 ] = + mm256_byteswap_64( _mm256_set1_epi64x( sc->count >> 61 ) ); + sc->buf[ ( pad+8 ) >> 3 ] = + mm256_byteswap_64( _mm256_set1_epi64x( sc->count << 3 ) ); + sha512_4way_round( sc->buf, sc->val ); + + for ( u = 0; u < 8; u ++ ) + ((__m256i*)dst)[u] = mm256_byteswap_64( sc->val[u] ); +} + +#endif diff --git a/algo/sha/sha2-hash-4way.h 
b/algo/sha/sha2-hash-4way.h new file mode 100644 index 00000000..70708658 --- /dev/null +++ b/algo/sha/sha2-hash-4way.h @@ -0,0 +1,104 @@ +/* $Id: sph_sha2.h 216 2010-06-08 09:46:57Z tp $ */ +/** + * SHA-224, SHA-256, SHA-384 and SHA-512 interface. + * + * SHA-256 has been published in FIPS 180-2, now amended with a change + * notice to include SHA-224 as well (which is a simple variation on + * SHA-256). SHA-384 and SHA-512 are also defined in FIPS 180-2. FIPS + * standards can be found at: + * http://csrc.nist.gov/publications/fips/ + * + * ==========================(LICENSE BEGIN)============================ + * + * Copyright (c) 2007-2010 Projet RNRT SAPHIR + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + * ===========================(LICENSE END)============================= + * + * @file sph_sha2.h + * @author Thomas Pornin + */ + +#ifndef SHA2_HASH_4WAY_H__ +#define SHA2_HASH_4WAY_H__ 1 + +#include +#include "sph_types.h" +#include "avxdefs.h" + +#if 0 + +#define SPH_SIZE_sha224 224 + +#define SPH_SIZE_sha256 256 + +typedef struct { +#ifndef DOXYGEN_IGNORE + unsigned char buf[64]; /* first field, for alignment */ + sph_u32 val[8]; +#if SPH_64 + sph_u64 count; +#else + sph_u32 count_high, count_low; +#endif +#endif +} sph_sha224_context; + +typedef sph_sha224_context sph_sha256_context; + +void sph_sha224_init(void *cc); + +void sph_sha224(void *cc, const void *data, size_t len); + +void sph_sha224_close(void *cc, void *dst); + +void sph_sha224_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst); + +void sph_sha224_comp(const sph_u32 msg[16], sph_u32 val[8]); + +void sph_sha256_init(void *cc); + +void sph_sha256(void *cc, const void *data, size_t len); + +void sph_sha256_close(void *cc, void *dst); + +void sph_sha256_addbits_and_close(void *cc, unsigned ub, unsigned n, void *dst); + +void sph_sha256_comp(const sph_u32 msg[16], sph_u32 val[8]); + +#endif + +#if defined (__AVX2__) + +#define SPH_SIZE_sha512 512 + +typedef struct { + __m256i buf[128>>3]; + __m256i val[8]; + uint64_t count; +} sha512_4way_context; + +void sha512_4way_init( sha512_4way_context *sc); +void sha512_4way( sha512_4way_context *sc, const void *data, size_t len ); +void sha512_4way_close( sha512_4way_context *sc, void *dst ); + +#endif +#endif diff --git a/algo/skein/skein-gate.c b/algo/skein/skein-gate.c index 6d144c15..f41c874f 100644 --- a/algo/skein/skein-gate.c +++ b/algo/skein/skein-gate.c @@ -6,7 +6,7 @@ int64_t skein_get_max64() { return 0x7ffffLL; } bool 
register_skein_algo( algo_gate_t* gate ) { - gate->optimizations = FOUR_WAY_OPT | SHA_OPT; + gate->optimizations = AVX2_OPT | SHA_OPT; #if defined (SKEIN_4WAY) gate->scanhash = (void*)&scanhash_skein_4way; gate->hash = (void*)&skeinhash_4way; diff --git a/algo/skein/skein-gate.h b/algo/skein/skein-gate.h index 225adb72..c90f1536 100644 --- a/algo/skein/skein-gate.h +++ b/algo/skein/skein-gate.h @@ -3,7 +3,7 @@ #include #include "algo-gate-api.h" -#if defined(FOUR_WAY) && defined(__AVX2__) +#if defined(__AVX2__) #define SKEIN_4WAY #endif diff --git a/algo/skein/skein2-gate.c b/algo/skein/skein2-gate.c index 17cc2ca0..efc9f414 100644 --- a/algo/skein/skein2-gate.c +++ b/algo/skein/skein2-gate.c @@ -9,7 +9,7 @@ int64_t skein2_get_max64 () bool register_skein2_algo( algo_gate_t* gate ) { - gate->optimizations = FOUR_WAY_OPT; + gate->optimizations = AVX2_OPT; -#if defined (FOUR_WAY) && defined (__AVX2__) +#if defined (SKEIN2_4WAY) gate->scanhash = (void*)&scanhash_skein2_4way; gate->hash = (void*)&skein2hash_4way; diff --git a/algo/skein/skein2-gate.h b/algo/skein/skein2-gate.h index e60213f5..6dcabe38 100644 --- a/algo/skein/skein2-gate.h +++ b/algo/skein/skein2-gate.h @@ -3,7 +3,7 @@ #include "algo-gate-api.h" #include -#if defined(FOUR_WAY) && defined(__AVX2__) +#if defined(__AVX2__) #define SKEIN2_4WAY #endif diff --git a/algo/whirlpool/whirlpool-gate.c b/algo/whirlpool/whirlpool-gate.c index fa3e9d9a..8cf33e32 100644 --- a/algo/whirlpool/whirlpool-gate.c +++ b/algo/whirlpool/whirlpool-gate.c @@ -4,7 +4,7 @@ bool register_whirlpool_algo( algo_gate_t* gate ) { #if defined (WHIRLPOOL_4WAY) four_way_not_tested(); - gate->optimizations = FOUR_WAY_OPT; + gate->optimizations = AVX2_OPT; gate->scanhash = (void*)&scanhash_whirlpool_4way; gate->hash = (void*)&whirlpool_hash_4way; #else diff --git a/algo/x11/c11-4way.c b/algo/x11/c11-4way.c index 1669fa44..1d96fa16 100644 --- a/algo/x11/c11-4way.c +++ b/algo/x11/c11-4way.c @@ -1,7 +1,7 @@ #include "cpuminer-config.h" #include "c11-gate.h" -#if defined (__AVX2__) && defined (__AES__) +#if defined (C11_4WAY) #include #include diff --git a/algo/x11/c11-gate.c b/algo/x11/c11-gate.c index e94b9bca..518de081 100644 --- a/algo/x11/c11-gate.c +++ b/algo/x11/c11-gate.c @@ -11,7 +11,7 @@ bool register_c11_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_c11; gate->hash = (void*)&c11_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; gate->get_max64 = (void*)&get_max64_0x3ffff; return true; }; diff --git a/algo/x11/c11-gate.h b/algo/x11/c11-gate.h index 6a16123e..4983c518 100644 --- a/algo/x11/c11-gate.h +++ b/algo/x11/c11-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(HASH_4WAY) && defined(__AES__) +#if defined(__AVX2__) && defined(__AES__) #define C11_4WAY #endif diff --git a/algo/x11/timetravel-4way.c b/algo/x11/timetravel-4way.c index ea7dc6c4..3538710c 100644 --- a/algo/x11/timetravel-4way.c +++ b/algo/x11/timetravel-4way.c @@ -1,6 +1,6 @@ #include "timetravel-gate.h" -#if defined(__AVX2__) && defined(__AES__) +#if defined(TIMETRAVEL_4WAY) #include #include diff --git a/algo/x11/timetravel-gate.c b/algo/x11/timetravel-gate.c index bee194ca..c7f521c2 100644 --- a/algo/x11/timetravel-gate.c +++ b/algo/x11/timetravel-gate.c @@ -17,7 +17,7 @@ bool register_timetravel_algo( algo_gate_t* gate ) gate->hash = (void*)&timetravel_hash; #endif gate->set_target = (void*)&tt8_set_target; - gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT |
FOUR_WAY_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; gate->get_max64 = (void*)&get_max64_0xffffLL; return true; }; diff --git a/algo/x11/timetravel-gate.h b/algo/x11/timetravel-gate.h index 758b73d5..7aa9b34d 100644 --- a/algo/x11/timetravel-gate.h +++ b/algo/x11/timetravel-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(HASH_4WAY) && defined(__AES__) +#if defined(__AVX2__) && defined(__AES__) #define TIMETRAVEL_4WAY #endif diff --git a/algo/x11/timetravel10-4way.c b/algo/x11/timetravel10-4way.c index 918cb2ca..e2e9c1f3 100644 --- a/algo/x11/timetravel10-4way.c +++ b/algo/x11/timetravel10-4way.c @@ -1,6 +1,6 @@ #include "timetravel10-gate.h" -#if defined(__AVX2__) && defined(__AES__) +#if defined(TIMETRAVEL10_4WAY) #include #include diff --git a/algo/x11/timetravel10-gate.c b/algo/x11/timetravel10-gate.c index 91e27db6..f5768e28 100644 --- a/algo/x11/timetravel10-gate.c +++ b/algo/x11/timetravel10-gate.c @@ -17,7 +17,7 @@ bool register_timetravel10_algo( algo_gate_t* gate ) gate->hash = (void*)&timetravel10_hash; #endif gate->set_target = (void*)&tt10_set_target; - gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; gate->get_max64 = (void*)&get_max64_0xffffLL; return true; }; diff --git a/algo/x11/timetravel10-gate.h b/algo/x11/timetravel10-gate.h index 6a7090a6..d2823a12 100644 --- a/algo/x11/timetravel10-gate.h +++ b/algo/x11/timetravel10-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(HASH_4WAY) && defined(__AES__) +#if defined(__AVX2__) && defined(__AES__) #define TIMETRAVEL10_4WAY #endif diff --git a/algo/x11/tribus-4way.c b/algo/x11/tribus-4way.c index 3baf27b4..8959c78f 100644 --- a/algo/x11/tribus-4way.c +++ b/algo/x11/tribus-4way.c @@ -4,7 +4,7 @@ #include #include -#if defined(__AVX2__) && !defined(NO_AES_NI) +#if defined(TRIBUS_4WAY) #include "algo/jh/jh-hash-4way.h" #include "algo/keccak/keccak-hash-4way.h" diff --git a/algo/x11/tribus-gate.c b/algo/x11/tribus-gate.c index 4804ff2d..f30d65ea 100644 --- a/algo/x11/tribus-gate.c +++ b/algo/x11/tribus-gate.c @@ -2,7 +2,7 @@ bool register_tribus_algo( algo_gate_t* gate ) { - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; gate->get_max64 = (void*)&get_max64_0x1ffff; #if defined (TRIBUS_4WAY) // init_tribus_4way_ctx(); diff --git a/algo/x11/tribus-gate.h b/algo/x11/tribus-gate.h index 51cec2f8..d3d03158 100644 --- a/algo/x11/tribus-gate.h +++ b/algo/x11/tribus-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(HASH_4WAY) && defined(__AES__) +#if defined(__AVX2__) && defined(__AES__) #define TRIBUS_4WAY #endif diff --git a/algo/x11/x11-4way.c b/algo/x11/x11-4way.c index 92a06103..35ce68ed 100644 --- a/algo/x11/x11-4way.c +++ b/algo/x11/x11-4way.c @@ -1,7 +1,7 @@ #include "cpuminer-config.h" #include "x11-gate.h" -#if defined (__AVX2__) && defined (__AES__) +#if defined (X11_4WAY) #include #include diff --git a/algo/x11/x11-gate.c b/algo/x11/x11-gate.c index cfa625d7..408eed84 100644 --- a/algo/x11/x11-gate.c +++ b/algo/x11/x11-gate.c @@ -11,7 +11,7 @@ bool register_x11_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x11; gate->hash = (void*)&x11_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; gate->get_max64 = (void*)&get_max64_0x3ffff; return true; }; diff 
--git a/algo/x11/x11-gate.h b/algo/x11/x11-gate.h index a07d816e..69106dbe 100644 --- a/algo/x11/x11-gate.h +++ b/algo/x11/x11-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(HASH_4WAY) && defined(__AES__) +#if defined(__AVX2__) && defined(__AES__) #define X11_4WAY #endif diff --git a/algo/x11/x11evo-4way.c b/algo/x11/x11evo-4way.c index d1a3339d..e73e52c2 100644 --- a/algo/x11/x11evo-4way.c +++ b/algo/x11/x11evo-4way.c @@ -1,7 +1,7 @@ #include "cpuminer-config.h" #include "x11evo-gate.h" -#if defined(__AVX2__) && defined(__AES__) +#if defined(X11EVO_4WAY) #include #include diff --git a/algo/x11/x11evo-gate.c b/algo/x11/x11evo-gate.c index 8f8841ae..47019d85 100644 --- a/algo/x11/x11evo-gate.c +++ b/algo/x11/x11evo-gate.c @@ -89,7 +89,7 @@ bool register_x11evo_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x11evo; gate->hash = (void*)&x11evo_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; return true; }; diff --git a/algo/x11/x11evo-gate.h b/algo/x11/x11evo-gate.h index 32bd551c..7be09d81 100644 --- a/algo/x11/x11evo-gate.h +++ b/algo/x11/x11evo-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(HASH_4WAY) && defined(__AES__) +#if defined(__AVX2__) && defined(__AES__) #define X11EVO_4WAY #endif diff --git a/algo/x11/x11gost-4way.c b/algo/x11/x11gost-4way.c index 3727f5e7..b22f1d60 100644 --- a/algo/x11/x11gost-4way.c +++ b/algo/x11/x11gost-4way.c @@ -1,7 +1,7 @@ #include "cpuminer-config.h" #include "x11gost-gate.h" -#if defined (__AVX2__) && defined (__AES__) +#if defined (X11GOST_4WAY) #include #include diff --git a/algo/x11/x11gost-gate.c b/algo/x11/x11gost-gate.c index c3643ea0..c69d933c 100644 --- a/algo/x11/x11gost-gate.c +++ b/algo/x11/x11gost-gate.c @@ -11,7 +11,7 @@ bool register_x11gost_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x11gost; gate->hash = (void*)&x11gost_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; gate->get_max64 = (void*)&get_max64_0x3ffff; return true; }; diff --git a/algo/x11/x11gost-gate.h b/algo/x11/x11gost-gate.h index 868d0511..cd486269 100644 --- a/algo/x11/x11gost-gate.h +++ b/algo/x11/x11gost-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(HASH_4WAY) && defined(__AES__) +#if defined(__AVX2__) && defined(__AES__) #define X11GOST_4WAY #endif diff --git a/algo/x13/phi1612-4way.c b/algo/x13/phi1612-4way.c index e7493e68..f074ec0a 100644 --- a/algo/x13/phi1612-4way.c +++ b/algo/x13/phi1612-4way.c @@ -1,6 +1,6 @@ -#include "x13-gate.h" +#include "phi1612-gate.h" -#if defined(__AVX2__) && defined(__AES__) +#if defined(PHI1612_4WAY) #include #include diff --git a/algo/x13/phi1612-gate.c b/algo/x13/phi1612-gate.c index 77eae6ec..9a9d8711 100644 --- a/algo/x13/phi1612-gate.c +++ b/algo/x13/phi1612-gate.c @@ -11,7 +11,7 @@ bool register_phi1612_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_phi1612; gate->hash = (void*)&phi1612_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; gate->get_max64 = (void*)&get_max64_0x3ffff; return true; }; diff --git a/algo/x13/phi1612-gate.h b/algo/x13/phi1612-gate.h index 12d2df7a..713ccadc 100644 --- a/algo/x13/phi1612-gate.h +++ b/algo/x13/phi1612-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if 
defined(HASH_4WAY) && defined(__AES__) +#if defined(__AVX2__) && defined(__AES__) #define PHI1612_4WAY #endif diff --git a/algo/x13/skunk-4way.c b/algo/x13/skunk-4way.c index dc7363fb..0692bbce 100644 --- a/algo/x13/skunk-4way.c +++ b/algo/x13/skunk-4way.c @@ -1,6 +1,6 @@ #include "skunk-gate.h" -#ifdef __AVX2__ +#if defined(SKUNK_4WAY) #include #include diff --git a/algo/x13/skunk-gate.c b/algo/x13/skunk-gate.c index 53377532..3844adee 100644 --- a/algo/x13/skunk-gate.c +++ b/algo/x13/skunk-gate.c @@ -2,7 +2,7 @@ bool register_skunk_algo( algo_gate_t* gate ) { - gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT; + gate->optimizations = SSE2_OPT | AVX_OPT | AVX2_OPT; #if defined (SKUNK_4WAY) gate->miner_thread_init = (void*)&skunk_4way_thread_init; gate->scanhash = (void*)&scanhash_skunk_4way; diff --git a/algo/x13/skunk-gate.h b/algo/x13/skunk-gate.h index 95429a54..d616bf1e 100644 --- a/algo/x13/skunk-gate.h +++ b/algo/x13/skunk-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(HASH_4WAY) +#if defined(__AVX2__) #define SKUNK_4WAY #endif diff --git a/algo/x13/x13-4way.c b/algo/x13/x13-4way.c index 9f78bef2..927ea337 100644 --- a/algo/x13/x13-4way.c +++ b/algo/x13/x13-4way.c @@ -1,6 +1,6 @@ #include "x13-gate.h" -#if defined(__AVX2__) && defined(__AES__) +#if defined(X13_4WAY) #include #include @@ -17,7 +17,7 @@ #include "algo/shavite/sph_shavite.h" #include "algo/simd/sse2/nist.h" #include "algo/echo/aes_ni/hash_api.h" -#include "algo/hamsi/sph_hamsi.h" +#include "algo/hamsi/hamsi-hash-4way.h" #include "algo/fugue/sph_fugue.h" typedef struct { @@ -32,7 +32,7 @@ typedef struct { sph_shavite512_context shavite; hashState_sd simd; hashState_echo echo; - sph_hamsi512_context hamsi; + hamsi512_4way_context hamsi; sph_fugue512_context fugue; } x13_4way_ctx_holder; @@ -51,7 +51,7 @@ void init_x13_4way_ctx() sph_shavite512_init( &x13_4way_ctx.shavite ); init_sd( &x13_4way_ctx.simd, 512 ); init_echo( &x13_4way_ctx.echo, 512 ); - sph_hamsi512_init( &x13_4way_ctx.hamsi ); + hamsi512_4way_init( &x13_4way_ctx.hamsi ); sph_fugue512_init( &x13_4way_ctx.fugue ); }; @@ -85,7 +85,7 @@ void x13_4way_hash( void *state, const void *input ) memcpy( &ctx.groestl, &x13_4way_ctx.groestl, sizeof(hashState_groestl) ); update_and_final_groestl( &ctx.groestl, (char*)hash3, (char*)hash3, 512 ); - // Parallel 4way + // Parallel 4way 64 bit mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); // 4 Skein @@ -100,7 +100,7 @@ void x13_4way_hash( void *state, const void *input ) keccak512_4way( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); - // Serial to the end + // Serial mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); // 7 Luffa @@ -167,20 +167,13 @@ void x13_4way_hash( void *state, const void *input ) update_final_echo( &ctx.echo, (BitSequence *)hash3, (const BitSequence *) hash3, 512 ); - // 12 Hamsi - sph_hamsi512( &ctx.hamsi, hash0, 64 ); - sph_hamsi512_close( &ctx.hamsi, hash0 ); - memcpy( &ctx.hamsi, &x13_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); - sph_hamsi512( &ctx.hamsi, hash1, 64 ); - sph_hamsi512_close( &ctx.hamsi, hash1 ); - memcpy( &ctx.hamsi, &x13_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); - sph_hamsi512( &ctx.hamsi, hash2, 64 ); - sph_hamsi512_close( &ctx.hamsi, hash2 ); - memcpy( &ctx.hamsi, &x13_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); - sph_hamsi512( &ctx.hamsi, hash3, 64 ); - sph_hamsi512_close( &ctx.hamsi, hash3 ); + // 12 Hamsi parallel 4way 32 bit + mm_interleave_4x32( vhash, hash0, hash1, 
hash2, hash3, 512 ); + hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_close( &ctx.hamsi, vhash ); + mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); - // 13 Fugue + // 13 Fugue serial sph_fugue512( &ctx.fugue, hash0, 64 ); sph_fugue512_close( &ctx.fugue, hash0 ); memcpy( &ctx.fugue, &x13_4way_ctx.fugue, sizeof(sph_fugue512_context) ); diff --git a/algo/x13/x13-gate.c b/algo/x13/x13-gate.c index 6bd43738..ce3e6408 100644 --- a/algo/x13/x13-gate.c +++ b/algo/x13/x13-gate.c @@ -11,7 +11,7 @@ bool register_x13_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x13; gate->hash = (void*)&x13hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; gate->get_max64 = (void*)&get_max64_0x3ffff; return true; }; diff --git a/algo/x13/x13-gate.h b/algo/x13/x13-gate.h index 6b71276a..718810e9 100644 --- a/algo/x13/x13-gate.h +++ b/algo/x13/x13-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(HASH_4WAY) && defined(__AES__) +#if defined(__AVX2__) && defined(__AES__) #define X13_4WAY #endif diff --git a/algo/x13/x13sm3-4way.c b/algo/x13/x13sm3-4way.c index e7b52830..7cc18b60 100644 --- a/algo/x13/x13sm3-4way.c +++ b/algo/x13/x13sm3-4way.c @@ -1,6 +1,6 @@ #include "x13sm3-gate.h" -#if defined(__AVX2__) && defined(__AES__) +#if defined(X13SM3_4WAY) #include #include @@ -18,7 +18,7 @@ #include "algo/simd/sse2/nist.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/sm3/sm3-hash-4way.h" -#include "algo/hamsi/sph_hamsi.h" +#include "algo/hamsi/hamsi-hash-4way.h" #include "algo/fugue/sph_fugue.h" typedef struct { @@ -34,7 +34,7 @@ typedef struct { hashState_sd simd; hashState_echo echo; sm3_4way_ctx_t sm3; - sph_hamsi512_context hamsi; + hamsi512_4way_context hamsi; sph_fugue512_context fugue; } x13sm3_4way_ctx_holder; @@ -55,7 +55,7 @@ void init_x13sm3_4way_ctx() init_sd( &x13sm3_4way_ctx.simd, 512 ); init_echo( &x13sm3_4way_ctx.echo, 512 ); sm3_4way_init( &x13sm3_4way_ctx.sm3 ); - sph_hamsi512_init( &x13sm3_4way_ctx.hamsi ); + hamsi512_4way_init( &x13sm3_4way_ctx.hamsi ); sph_fugue512_init( &x13sm3_4way_ctx.fugue ); }; @@ -174,7 +174,9 @@ void x13sm3_4way_hash( void *state, const void *input ) update_final_echo( &ctx.echo, (BitSequence *)hash3, (const BitSequence *) hash3, 512 ); - // SM3 + mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); + + // SM3 parallel 32 bit uint32_t sm3_vhash[32*4] __attribute__ ((aligned (64))); memset( sm3_vhash, 0, sizeof sm3_vhash ); uint32_t sm3_hash0[32] __attribute__ ((aligned (32))); @@ -186,26 +188,16 @@ void x13sm3_4way_hash( void *state, const void *input ) uint32_t sm3_hash3[32] __attribute__ ((aligned (32))); memset( sm3_hash3, 0, sizeof sm3_hash3 ); - mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); sm3_4way( &ctx.sm3, vhash, 64 ); sm3_4way_close( &ctx.sm3, sm3_vhash ); - mm_deinterleave_4x32( sm3_hash0, sm3_hash1, sm3_hash2, sm3_hash3, - sm3_vhash, 1024 ); - - // Hamsi - sph_hamsi512( &ctx.hamsi, sm3_hash0, 64 ); - sph_hamsi512_close( &ctx.hamsi, hash0 ); - memcpy( &ctx.hamsi, &x13sm3_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); - sph_hamsi512( &ctx.hamsi, sm3_hash1, 64 ); - sph_hamsi512_close( &ctx.hamsi, hash1 ); - memcpy( &ctx.hamsi, &x13sm3_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); - sph_hamsi512( &ctx.hamsi, sm3_hash2, 64 ); - sph_hamsi512_close( &ctx.hamsi, hash2 ); - memcpy( &ctx.hamsi, &x13sm3_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); - sph_hamsi512( &ctx.hamsi, sm3_hash3, 64 ); - 
sph_hamsi512_close( &ctx.hamsi, hash3 ); - - // Fugue + + // Hamsi parallel 32 bit + hamsi512_4way( &ctx.hamsi, sm3_vhash, 64 ); + hamsi512_4way_close( &ctx.hamsi, vhash ); + + mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); + + // Fugue serial sph_fugue512( &ctx.fugue, hash0, 64 ); sph_fugue512_close( &ctx.fugue, hash0 ); memcpy( &ctx.fugue, &x13sm3_4way_ctx.fugue, sizeof(sph_fugue512_context) ); diff --git a/algo/x13/x13sm3-gate.c b/algo/x13/x13sm3-gate.c index a6280ce8..4de40ed1 100644 --- a/algo/x13/x13sm3-gate.c +++ b/algo/x13/x13sm3-gate.c @@ -11,7 +11,7 @@ bool register_x13sm3_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x13sm3; gate->hash = (void*)&x13sm3_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT | FOUR_WAY_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX2_OPT; gate->get_max64 = (void*)&get_max64_0x3ffff; return true; }; diff --git a/algo/x13/x13sm3-gate.h b/algo/x13/x13sm3-gate.h index 3e356338..5399a41b 100644 --- a/algo/x13/x13sm3-gate.h +++ b/algo/x13/x13sm3-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(HASH_4WAY) && defined(__AES__) +#if defined(__AVX2__) && defined(__AES__) #define X13SM3_4WAY #endif diff --git a/algo/x14/polytimos-4way.c b/algo/x14/polytimos-4way.c index 8276941e..bd6d3921 100644 --- a/algo/x14/polytimos-4way.c +++ b/algo/x14/polytimos-4way.c @@ -1,6 +1,6 @@ #include "polytimos-gate.h" -#if defined(__AVX2__) && defined(__AES__) +#if defined(POLYTIMOS_4WAY) #include #include diff --git a/algo/x14/polytimos-gate.c b/algo/x14/polytimos-gate.c index b3307d46..e93d3289 100644 --- a/algo/x14/polytimos-gate.c +++ b/algo/x14/polytimos-gate.c @@ -2,7 +2,7 @@ bool register_polytimos_algo( algo_gate_t* gate ) { - gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; #ifdef POLYTIMOS_4WAY init_polytimos_4way_ctx(); gate->scanhash = (void*)&scanhash_polytimos_4way; diff --git a/algo/x14/veltor-gate.c b/algo/x14/veltor-gate.c index c5051c6f..593ea0dd 100644 --- a/algo/x14/veltor-gate.c +++ b/algo/x14/veltor-gate.c @@ -11,7 +11,7 @@ bool register_veltor_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_veltor; gate->hash = (void*)&veltor_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; gate->get_max64 = (void*)&get_max64_0x3ffff; return true; }; diff --git a/algo/x14/veltor-gate.h b/algo/x14/veltor-gate.h index fc8d717a..3f56d3ed 100644 --- a/algo/x14/veltor-gate.h +++ b/algo/x14/veltor-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(HASH_4WAY) && defined(__AES__) +#if defined(__AVX2__) && defined(__AES__) #define VELTOR_4WAY #endif diff --git a/algo/x14/x14-4way.c b/algo/x14/x14-4way.c index 2b2fb543..0a02fa9a 100644 --- a/algo/x14/x14-4way.c +++ b/algo/x14/x14-4way.c @@ -1,6 +1,6 @@ #include "x14-gate.h" -#if defined(__AVX2__) && defined(__AES__) +#if defined(X14_4WAY) #include #include @@ -18,7 +18,7 @@ #include "algo/simd/sse2/nist.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/sph_echo.h" -#include "algo/hamsi/sph_hamsi.h" +#include "algo/hamsi/hamsi-hash-4way.h" #include "algo/fugue/sph_fugue.h" #include "algo/shabal/shabal-hash-4way.h" @@ -34,7 +34,7 @@ typedef struct { sph_shavite512_context shavite; hashState_sd simd; hashState_echo echo; - sph_hamsi512_context hamsi; + hamsi512_4way_context hamsi; sph_fugue512_context fugue; 
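// hamsi is now 4-way (32 bit); fugue stays serial, shabal below is already 4-way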
shabal512_4way_context shabal; } x14_4way_ctx_holder; @@ -55,7 +55,7 @@ void init_x14_4way_ctx() sph_shavite512_init( &x14_4way_ctx.shavite ); init_sd( &x14_4way_ctx.simd, 512 ); init_echo( &x14_4way_ctx.echo, 512 ); - sph_hamsi512_init( &x14_4way_ctx.hamsi ); + hamsi512_4way_init( &x14_4way_ctx.hamsi ); sph_fugue512_init( &x14_4way_ctx.fugue ); shabal512_4way_init( &x14_4way_ctx.shabal ); }; @@ -172,20 +172,13 @@ void x14_4way_hash( void *state, const void *input ) update_final_echo( &ctx.echo, (BitSequence *)hash3, (const BitSequence *) hash3, 512 ); - // 12 Hamsi - sph_hamsi512( &ctx.hamsi, hash0, 64 ); - sph_hamsi512_close( &ctx.hamsi, hash0 ); - memcpy( &ctx.hamsi, &x14_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); - sph_hamsi512( &ctx.hamsi, hash1, 64 ); - sph_hamsi512_close( &ctx.hamsi, hash1 ); - memcpy( &ctx.hamsi, &x14_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); - sph_hamsi512( &ctx.hamsi, hash2, 64 ); - sph_hamsi512_close( &ctx.hamsi, hash2 ); - memcpy( &ctx.hamsi, &x14_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); - sph_hamsi512( &ctx.hamsi, hash3, 64 ); - sph_hamsi512_close( &ctx.hamsi, hash3 ); + // 12 Hamsi parallel 4way 32 bit + mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); + hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_close( &ctx.hamsi, vhash ); + mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); - // 13 Fugue + // 13 Fugue serial sph_fugue512( &ctx.fugue, hash0, 64 ); sph_fugue512_close( &ctx.fugue, hash0 ); memcpy( &ctx.fugue, &x14_4way_ctx.fugue, sizeof(sph_fugue512_context) ); diff --git a/algo/x14/x14-gate.c b/algo/x14/x14-gate.c index 596aa1e8..9f2a6d03 100644 --- a/algo/x14/x14-gate.c +++ b/algo/x14/x14-gate.c @@ -11,7 +11,7 @@ bool register_x14_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x14; gate->hash = (void*)&x14hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; gate->get_max64 = (void*)&get_max64_0x3ffff; return true; }; diff --git a/algo/x14/x14-gate.h b/algo/x14/x14-gate.h index 127c1017..1b3770e6 100644 --- a/algo/x14/x14-gate.h +++ b/algo/x14/x14-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(HASH_4WAY) && defined(__AES__) +#if defined(__AVX2__) && defined(__AES__) #define X14_4WAY #endif diff --git a/algo/x15/x15-4way.c b/algo/x15/x15-4way.c index af13d980..56e4b559 100644 --- a/algo/x15/x15-4way.c +++ b/algo/x15/x15-4way.c @@ -1,6 +1,6 @@ #include "x15-gate.h" -#if defined(__AVX2__) && defined(__AES__) +#if defined(X15_4WAY) #include #include @@ -18,7 +18,8 @@ #include "algo/simd/sse2/nist.h" #include "algo/echo/aes_ni/hash_api.h" #include "algo/echo/sph_echo.h" -#include "algo/hamsi/sph_hamsi.h" +#include "algo/hamsi/hamsi-hash-4way.h" +//#include "algo/hamsi/sph_hamsi.h" #include "algo/fugue/sph_fugue.h" #include "algo/shabal/shabal-hash-4way.h" #include "algo/whirlpool/sph_whirlpool.h" @@ -35,7 +36,8 @@ typedef struct { sph_shavite512_context shavite; hashState_sd simd; hashState_echo echo; - sph_hamsi512_context hamsi; + hamsi512_4way_context hamsi; +// sph_hamsi512_context hamsi; sph_fugue512_context fugue; shabal512_4way_context shabal; sph_whirlpool_context whirlpool; @@ -56,7 +58,8 @@ void init_x15_4way_ctx() sph_shavite512_init( &x15_4way_ctx.shavite ); init_sd( &x15_4way_ctx.simd, 512 ); init_echo( &x15_4way_ctx.echo, 512 ); - sph_hamsi512_init( &x15_4way_ctx.hamsi ); + hamsi512_4way_init( &x15_4way_ctx.hamsi ); +// sph_hamsi512_init( 
&x15_4way_ctx.hamsi );
     sph_fugue512_init( &x15_4way_ctx.fugue );
     shabal512_4way_init( &x15_4way_ctx.shabal );
     sph_whirlpool_init( &x15_4way_ctx.whirlpool );
@@ -174,6 +177,12 @@ void x15_4way_hash( void *state, const void *input )
      update_final_echo( &ctx.echo, (BitSequence *)hash3,
                         (const BitSequence *) hash3, 512 );
 
+     // 12 Hamsi parallel 4way 32 bit
+     mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 );
+     hamsi512_4way( &ctx.hamsi, vhash, 64 );
+     hamsi512_4way_close( &ctx.hamsi, vhash );
+     mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+/*
      // 12 Hamsi
      sph_hamsi512( &ctx.hamsi, hash0, 64 );
      sph_hamsi512_close( &ctx.hamsi, hash0 );
@@ -186,7 +195,7 @@ void x15_4way_hash( void *state, const void *input )
      memcpy( &ctx.hamsi, &x15_4way_ctx.hamsi, sizeof(sph_hamsi512_context) );
      sph_hamsi512( &ctx.hamsi, hash3, 64 );
      sph_hamsi512_close( &ctx.hamsi, hash3 );
-
+*/
      // 13 Fugue
      sph_fugue512( &ctx.fugue, hash0, 64 );
      sph_fugue512_close( &ctx.fugue, hash0 );
diff --git a/algo/x15/x15-gate.c b/algo/x15/x15-gate.c
index 3663820e..75e655e7 100644
--- a/algo/x15/x15-gate.c
+++ b/algo/x15/x15-gate.c
@@ -11,7 +11,7 @@ bool register_x15_algo( algo_gate_t* gate )
   gate->scanhash = (void*)&scanhash_x15;
   gate->hash = (void*)&x15hash;
 #endif
- gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
+ gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
   return true;
 };
diff --git a/algo/x15/x15-gate.h b/algo/x15/x15-gate.h
index 5af00433..fc54df03 100644
--- a/algo/x15/x15-gate.h
+++ b/algo/x15/x15-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include
-#if defined(HASH_4WAY) && defined(__AES__)
+#if defined(__AVX2__) && defined(__AES__)
   #define X15_4WAY
 #endif
diff --git a/algo/x17/x16r-4way.c b/algo/x17/x16r-4way.c
new file mode 100644
index 00000000..14c5ba52
--- /dev/null
+++ b/algo/x17/x16r-4way.c
@@ -0,0 +1,395 @@
+/**
+ * x16r algo implementation
+ *
+ * Implementation by tpruvot@github Jan 2018
+ * Optimized by JayDDee@github Jan 2018
+ */
+#include "x16r-gate.h"
+
+#if defined (X16R_4WAY)
+
+#include
+#include
+#include
+#include "algo/blake/blake-hash-4way.h"
+#include "algo/bmw/bmw-hash-4way.h"
+#include "algo/groestl/aes_ni/hash-groestl.h"
+#include "algo/skein/skein-hash-4way.h"
+#include "algo/jh/jh-hash-4way.h"
+#include "algo/keccak/keccak-hash-4way.h"
+#include "algo/shavite/sph_shavite.h"
+#include "algo/luffa/sse2/luffa_for_sse2.h"
+#include "algo/cubehash/sse2/cubehash_sse2.h"
+#include "algo/simd/sse2/nist.h"
+#include "algo/echo/aes_ni/hash_api.h"
+#include "algo/hamsi/hamsi-hash-4way.h"
+#include "algo/fugue/sph_fugue.h"
+#include "algo/shabal/shabal-hash-4way.h"
+#include "algo/whirlpool/sph_whirlpool.h"
+#include "algo/sha/sha2-hash-4way.h"
+
+static __thread uint32_t s_ntime = UINT32_MAX;
+static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 };
+
+
+typedef struct {
+   blake512_4way_context   blake;
+   bmw512_4way_context     bmw;
+   hashState_echo          echo;
+   hashState_groestl       groestl;
+   skein512_4way_context   skein;
+   jh512_4way_context      jh;
+   keccak512_4way_context  keccak;
+   hashState_luffa         luffa;
+   cubehashParam           cube;
+   sph_shavite512_context  shavite;
+   hashState_sd            simd;
+   hamsi512_4way_context   hamsi;
+   sph_fugue512_context    fugue;
+   shabal512_4way_context  shabal;
+   sph_whirlpool_context   whirlpool;
+   sha512_4way_context     sha512;
+} x16r_4way_ctx_holder;
+
+x16r_4way_ctx_holder x16r_4way_ctx __attribute__ ((aligned (64)));
+
+// Cube needs one full init so fast reinits can be done in the hash loop.
+void init_x16r_4way_ctx()
+{
+   cubehashInit( &x16r_4way_ctx.cube, 512, 16, 32 );
+};
+
+
+void x16r_4way_hash( void* output, const void* input )
+{
+   uint32_t hash0[16] __attribute__ ((aligned (64)));
+   uint32_t hash1[16] __attribute__ ((aligned (64)));
+   uint32_t hash2[16] __attribute__ ((aligned (64)));
+   uint32_t hash3[16] __attribute__ ((aligned (64)));
+   uint32_t vhash[16*4] __attribute__ ((aligned (64)));
+   uint32_t inp0[24] __attribute__ ((aligned (64)));
+   uint32_t inp1[24] __attribute__ ((aligned (64)));
+   uint32_t inp2[24] __attribute__ ((aligned (64)));
+   uint32_t inp3[24] __attribute__ ((aligned (64)));
+
+   x16r_4way_ctx_holder ctx;
+
+   void *in0 = (void*) inp0;
+   void *in1 = (void*) inp1;
+   void *in2 = (void*) inp2;
+   void *in3 = (void*) inp3;
+   int size = 80;
+
+   mm256_deinterleave_4x64( inp0, inp1, inp2, inp3, input, 640 );
+
+   if ( s_ntime == UINT32_MAX )
+   {
+      const uint8_t* tmp = (uint8_t*) inp0;
+      x16r_getAlgoString( &tmp[4], hashOrder );
+   }
+
+   // Input data is available both 64 bit interleaved (input)
+   // and deinterleaved in inp0-3.
+   // If the first function uses 64 bit data it does not need to
+   // re-interleave inp; it can use the interleaved data directly,
+   // i.e. 4way 64 bit.
+   // All other functions assume data is deinterleaved in hash0-3.
+   // All functions must exit with data deinterleaved in hash0-3.
+   // Aliases in0-3 point to either inp0-3 or hash0-3 according to
+   // their hashOrder position. Size is set accordingly.
+   for ( int i = 0; i < 16; i++ )
+   {
+      const char elem = hashOrder[i];
+      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+
+      switch ( algo )
+      {
+        case BLAKE:
+           blake512_4way_init( &ctx.blake );
+           if ( i == 0 )
+              blake512_4way( &ctx.blake, input, size );
+           else
+           {
+              mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+              blake512_4way( &ctx.blake, vhash, size );
+           }
+           blake512_4way_close( &ctx.blake, vhash );
+           mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+        break;
+        case BMW:
+           bmw512_4way_init( &ctx.bmw );
+           if ( i == 0 )
+              bmw512_4way( &ctx.bmw, input, size );
+           else
+           {
+              mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+              bmw512_4way( &ctx.bmw, vhash, size );
+           }
+           bmw512_4way_close( &ctx.bmw, vhash );
+           mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+        break;
+        case GROESTL:
+           init_groestl( &ctx.groestl, 64 );
+           update_and_final_groestl( &ctx.groestl, (char*)hash0,
+                                     (const char*)in0, size<<3 );
+           init_groestl( &ctx.groestl, 64 );
+           update_and_final_groestl( &ctx.groestl, (char*)hash1,
+                                     (const char*)in1, size<<3 );
+           init_groestl( &ctx.groestl, 64 );
+           update_and_final_groestl( &ctx.groestl, (char*)hash2,
+                                     (const char*)in2, size<<3 );
+           init_groestl( &ctx.groestl, 64 );
+           update_and_final_groestl( &ctx.groestl, (char*)hash3,
+                                     (const char*)in3, size<<3 );
+        break;
+        case SKEIN:
+           skein512_4way_init( &ctx.skein );
+           if ( i == 0 )
+              skein512_4way( &ctx.skein, input, size );
+           else
+           {
+              mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+              skein512_4way( &ctx.skein, vhash, size );
+           }
+           skein512_4way_close( &ctx.skein, vhash );
+           mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+        break;
+        case JH:
+           jh512_4way_init( &ctx.jh );
+           if ( i == 0 )
+              jh512_4way( &ctx.jh, input, size );
+           else
+           {
+              mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+              jh512_4way( &ctx.jh, vhash, size );
+           }
+           jh512_4way_close( &ctx.jh, vhash );
+           mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+        break;
+        case KECCAK:
+ keccak512_4way_init( &ctx.keccak ); + if ( i == 0 ) + keccak512_4way( &ctx.keccak, input, size ); + else + { + mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 ); + keccak512_4way( &ctx.keccak, vhash, size ); + } + keccak512_4way_close( &ctx.keccak, vhash ); + mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); + break; + case LUFFA: + init_luffa( &ctx.luffa, 512 ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, + (const BitSequence*)in0, size ); + init_luffa( &ctx.luffa, 512 ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash1, + (const BitSequence*)in1, size ); + init_luffa( &ctx.luffa, 512 ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash2, + (const BitSequence*)in2, size ); + init_luffa( &ctx.luffa, 512 ); + update_and_final_luffa( &ctx.luffa, (BitSequence*)hash3, + (const BitSequence*)in3, size ); + break; + case CUBEHASH: + cubehashReinit( &ctx.cube ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash0, + (const byte*)in0, size ); + cubehashReinit( &ctx.cube ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash1, + (const byte*)in1, size ); + cubehashReinit( &ctx.cube ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash2, + (const byte*)in2, size ); + cubehashReinit( &ctx.cube ); + cubehashUpdateDigest( &ctx.cube, (byte*) hash3, + (const byte*)in3, size ); + break; + case SHAVITE: + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in0, size ); + sph_shavite512_close( &ctx.shavite, hash0 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in1, size ); + sph_shavite512_close( &ctx.shavite, hash1 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in2, size ); + sph_shavite512_close( &ctx.shavite, hash2 ); + sph_shavite512_init( &ctx.shavite ); + sph_shavite512( &ctx.shavite, in3, size ); + sph_shavite512_close( &ctx.shavite, hash3 ); + break; + case SIMD: + init_sd( &ctx.simd, 512 ); + update_final_sd( &ctx.simd, (BitSequence *)hash0, + (const BitSequence*)in0, size<<3 ); + init_sd( &ctx.simd, 512 ); + update_final_sd( &ctx.simd, (BitSequence *)hash1, + (const BitSequence*)in1, size<<3 ); + init_sd( &ctx.simd, 512 ); + update_final_sd( &ctx.simd, (BitSequence *)hash2, + (const BitSequence*)in2, size<<3 ); + init_sd( &ctx.simd, 512 ); + update_final_sd( &ctx.simd, (BitSequence *)hash3, + (const BitSequence*)in3, size<<3 ); + break; + case ECHO: + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash0, + (const BitSequence*)in0, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash1, + (const BitSequence*)in1, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash2, + (const BitSequence*)in2, size<<3 ); + init_echo( &ctx.echo, 512 ); + update_final_echo ( &ctx.echo, (BitSequence *)hash3, + (const BitSequence*)in3, size<<3 ); + break; + case HAMSI: + mm_interleave_4x32( vhash, in0, in1, in2, in3, size<<3 ); + hamsi512_4way_init( &ctx.hamsi ); + hamsi512_4way( &ctx.hamsi, vhash, size ); + hamsi512_4way_close( &ctx.hamsi, vhash ); + mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); + break; + case FUGUE: + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in0, size ); + sph_fugue512_close( &ctx.fugue, hash0 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in1, size ); + sph_fugue512_close( &ctx.fugue, hash1 ); + sph_fugue512_init( &ctx.fugue ); + sph_fugue512( &ctx.fugue, in2, size ); + sph_fugue512_close( &ctx.fugue, hash2 ); + 
sph_fugue512_init( &ctx.fugue );
+           sph_fugue512( &ctx.fugue, in3, size );
+           sph_fugue512_close( &ctx.fugue, hash3 );
+        break;
+        case SHABAL:
+           mm_interleave_4x32( vhash, in0, in1, in2, in3, size<<3 );
+           shabal512_4way_init( &ctx.shabal );
+           shabal512_4way( &ctx.shabal, vhash, size );
+           shabal512_4way_close( &ctx.shabal, vhash );
+           mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 );
+        break;
+        case WHIRLPOOL:
+           sph_whirlpool_init( &ctx.whirlpool );
+           sph_whirlpool( &ctx.whirlpool, in0, size );
+           sph_whirlpool_close( &ctx.whirlpool, hash0 );
+           sph_whirlpool_init( &ctx.whirlpool );
+           sph_whirlpool( &ctx.whirlpool, in1, size );
+           sph_whirlpool_close( &ctx.whirlpool, hash1 );
+           sph_whirlpool_init( &ctx.whirlpool );
+           sph_whirlpool( &ctx.whirlpool, in2, size );
+           sph_whirlpool_close( &ctx.whirlpool, hash2 );
+           sph_whirlpool_init( &ctx.whirlpool );
+           sph_whirlpool( &ctx.whirlpool, in3, size );
+           sph_whirlpool_close( &ctx.whirlpool, hash3 );
+        break;
+        case SHA_512:
+           mm256_interleave_4x64( vhash, in0, in1, in2, in3, size<<3 );
+           sha512_4way_init( &ctx.sha512 );
+           sha512_4way( &ctx.sha512, vhash, size );
+           sha512_4way_close( &ctx.sha512, vhash );
+           mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 );
+        break;
+      }
+      in0 = (void*) hash0;
+      in1 = (void*) hash1;
+      in2 = (void*) hash2;
+      in3 = (void*) hash3;
+      size = 64;
+   }
+   memcpy( output,    hash0, 32 );
+   memcpy( output+32, hash1, 32 );
+   memcpy( output+64, hash2, 32 );
+   memcpy( output+96, hash3, 32 );
+}
+
+int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce,
+                        uint64_t *hashes_done )
+{
+   uint32_t hash[4*16] __attribute__ ((aligned (64)));
+   uint32_t vdata[24*4] __attribute__ ((aligned (64)));
+   uint32_t endiandata[20] __attribute__((aligned(64)));
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t n = first_nonce;
+   uint32_t *nonces = work->nonces;
+   bool *found = work->nfound;
+   int num_found = 0;
+   uint32_t *noncep0 = vdata + 73;   // 9*8 + 1
+   uint32_t *noncep1 = vdata + 75;
+   uint32_t *noncep2 = vdata + 77;
+   uint32_t *noncep3 = vdata + 79;
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+
+   for ( int k=0; k < 19; k++ )
+      be32enc( &endiandata[k], pdata[k] );
+
+   if ( s_ntime != pdata[17] )
+   {
+      uint32_t ntime = swab32(pdata[17]);
+      x16r_getAlgoString( (const uint8_t*) (&endiandata[1]), hashOrder );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+         applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
+   }
+
+   if ( opt_benchmark )
+      ptarget[7] = 0x0cff;
+
+   uint64_t *edata = (uint64_t*)endiandata;
+   mm256_interleave_4x64( (uint64_t*)vdata, edata, edata, edata, edata, 640 );
+
+   do
+   {
+      found[0] = found[1] = found[2] = found[3] = false;
+      be32enc( noncep0, n );
+      be32enc( noncep1, n+1 );
+      be32enc( noncep2, n+2 );
+      be32enc( noncep3, n+3 );
+      x16r_4way_hash( hash, vdata );
+      pdata[19] = n;
+
+      if ( hash[7] <= Htarg && fulltest( hash, ptarget ) )
+      {
+         found[0] = true;
+         num_found++;
+         nonces[0] = n;
+         work_set_target_ratio( work, hash );
+      }
+      if ( (hash+8)[7] <= Htarg && fulltest( hash+8, ptarget ) )
+      {
+         found[1] = true;
+         num_found++;
+         nonces[1] = n+1;
+         work_set_target_ratio( work, hash+8 );
+      }
+      if ( (hash+16)[7] <= Htarg && fulltest( hash+16, ptarget ) )
+      {
+         found[2] = true;
+         num_found++;
+         nonces[2] = n+2;
+         work_set_target_ratio( work, hash+16 );
+      }
+      if ( (hash+24)[7] <= Htarg && fulltest( hash+24, ptarget ) )
+      {
+         found[3] = true;
+         num_found++;
+         nonces[3] =
n+3; + work_set_target_ratio( work, hash+24 ); + } + n += 4; + } while ( ( num_found == 0 ) && ( n < max_nonce ) && !(*restart) ); + + *hashes_done = n - first_nonce + 1; + return num_found; +} + +#endif diff --git a/algo/x17/x16r-gate.c b/algo/x17/x16r-gate.c new file mode 100644 index 00000000..3f22124d --- /dev/null +++ b/algo/x17/x16r-gate.c @@ -0,0 +1,35 @@ +#include "x16r-gate.h" + +void x16r_getAlgoString( const uint8_t* prevblock, char *output ) +{ + char *sptr = output; + for ( int j = 0; j < X16R_HASH_FUNC_COUNT; j++ ) + { + uint8_t b = (15 - j) >> 1; // 16 first ascii hex chars (lsb in uint256) + uint8_t algoDigit = (j & 1) ? prevblock[b] & 0xF : prevblock[b] >> 4; + if (algoDigit >= 10) + sprintf(sptr, "%c", 'A' + (algoDigit - 10)); + else + sprintf(sptr, "%u", (uint32_t) algoDigit); + sptr++; + } + *sptr = '\0'; +} + + +bool register_x16r_algo( algo_gate_t* gate ) +{ +#if defined (X16R_4WAY) + init_x16r_4way_ctx(); + gate->scanhash = (void*)&scanhash_x16r_4way; + gate->hash = (void*)&x16r_4way_hash; +#else + init_x16r_ctx(); + gate->scanhash = (void*)&scanhash_x16r; + gate->hash = (void*)&x16r_hash; +#endif + gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; + gate->set_target = (void*)&alt_set_target; + return true; +}; + diff --git a/algo/x17/x16r-gate.h b/algo/x17/x16r-gate.h new file mode 100644 index 00000000..b679a61a --- /dev/null +++ b/algo/x17/x16r-gate.h @@ -0,0 +1,54 @@ +#ifndef X16R_GATE_H__ +#define X16R_GATE_H__ 1 + +#include "algo-gate-api.h" +#include "avxdefs.h" +#include + +#if defined(__AVX2__) && defined(__AES__) + #define X16R_4WAY +#endif + +enum x16r_Algo { + BLAKE = 0, + BMW, + GROESTL, + JH, + KECCAK, + SKEIN, + LUFFA, + CUBEHASH, + SHAVITE, + SIMD, + ECHO, + HAMSI, + FUGUE, + SHABAL, + WHIRLPOOL, + SHA_512, + X16R_HASH_FUNC_COUNT +}; + +bool register_x16r_algo( algo_gate_t* gate ); +void x16r_getAlgoString( const uint8_t* prevblock, char *output ); + +#if defined(X16R_4WAY) + +void x16r_4way_hash( void *state, const void *input ); + +int scanhash_x16r_4way( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_x16r_4way_ctx(); + +#endif + +void x16r_hash( void *state, const void *input ); + +int scanhash_x16r( int thr_id, struct work *work, uint32_t max_nonce, + uint64_t *hashes_done ); + +void init_x16r_ctx(); + +#endif + diff --git a/algo/x17/x16r.c b/algo/x17/x16r.c new file mode 100644 index 00000000..08b5a422 --- /dev/null +++ b/algo/x17/x16r.c @@ -0,0 +1,252 @@ +/** + * x16r algo implementation + * + * Implementation by tpruvot@github Jan 2018 + * Optimized by JayDDee@github Jan 2018 + */ +#include "x16r-gate.h" + +#include +#include +#include +#include "algo/blake/sph_blake.h" +#include "algo/bmw/sph_bmw.h" +#include "algo/groestl/sph_groestl.h" +#include "algo/jh/sph_jh.h" +#include "algo/keccak/sph_keccak.h" +#include "algo/skein/sph_skein.h" +#include "algo/shavite/sph_shavite.h" +#include "algo/luffa/sse2/luffa_for_sse2.h" +#include "algo/cubehash/sse2/cubehash_sse2.h" +#include "algo/simd/sse2/nist.h" +#include "algo/echo/sph_echo.h" +#include "algo/hamsi/sph_hamsi.h" +#include "algo/fugue/sph_fugue.h" +#include "algo/shabal/sph_shabal.h" +#include "algo/whirlpool/sph_whirlpool.h" +#include +#ifndef NO_AES_NI + #include "algo/echo/aes_ni/hash_api.h" + #include "algo/groestl/aes_ni/hash-groestl.h" +#endif + +static __thread uint32_t s_ntime = UINT32_MAX; +static __thread char hashOrder[X16R_HASH_FUNC_COUNT + 1] = { 0 }; + +typedef struct { +#ifdef NO_AES_NI + sph_groestl512_context groestl; 
+   sph_echo512_context     echo;
+#else
+   hashState_echo          echo;
+   hashState_groestl       groestl;
+#endif
+   sph_blake512_context    blake;
+   sph_bmw512_context      bmw;
+   sph_skein512_context    skein;
+   sph_jh512_context       jh;
+   sph_keccak512_context   keccak;
+   hashState_luffa         luffa;
+   cubehashParam           cube;
+   sph_shavite512_context  shavite;
+   hashState_sd            simd;
+   sph_hamsi512_context    hamsi;
+   sph_fugue512_context    fugue;
+   sph_shabal512_context   shabal;
+   sph_whirlpool_context   whirlpool;
+   SHA512_CTX              sha512;
+} x16r_ctx_holder;
+
+x16r_ctx_holder x16r_ctx __attribute__ ((aligned (64)));
+
+void init_x16r_ctx()
+{
+//#ifdef NO_AES_NI
+//   sph_groestl512_init(&x16r_ctx.groestl );
+//   sph_echo512_init(&x16r_ctx.echo);
+//#else
+//   init_echo( &x16r_ctx.echo, 512 );
+//   init_groestl( &x16r_ctx.groestl, 64 );
+//#endif
+//   sph_blake512_init( &x16r_ctx.blake );
+//   sph_bmw512_init( &x16r_ctx.bmw );
+//   sph_skein512_init( &x16r_ctx.skein );
+//   sph_jh512_init( &x16r_ctx.jh );
+//   sph_keccak512_init( &x16r_ctx.keccak );
+//   init_luffa( &x16r_ctx.luffa, 512 );
+   cubehashInit( &x16r_ctx.cube, 512, 16, 32 );
+//   sph_shavite512_init( &x16r_ctx.shavite );
+//   init_sd( &x16r_ctx.simd, 512 );
+//   sph_hamsi512_init( &x16r_ctx.hamsi );
+//   sph_fugue512_init( &x16r_ctx.fugue );
+//   sph_shabal512_init( &x16r_ctx.shabal );
+//   sph_whirlpool_init( &x16r_ctx.whirlpool );
+//   SHA512_Init( &x16r_ctx.sha512 );
+};
+
+void x16r_hash( void* output, const void* input )
+{
+   uint32_t _ALIGN(128) hash[16];
+   x16r_ctx_holder ctx;
+   void *in = (void*) input;
+   int size = 80;
+
+   if ( s_ntime == UINT32_MAX )
+   {
+      const uint8_t* in8 = (uint8_t*) input;
+      x16r_getAlgoString( &in8[4], hashOrder );
+   }
+
+   for ( int i = 0; i < 16; i++ )
+   {
+      const char elem = hashOrder[i];
+      const uint8_t algo = elem >= 'A' ? elem - 'A' + 10 : elem - '0';
+
+      switch ( algo )
+      {
+        case BLAKE:
+           sph_blake512_init( &ctx.blake );
+           sph_blake512( &ctx.blake, in, size );
+           sph_blake512_close( &ctx.blake, hash );
+        break;
+        case BMW:
+           sph_bmw512_init( &ctx.bmw );
+           sph_bmw512(&ctx.bmw, in, size);
+           sph_bmw512_close(&ctx.bmw, hash);
+        break;
+        case GROESTL:
+#ifdef NO_AES_NI
+           sph_groestl512_init( &ctx.groestl );
+           sph_groestl512( &ctx.groestl, in, size );
+           sph_groestl512_close(&ctx.groestl, hash);
+#else
+           init_groestl( &ctx.groestl, 64 );
+           update_and_final_groestl( &ctx.groestl, (char*)hash,
+                                     (const char*)in, size<<3 );
+#endif
+        break;
+        case SKEIN:
+           sph_skein512_init( &ctx.skein );
+           sph_skein512( &ctx.skein, in, size );
+           sph_skein512_close( &ctx.skein, hash );
+        break;
+        case JH:
+           sph_jh512_init( &ctx.jh );
+           sph_jh512(&ctx.jh, in, size );
+           sph_jh512_close(&ctx.jh, hash );
+        break;
+        case KECCAK:
+           sph_keccak512_init( &ctx.keccak );
+           sph_keccak512( &ctx.keccak, in, size );
+           sph_keccak512_close( &ctx.keccak, hash );
+        break;
+        case LUFFA:
+           init_luffa( &ctx.luffa, 512 );
+           update_and_final_luffa( &ctx.luffa, (BitSequence*)hash,
+                                   (const BitSequence*)in, size );
+        break;
+        case CUBEHASH:
+           memcpy( &ctx.cube, &x16r_ctx.cube, sizeof(cubehashParam) );
+           cubehashUpdateDigest( &ctx.cube, (byte*) hash,
+                                 (const byte*)in, size );
+        break;
+        case SHAVITE:
+           sph_shavite512_init( &ctx.shavite );
+           sph_shavite512( &ctx.shavite, in, size );
+           sph_shavite512_close( &ctx.shavite, hash );
+        break;
+        case SIMD:
+           init_sd( &ctx.simd, 512 );
+           update_final_sd( &ctx.simd, (BitSequence *)hash,
+                            (const BitSequence*)in, size<<3 );
+        break;
+        case ECHO:
+#ifdef NO_AES_NI
+           sph_echo512_init( &ctx.echo );
+           sph_echo512( &ctx.echo, in, size );
+           sph_echo512_close( &ctx.echo, hash );
+#else
+           init_echo( &ctx.echo, 512 );
+           update_final_echo ( &ctx.echo, (BitSequence *)hash,
+                               (const BitSequence*)in, size<<3 );
+#endif
+        break;
+        case HAMSI:
+           sph_hamsi512_init( &ctx.hamsi );
+           sph_hamsi512( &ctx.hamsi, in, size );
+           sph_hamsi512_close( &ctx.hamsi, hash );
+        break;
+        case FUGUE:
+           sph_fugue512_init( &ctx.fugue );
+           sph_fugue512( &ctx.fugue, in, size );
+           sph_fugue512_close( &ctx.fugue, hash );
+        break;
+        case SHABAL:
+           sph_shabal512_init( &ctx.shabal );
+           sph_shabal512( &ctx.shabal, in, size );
+           sph_shabal512_close( &ctx.shabal, hash );
+        break;
+        case WHIRLPOOL:
+           sph_whirlpool_init( &ctx.whirlpool );
+           sph_whirlpool( &ctx.whirlpool, in, size );
+           sph_whirlpool_close( &ctx.whirlpool, hash );
+        break;
+        case SHA_512:
+           SHA512_Init( &ctx.sha512 );
+           SHA512_Update( &ctx.sha512, in, size );
+           SHA512_Final( (unsigned char*) hash, &ctx.sha512 );
+        break;
+      }
+      in = (void*) hash;
+      size = 64;
+   }
+   memcpy(output, hash, 32);
+}
+
+int scanhash_x16r( int thr_id, struct work *work, uint32_t max_nonce,
+                   uint64_t *hashes_done )
+{
+   uint32_t _ALIGN(128) hash32[8];
+   uint32_t _ALIGN(128) endiandata[20];
+   uint32_t *pdata = work->data;
+   uint32_t *ptarget = work->target;
+   const uint32_t Htarg = ptarget[7];
+   const uint32_t first_nonce = pdata[19];
+   uint32_t nonce = first_nonce;
+   volatile uint8_t *restart = &(work_restart[thr_id].restart);
+
+   for ( int k=0; k < 19; k++ )
+      be32enc( &endiandata[k], pdata[k] );
+
+   if ( s_ntime != pdata[17] )
+   {
+      uint32_t ntime = swab32(pdata[17]);
+      x16r_getAlgoString( (const uint8_t*) (&endiandata[1]), hashOrder );
+      s_ntime = ntime;
+      if ( opt_debug && !thr_id )
+         applog( LOG_DEBUG, "hash order %s (%08x)", hashOrder, ntime );
+   }
+
+   if ( opt_benchmark )
+      ptarget[7] = 0x0cff;
+
+   do
+   {
+      be32enc( &endiandata[19], nonce );
+      x16r_hash( hash32, endiandata );
+
+      if ( hash32[7] <= Htarg && fulltest( hash32, ptarget ) )
+      {
+         work_set_target_ratio( work, hash32 );
+         pdata[19] = nonce;
+         *hashes_done = pdata[19] - first_nonce;
+         return 1;
+      }
+      nonce++;
+
+   } while ( nonce < max_nonce && !(*restart) );
+
+   pdata[19] = nonce;
+   *hashes_done = pdata[19] - first_nonce + 1;
+   return 0;
+}
diff --git a/algo/x17/x17-4way.c b/algo/x17/x17-4way.c
index 555fe9b4..12471b4d 100644
--- a/algo/x17/x17-4way.c
+++ b/algo/x17/x17-4way.c
@@ -1,6 +1,6 @@
 #include "x17-gate.h"
 
-#if defined(__AVX2__) && defined(__AES__)
+#if defined(X17_4WAY)
 
 #include
 #include
@@ -17,12 +17,12 @@
 #include "algo/shavite/sph_shavite.h"
 #include "algo/simd/sse2/nist.h"
 #include "algo/echo/aes_ni/hash_api.h"
-#include "algo/hamsi/sph_hamsi.h"
+#include "algo/hamsi/hamsi-hash-4way.h"
 #include "algo/fugue/sph_fugue.h"
 #include "algo/shabal/shabal-hash-4way.h"
 #include "algo/whirlpool/sph_whirlpool.h"
-#include "algo/haval/sph-haval.h"
-#include
+#include "algo/haval/haval-hash-4way.h"
+#include "algo/sha/sha2-hash-4way.h"
 
 typedef struct {
     blake512_4way_context blake;
@@ -36,12 +36,12 @@ typedef struct {
     sph_shavite512_context shavite;
     hashState_sd simd;
     hashState_echo echo;
-    sph_hamsi512_context hamsi;
+    hamsi512_4way_context hamsi;
     sph_fugue512_context fugue;
     shabal512_4way_context shabal;
     sph_whirlpool_context whirlpool;
-    SHA512_CTX sha512;
-    sph_haval256_5_context haval;
+    sha512_4way_context sha512;
+    haval256_5_4way_context haval;
 } x17_4way_ctx_holder;
 
 x17_4way_ctx_holder x17_4way_ctx __attribute__ ((aligned (64)));
@@ -59,11 +59,11 @@ void init_x17_4way_ctx()
     sph_shavite512_init( &x17_4way_ctx.shavite );
     init_sd( &x17_4way_ctx.simd, 512 );
     init_echo( &x17_4way_ctx.echo, 512 );
-
sph_hamsi512_init( &x17_4way_ctx.hamsi ); + hamsi512_4way_init( &x17_4way_ctx.hamsi ); sph_fugue512_init( &x17_4way_ctx.fugue ); shabal512_4way_init( &x17_4way_ctx.shabal ); - SHA512_Init( &x17_4way_ctx.sha512 ); - sph_haval256_5_init( &x17_4way_ctx.haval ); + sha512_4way_init( &x17_4way_ctx.sha512 ); + haval256_5_4way_init( &x17_4way_ctx.haval ); }; void x17_4way_hash( void *state, const void *input ) @@ -73,6 +73,7 @@ void x17_4way_hash( void *state, const void *input ) uint64_t hash2[8] __attribute__ ((aligned (64))); uint64_t hash3[8] __attribute__ ((aligned (64))); uint64_t vhash[8*4] __attribute__ ((aligned (64))); + uint64_t vhash32[8*4] __attribute__ ((aligned (64))); x17_4way_ctx_holder ctx; memcpy( &ctx, &x17_4way_ctx, sizeof(x17_4way_ctx) ); @@ -111,10 +112,9 @@ void x17_4way_hash( void *state, const void *input ) keccak512_4way( &ctx.keccak, vhash, 64 ); keccak512_4way_close( &ctx.keccak, vhash ); - // Serial to the end mm256_deinterleave_4x64( hash0, hash1, hash2, hash3, vhash, 512 ); - // 7 Luffa + // 7 Luffa serial update_and_final_luffa( &ctx.luffa, (BitSequence*)hash0, (const BitSequence*)hash0, 64 ); memcpy( &ctx.luffa, &x17_4way_ctx.luffa, sizeof(hashState_luffa) ); @@ -178,18 +178,11 @@ void x17_4way_hash( void *state, const void *input ) update_final_echo( &ctx.echo, (BitSequence *)hash3, (const BitSequence *) hash3, 512 ); - // 12 Hamsi - sph_hamsi512( &ctx.hamsi, hash0, 64 ); - sph_hamsi512_close( &ctx.hamsi, hash0 ); - memcpy( &ctx.hamsi, &x17_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); - sph_hamsi512( &ctx.hamsi, hash1, 64 ); - sph_hamsi512_close( &ctx.hamsi, hash1 ); - memcpy( &ctx.hamsi, &x17_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); - sph_hamsi512( &ctx.hamsi, hash2, 64 ); - sph_hamsi512_close( &ctx.hamsi, hash2 ); - memcpy( &ctx.hamsi, &x17_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); - sph_hamsi512( &ctx.hamsi, hash3, 64 ); - sph_hamsi512_close( &ctx.hamsi, hash3 ); + // 12 Hamsi parallel 4way 32 bit + mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, 512 ); + hamsi512_4way( &ctx.hamsi, vhash, 64 ); + hamsi512_4way_close( &ctx.hamsi, vhash ); + mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, 512 ); // 13 Fugue sph_fugue512( &ctx.fugue, hash0, 64 ); @@ -226,39 +219,17 @@ void x17_4way_hash( void *state, const void *input ) sph_whirlpool( &ctx.whirlpool, hash3, 64 ); sph_whirlpool_close( &ctx.whirlpool, hash3 ); - // 16 SHA512 - SHA512_Update( &ctx.sha512, hash0, 64 ); - SHA512_Final( (unsigned char*)hash0, &ctx.sha512 ); - memcpy( &ctx.sha512, &x17_4way_ctx.sha512, sizeof(SHA512_CTX) ); - SHA512_Update( &ctx.sha512, hash1, 64 ); - SHA512_Final( (unsigned char*)hash1, &ctx.sha512 ); - memcpy( &ctx.sha512, &x17_4way_ctx.sha512, sizeof(SHA512_CTX) ); - SHA512_Update( &ctx.sha512, hash2, 64 ); - SHA512_Final( (unsigned char*)hash2, &ctx.sha512 ); - memcpy( &ctx.sha512, &x17_4way_ctx.sha512, sizeof(SHA512_CTX) ); - SHA512_Update( &ctx.sha512, hash3, 64 ); - SHA512_Final( (unsigned char*)hash3, &ctx.sha512 ); - - // 17 Haval - sph_haval256_5( &ctx.haval, (const void*)hash0, 64 ); - sph_haval256_5_close( &ctx.haval, hash0 ); - memcpy( &ctx.haval, &x17_4way_ctx.haval, - sizeof(sph_haval256_5_context) ); - sph_haval256_5( &ctx.haval, (const void*)hash1, 64 ); - sph_haval256_5_close( &ctx.haval, hash1 ); - memcpy( &ctx.haval, &x17_4way_ctx.haval, - sizeof(sph_haval256_5_context) ); - sph_haval256_5( &ctx.haval, (const void*)hash2, 64 ); - sph_haval256_5_close( &ctx.haval, hash2 ); - memcpy( &ctx.haval, &x17_4way_ctx.haval, - 
sizeof(sph_haval256_5_context) ); - sph_haval256_5( &ctx.haval, (const void*)hash3, 64 ); - sph_haval256_5_close( &ctx.haval, hash3 ); - - memcpy( state, hash0, 32 ); - memcpy( state+32, hash1, 32 ); - memcpy( state+64, hash2, 32 ); - memcpy( state+96, hash3, 32 ); + // 16 SHA512 parallel 64 bit + mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, 512 ); + sha512_4way( &ctx.sha512, vhash, 64 ); + sha512_4way_close( &ctx.sha512, vhash ); + + // 17 Haval parallel 32 bit + mm256_reinterleave_4x32( vhash32, vhash, 512 ); + haval256_5_4way( &ctx.haval, vhash32, 64 ); + haval256_5_4way_close( &ctx.haval, vhash ); + + mm_deinterleave_4x32( state, state+32, state+64, state+96, vhash, 256 ); } int scanhash_x17_4way( int thr_id, struct work *work, uint32_t max_nonce, diff --git a/algo/x17/x17-gate.c b/algo/x17/x17-gate.c index 95cd77b5..3aa1cc76 100644 --- a/algo/x17/x17-gate.c +++ b/algo/x17/x17-gate.c @@ -11,7 +11,7 @@ bool register_x17_algo( algo_gate_t* gate ) gate->scanhash = (void*)&scanhash_x17; gate->hash = (void*)&x17_hash; #endif - gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT; + gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT; return true; }; diff --git a/algo/x17/x17-gate.h b/algo/x17/x17-gate.h index 7767fd01..32dadff0 100644 --- a/algo/x17/x17-gate.h +++ b/algo/x17/x17-gate.h @@ -4,7 +4,7 @@ #include "algo-gate-api.h" #include -#if defined(HASH_4WAY) && defined(__AES__) +#if defined(__AVX2__) && defined(__AES__) #define X17_4WAY #endif diff --git a/algo/x17/xevan-4way.c b/algo/x17/xevan-4way.c index 091e1440..847dadde 100644 --- a/algo/x17/xevan-4way.c +++ b/algo/x17/xevan-4way.c @@ -1,6 +1,6 @@ #include "xevan-gate.h" -#if defined(__AVX2__) && defined(__AES__) +#if defined(XEVAN_4WAY) #include #include @@ -17,13 +17,12 @@ #include "algo/cubehash/sse2/cubehash_sse2.h" #include "algo/simd/sse2/nist.h" #include "algo/echo/aes_ni/hash_api.h" -#include "algo/hamsi/sph_hamsi.h" +#include "algo/hamsi/hamsi-hash-4way.h" #include "algo/fugue/sph_fugue.h" #include "algo/shabal/shabal-hash-4way.h" #include "algo/whirlpool/sph_whirlpool.h" -#include "algo/sha/sph_sha2.h" -#include "algo/haval/sph-haval.h" -#include +#include "algo/sha/sha2-hash-4way.h" +#include "algo/haval/haval-hash-4way.h" typedef struct { blake512_4way_context blake; @@ -37,12 +36,12 @@ typedef struct { sph_shavite512_context shavite; hashState_sd simd; hashState_echo echo; - sph_hamsi512_context hamsi; + hamsi512_4way_context hamsi; sph_fugue512_context fugue; shabal512_4way_context shabal; sph_whirlpool_context whirlpool; - SHA512_CTX sha512; - sph_haval256_5_context haval; + sha512_4way_context sha512; + haval256_5_4way_context haval; } xevan_4way_ctx_holder; xevan_4way_ctx_holder xevan_4way_ctx __attribute__ ((aligned (64))); @@ -62,12 +61,12 @@ void init_xevan_4way_ctx() sph_shavite512_init( &xevan_4way_ctx.shavite ); init_sd( &xevan_4way_ctx.simd, 512 ); init_echo( &xevan_4way_ctx.echo, 512 ); - sph_hamsi512_init( &xevan_4way_ctx.hamsi ); + hamsi512_4way_init( &xevan_4way_ctx.hamsi ); sph_fugue512_init( &xevan_4way_ctx.fugue ); shabal512_4way_init( &xevan_4way_ctx.shabal ); sph_whirlpool_init( &xevan_4way_ctx.whirlpool ); - SHA512_Init( &xevan_4way_ctx.sha512 ); - sph_haval256_5_init( &xevan_4way_ctx.haval ); + sha512_4way_init( &xevan_4way_ctx.sha512 ); + haval256_5_4way_init( &xevan_4way_ctx.haval ); }; void xevan_4way_blake512_midstate( const void* input ) @@ -84,6 +83,7 @@ void xevan_4way_hash( void *output, const void *input ) uint64_t hash2[16] __attribute__ 
((aligned (64))); uint64_t hash3[16] __attribute__ ((aligned (64))); uint64_t vhash[16<<2] __attribute__ ((aligned (64))); + uint64_t vhash32[16<<2] __attribute__ ((aligned (64))); const int dataLen = 128; const int midlen = 64; // bytes const int tail = 80 - midlen; // 16 @@ -193,17 +193,11 @@ void xevan_4way_hash( void *output, const void *input ) update_final_echo( &ctx.echo, (BitSequence *)hash3, (const BitSequence *) hash3, dataLen<<3 ); - sph_hamsi512( &ctx.hamsi, hash0, dataLen ); - sph_hamsi512_close( &ctx.hamsi, hash0 ); - memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); - sph_hamsi512( &ctx.hamsi, hash1, dataLen ); - sph_hamsi512_close( &ctx.hamsi, hash1 ); - memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); - sph_hamsi512( &ctx.hamsi, hash2, dataLen ); - sph_hamsi512_close( &ctx.hamsi, hash2 ); - memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); - sph_hamsi512( &ctx.hamsi, hash3, dataLen ); - sph_hamsi512_close( &ctx.hamsi, hash3 ); + // Parallel 32 bit + mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); + hamsi512_4way( &ctx.hamsi, vhash, dataLen ); + hamsi512_4way_close( &ctx.hamsi, vhash ); + mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); sph_fugue512( &ctx.fugue, hash0, dataLen ); sph_fugue512_close( &ctx.fugue, hash0 ); @@ -217,7 +211,7 @@ void xevan_4way_hash( void *output, const void *input ) sph_fugue512( &ctx.fugue, hash3, dataLen ); sph_fugue512_close( &ctx.fugue, hash3 ); - // Parallel 4way + // Parallel 4way 32 bit mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); shabal512_4way( &ctx.shabal, vhash, dataLen ); shabal512_4way_close( &ctx.shabal, vhash ); @@ -239,32 +233,14 @@ void xevan_4way_hash( void *output, const void *input ) sph_whirlpool( &ctx.whirlpool, hash3, dataLen ); sph_whirlpool_close( &ctx.whirlpool, hash3 ); - SHA512_Update( &ctx.sha512, hash0, dataLen ); - SHA512_Final( (unsigned char*)hash0, &ctx.sha512 ); - memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) ); - SHA512_Update( &ctx.sha512, hash1, dataLen ); - SHA512_Final( (unsigned char*)hash1, &ctx.sha512 ); - memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) ); - SHA512_Update( &ctx.sha512, hash2, dataLen ); - SHA512_Final( (unsigned char*)hash2, &ctx.sha512 ); - memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) ); - SHA512_Update( &ctx.sha512, hash3, dataLen ); - SHA512_Final( (unsigned char*)hash3, &ctx.sha512 ); - - sph_haval256_5( &ctx.haval, (const void*)hash0, dataLen ); - sph_haval256_5_close( &ctx.haval, hash0 ); - memcpy( &ctx.haval, &xevan_4way_ctx.haval, - sizeof(sph_haval256_5_context) ); - sph_haval256_5( &ctx.haval, (const void*)hash1, dataLen ); - sph_haval256_5_close( &ctx.haval, hash1 ); - memcpy( &ctx.haval, &xevan_4way_ctx.haval, - sizeof(sph_haval256_5_context) ); - sph_haval256_5( &ctx.haval, (const void*)hash2, dataLen ); - sph_haval256_5_close( &ctx.haval, hash2 ); - memcpy( &ctx.haval, &xevan_4way_ctx.haval, - sizeof(sph_haval256_5_context) ); - sph_haval256_5( &ctx.haval, (const void*)hash3, dataLen ); - sph_haval256_5_close( &ctx.haval, hash3 ); + mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); + sha512_4way( &ctx.sha512, vhash, dataLen ); + sha512_4way_close( &ctx.sha512, vhash ); + + mm256_reinterleave_4x32( vhash32, vhash, dataLen<<3 ); + haval256_5_4way( &ctx.haval, vhash32, dataLen ); + haval256_5_4way_close( &ctx.haval, vhash ); + mm_deinterleave_4x32( hash0, hash1, 
hash2, hash3, vhash, dataLen<<3 ); mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); memset( &vhash[ 4<<2 ], 0, (dataLen-32) << 2 ); @@ -366,17 +342,10 @@ void xevan_4way_hash( void *output, const void *input ) update_final_echo( &ctx.echo, (BitSequence *)hash3, (const BitSequence *) hash3, dataLen<<3 ); - sph_hamsi512( &ctx.hamsi, hash0, dataLen ); - sph_hamsi512_close( &ctx.hamsi, hash0 ); - memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); - sph_hamsi512( &ctx.hamsi, hash1, dataLen ); - sph_hamsi512_close( &ctx.hamsi, hash1 ); - memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); - sph_hamsi512( &ctx.hamsi, hash2, dataLen ); - sph_hamsi512_close( &ctx.hamsi, hash2 ); - memcpy( &ctx.hamsi, &xevan_4way_ctx.hamsi, sizeof(sph_hamsi512_context) ); - sph_hamsi512( &ctx.hamsi, hash3, dataLen ); - sph_hamsi512_close( &ctx.hamsi, hash3 ); + mm_interleave_4x32( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); + hamsi512_4way( &ctx.hamsi, vhash, dataLen ); + hamsi512_4way_close( &ctx.hamsi, vhash ); + mm_deinterleave_4x32( hash0, hash1, hash2, hash3, vhash, dataLen<<3 ); sph_fugue512( &ctx.fugue, hash0, dataLen ); sph_fugue512_close( &ctx.fugue, hash0 ); @@ -410,37 +379,16 @@ void xevan_4way_hash( void *output, const void *input ) sph_whirlpool( &ctx.whirlpool, hash3, dataLen ); sph_whirlpool_close( &ctx.whirlpool, hash3 ); - SHA512_Update( &ctx.sha512, hash0, dataLen ); - SHA512_Final( (unsigned char*)hash0, &ctx.sha512 ); - memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) ); - SHA512_Update( &ctx.sha512, hash1, dataLen ); - SHA512_Final( (unsigned char*)hash1, &ctx.sha512 ); - memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) ); - SHA512_Update( &ctx.sha512, hash2, dataLen ); - SHA512_Final( (unsigned char*)hash2, &ctx.sha512 ); - memcpy( &ctx.sha512, &xevan_4way_ctx.sha512, sizeof(SHA512_CTX) ); - SHA512_Update( &ctx.sha512, hash3, dataLen ); - SHA512_Final( (unsigned char*)hash3, &ctx.sha512 ); - - sph_haval256_5( &ctx.haval, (const void*)hash0, dataLen ); - sph_haval256_5_close( &ctx.haval, hash0 ); - memcpy( &ctx.haval, &xevan_4way_ctx.haval, - sizeof(sph_haval256_5_context) ); - sph_haval256_5( &ctx.haval, (const void*)hash1, dataLen ); - sph_haval256_5_close( &ctx.haval, hash1 ); - memcpy( &ctx.haval, &xevan_4way_ctx.haval, - sizeof(sph_haval256_5_context) ); - sph_haval256_5( &ctx.haval, (const void*)hash2, dataLen ); - sph_haval256_5_close( &ctx.haval, hash2 ); - memcpy( &ctx.haval, &xevan_4way_ctx.haval, - sizeof(sph_haval256_5_context) ); - sph_haval256_5( &ctx.haval, (const void*)hash3, dataLen ); - sph_haval256_5_close( &ctx.haval, hash3 ); - - memcpy( output, hash0, 32 ); - memcpy( output+32, hash1, 32 ); - memcpy( output+64, hash2, 32 ); - memcpy( output+96, hash3, 32 ); + mm256_interleave_4x64( vhash, hash0, hash1, hash2, hash3, dataLen<<3 ); + sha512_4way( &ctx.sha512, vhash, dataLen ); + sha512_4way_close( &ctx.sha512, vhash ); + + mm256_reinterleave_4x32( vhash32, vhash, dataLen<<3 ); + haval256_5_4way( &ctx.haval, vhash32, dataLen ); + haval256_5_4way_close( &ctx.haval, vhash32 ); + + mm_deinterleave_4x32( output, output+32, output+64, output+96, + vhash32, 256 ); } int scanhash_xevan_4way( int thr_id, struct work *work, uint32_t max_nonce, diff --git a/algo/x17/xevan-gate.c b/algo/x17/xevan-gate.c index 910ad8bf..f0ebc60b 100644 --- a/algo/x17/xevan-gate.c +++ b/algo/x17/xevan-gate.c @@ -16,7 +16,7 @@ bool register_xevan_algo( algo_gate_t* gate ) gate->scanhash = 
(void*)&scanhash_xevan;
   gate->hash = (void*)&xevan_hash;
 #endif
- gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT | FOUR_WAY_OPT;
+ gate->optimizations = SSE2_OPT | AES_OPT | AVX_OPT | AVX2_OPT;
   gate->set_target = (void*)&xevan_set_target;
   gate->get_max64 = (void*)&get_max64_0xffffLL;
   return true;
diff --git a/algo/x17/xevan-gate.h b/algo/x17/xevan-gate.h
index 51f7716b..650b4f12 100644
--- a/algo/x17/xevan-gate.h
+++ b/algo/x17/xevan-gate.h
@@ -4,7 +4,7 @@
 #include "algo-gate-api.h"
 #include
-#if defined(HASH_4WAY) && defined(__AES__)
+#if defined(__AVX2__) && defined(__AES__)
   #define XEVAN_4WAY
 #endif
diff --git a/avxdefs.h b/avxdefs.h
index cf273bb0..fa018b47 100644
--- a/avxdefs.h
+++ b/avxdefs.h
@@ -2,16 +2,16 @@
 #define AVXDEFS_H__
 
 // Some tools to help using AVX and AVX2.
-// At this time SSE2 is sufficient for all 128 bit code in this file
-// but could change without notice.
-// 256 bit requires AVX2.
+// SSE2 is required for most 128 bit vector operations with the exception
+// of _mm_shuffle_epi8, used by byteswap, which needs SSSE3.
+// AVX2 is required for all 256 bit vector operations.
 // AVX512 has more powerful 256 bit instructions but with AVX512 available
 // there is little reason to use them.
 // Proper alignment of data is required, 16 bytes for 128 bit vectors and
 // 32 bytes for 256 bit vectors. 64 byte alignment is recommended for
 // best cache alignment.
 //
-// There exist dupplicates of some functions. In general the first defined
+// There exist duplicates of some functions. In general the first defined
 // is preferred as it is more efficient but also more restrictive and may
 // not be applicable. The less efficient versions are more flexible.
@@ -40,13 +40,6 @@
 // Constant minus 1
 #define mm_neg1 _mm_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )
 
-// Lane index, useful for byte rotate using shuffle
-#define mm_lanex_64 _mm_set_epi64( 1ULL, 0ULL );
-#define mm_lanex_32 _mm_set_epi32( 3UL, 2UL, 1UL, 0UL );
-#define mm_lanex_16 _mm_set_epi16( 7U, 6U, 5U, 4U, 3U, 2U, 1U, 0U );
-#define mm_lanex_8 _mm_set_epi8( 15U, 14U, 13U, 12U, 11U, 10U , 9U, 8U, \
-                                  7U,  6U,  5U,  4U,  3U,  2U,  1U, 0U );
-
 //
 // Basic operations without equivalent SIMD intrinsic
@@ -335,16 +328,6 @@ inline __m128i mm_byteswap_16( __m128i x )
 // Constant minus 1
 #define mm256_neg1 _mm256_set1_epi64x( 0xFFFFFFFFFFFFFFFFULL )
 
-// Lane index, useful for rotate using permutevar
-#define mm256_lane_64 _mm_set_epi64x( 3ULL, 2ULL, 1ULL, 0ULL );
-#define mm256_lane_32 _mm_set_epi32( 7UL, 6UL, 5UL, 4UL, 3UL, 2UL, 1UL, 0UL );
-#define mm256_lane_16 _mm_set_epi16( 15U, 14U, 13U, 12U, 11U, 10U , 9U, 8U, \
-                                      7U,  6U,  5U,  4U,  3U,  2U,  1U, 0U );
-#define mm256_lane_8 _mm_set_epi8( 31U, 30U, 29U, 28U, 27U, 26U, 25U, 24U, \
-                                   23U, 22U, 21U, 20U, 19U, 18U, 17U, 16U, \
-                                   15U, 14U, 13U, 12U, 11U, 10U , 9U, 8U, \
-                                    7U,  6U,  5U,  4U,  3U,  2U,  1U, 0U );
-
 //
 // Basic operations without SIMD equivalent
@@ -480,7 +463,7 @@ inline bool memcmp_256( __m256i src1, __m256i src2, int n )
 #define mm256_rotr128_1x32( x ) _mm256_shuffle_epi32( x, 0x39 )
 #define mm256_rotl128_1x32( x ) _mm256_shuffle_epi32( x, 0x93 )
 
-// Swap 32 bits in each 64 bit element olf 256 bit vector
+// Swap 32 bits in each 64 bit element of 256 bit vector
 #define mm256_swap64_32( x ) _mm256_shuffle_epi32( x, 0xb1 )
 
 // Less efficient but more versatile.
Use only for rotations that are not diff --git a/build-4way.sh b/build-4way.sh deleted file mode 100755 index 35dabfa7..00000000 --- a/build-4way.sh +++ /dev/null @@ -1,15 +0,0 @@ -#!/bin/bash - -# Linux build - -make distclean || echo clean - -rm -f config.status -./autogen.sh || echo done - -#CFLAGS="-O3 -march=native -Wall -DFOUR_WAY" ./configure --with-curl --with-crypto=$HOME/usr -CFLAGS="-O3 -march=native -Wall -DFOUR_WAY" ./configure --with-curl - -make -j 4 - -strip -s cpuminer diff --git a/build.sh b/build.sh index 6f203a88..bf713ea9 100755 --- a/build.sh +++ b/build.sh @@ -18,6 +18,7 @@ rm -f config.status # Debian 7.7 / Ubuntu 14.04 (gcc 4.7+) #extracflags="$extracflags -Ofast -flto -fuse-linker-plugin -ftree-loop-if-convert-stores" +#CFLAGS="-O3 -march=native -Wall" ./configure --with-curl --with-crypto=$HOME/usr CFLAGS="-O3 -march=native -Wall" ./configure --with-curl #CFLAGS="-O3 -march=native -Wall" CXXFLAGS="$CFLAGS -std=gnu++11" ./configure --with-curl diff --git a/configure b/configure index e0ce3351..c98f3133 100755 --- a/configure +++ b/configure @@ -1,6 +1,6 @@ #! /bin/sh # Guess values for system-dependent variables and create Makefiles. -# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.7.10. +# Generated by GNU Autoconf 2.69 for cpuminer-opt 3.8.0. # # # Copyright (C) 1992-1996, 1998-2012 Free Software Foundation, Inc. @@ -577,8 +577,8 @@ MAKEFLAGS= # Identity of this package. PACKAGE_NAME='cpuminer-opt' PACKAGE_TARNAME='cpuminer-opt' -PACKAGE_VERSION='3.7.10' -PACKAGE_STRING='cpuminer-opt 3.7.10' +PACKAGE_VERSION='3.8.0' +PACKAGE_STRING='cpuminer-opt 3.8.0' PACKAGE_BUGREPORT='' PACKAGE_URL='' @@ -1321,7 +1321,7 @@ if test "$ac_init_help" = "long"; then # Omit some internal or obsolete options to make the list less imposing. # This message is too long to be a string in the A/UX 3.1 sh. cat <<_ACEOF -\`configure' configures cpuminer-opt 3.7.10 to adapt to many kinds of systems. +\`configure' configures cpuminer-opt 3.8.0 to adapt to many kinds of systems. Usage: $0 [OPTION]... [VAR=VALUE]... @@ -1392,7 +1392,7 @@ fi if test -n "$ac_init_help"; then case $ac_init_help in - short | recursive ) echo "Configuration of cpuminer-opt 3.7.10:";; + short | recursive ) echo "Configuration of cpuminer-opt 3.8.0:";; esac cat <<\_ACEOF @@ -1497,7 +1497,7 @@ fi test -n "$ac_init_help" && exit $ac_status if $ac_init_version; then cat <<\_ACEOF -cpuminer-opt configure 3.7.10 +cpuminer-opt configure 3.8.0 generated by GNU Autoconf 2.69 Copyright (C) 2012 Free Software Foundation, Inc. @@ -2000,7 +2000,7 @@ cat >config.log <<_ACEOF This file contains any messages produced by compilers while running configure, to aid debugging if configure makes a mistake. -It was created by cpuminer-opt $as_me 3.7.10, which was +It was created by cpuminer-opt $as_me 3.8.0, which was generated by GNU Autoconf 2.69. Invocation command line was $ $0 $@ @@ -2981,7 +2981,7 @@ fi # Define the identity of the package. PACKAGE='cpuminer-opt' - VERSION='3.7.10' + VERSION='3.8.0' cat >>confdefs.h <<_ACEOF @@ -6677,7 +6677,7 @@ cat >>$CONFIG_STATUS <<\_ACEOF || ac_write_fail=1 # report actual input values of CONFIG_FILES etc. instead of their # values after options handling. ac_log=" -This file was extended by cpuminer-opt $as_me 3.7.10, which was +This file was extended by cpuminer-opt $as_me 3.8.0, which was generated by GNU Autoconf 2.69. 
Invocation command line was CONFIG_FILES = $CONFIG_FILES @@ -6743,7 +6743,7 @@ _ACEOF cat >>$CONFIG_STATUS <<_ACEOF || ac_write_fail=1 ac_cs_config="`$as_echo "$ac_configure_args" | sed 's/^ //; s/[\\""\`\$]/\\\\&/g'`" ac_cs_version="\\ -cpuminer-opt config.status 3.7.10 +cpuminer-opt config.status 3.8.0 configured by $0, generated by GNU Autoconf 2.69, with options \\"\$ac_cs_config\\" diff --git a/configure.ac b/configure.ac index 4b844916..686830aa 100644 --- a/configure.ac +++ b/configure.ac @@ -1,4 +1,4 @@ -AC_INIT([cpuminer-opt], [3.7.10]) +AC_INIT([cpuminer-opt], [3.8.0]) AC_PREREQ([2.59c]) AC_CANONICAL_SYSTEM diff --git a/cpu-miner.c b/cpu-miner.c index 0f4f7936..01c825e1 100644 --- a/cpu-miner.c +++ b/cpu-miner.c @@ -103,7 +103,11 @@ enum algos opt_algo = ALGO_NULL; int opt_scrypt_n = 0; int opt_pluck_n = 128; int opt_n_threads = 0; -int64_t opt_affinity = -1; +#ifdef __GNUC__ +__int128_t opt_affinity = -1LL; +#else +int64_t opt_affinity = -1LL; +#endif int opt_priority = 0; int num_cpus; char *rpc_url = NULL;; @@ -195,7 +199,11 @@ static inline void drop_policy(void) #define pthread_setaffinity_np(tid,sz,s) {} /* only do process affinity */ #endif +#ifdef __GNUC__ +static void affine_to_cpu_mask( int id, unsigned __int128 mask ) +#else static void affine_to_cpu_mask( int id, unsigned long long mask ) +#endif { cpu_set_t set; CPU_ZERO(&set); @@ -204,7 +212,11 @@ static void affine_to_cpu_mask( int id, unsigned long long mask ) for ( uint8_t i = 0; i < ncpus; i++ ) { // cpu mask +#ifdef __GNUC__ + if( ( mask & ( (unsigned __int128)1ULL << i ) ) ) CPU_SET( i, &set ); +#else if( (ncpus > 64) || ( mask & (1ULL << i) ) ) CPU_SET( i, &set ); +#endif } if ( id == -1 ) { @@ -787,7 +799,7 @@ static int share_result( int result, struct work *work, const char *reason ) if ( rate == 100.0 ) sprintf( rate_s, "%.0f", rate ); else - sprintf( rate_s, "%.1f", ( rate < 99.9 ) ? rate : 99.9 ); + sprintf( rate_s, "%.1f", ( rate < 99.9 ) ? rate : 99.9 ); } else { @@ -1545,6 +1557,11 @@ void scrypt_set_target( struct work* work, double job_diff ) work_set_target( work, job_diff / (65536.0 * opt_diff_factor) ); } +void alt_set_target( struct work* work, double job_diff ) +{ + work_set_target( work, job_diff / (256.0 * opt_diff_factor) ); +} + // Default is do_nothing (assumed LE) void set_work_data_big_endian( struct work *work ) { @@ -1695,6 +1712,7 @@ static void *miner_thread( void *userdata ) drop_policy(); } // CPU thread affinity +/* if ( num_cpus > 64 ) { // opt_affinity ignored with more than 64 cpus. @@ -1703,15 +1721,21 @@ static void *miner_thread( void *userdata ) thr_id, thr_id % num_cpus ); affine_to_cpu_mask( thr_id, -1 ); } - else if ( num_cpus > 1 ) + else +*/ + if ( num_cpus > 1 ) { - if ( (opt_affinity == -1) && (opt_n_threads) > 1 ) + if ( (opt_affinity == -1LL) && (opt_n_threads) > 1 ) { if (opt_debug) applog( LOG_DEBUG, "Binding thread %d to cpu %d (mask %x)", thr_id, thr_id % num_cpus, ( 1ULL << (thr_id % num_cpus) ) ); - +#ifdef __GNUC__ + affine_to_cpu_mask( thr_id, + (unsigned __int128)1LL << (thr_id % num_cpus) ); +#else affine_to_cpu_mask( thr_id, 1ULL << (thr_id % num_cpus) ); +#endif } else if (opt_affinity != -1) { @@ -1849,31 +1873,23 @@ static void *miner_thread( void *userdata ) if ( work.nfound[n] ) { *algo_gate.get_nonceptr( work.data ) = work.nonces[n]; - if ( !submit_work(mythr, &work) ) + if ( !submit_work( mythr, &work ) ) { - applog(LOG_WARNING, "Failed to submit share." ); + applog( LOG_WARNING, "Failed to submit share." 
); break; } + applog( LOG_NOTICE, "Share submitted." ); num_submitted++; } -#if FOUR_WAY -if (num_submitted > 1) - applog(LOG_NOTICE, "4 WAY hash nonces submitted: %u" CL_MAG " BONUS!" CL_N, num_submitted); -else - applog(LOG_NOTICE, "4 WAY hash nonces submitted: %u", num_submitted); -#endif // must be a one way algo, nonce is already in work data if ( !num_submitted ) { - if ( !submit_work(mythr, &work) ) + if ( !submit_work( mythr, &work ) ) { - applog(LOG_WARNING, "Failed to submir share."); + applog( LOG_WARNING, "Failed to submit share." ); break; } -#if FOUR_WAY -applog(LOG_NOTICE, "1 WAY hash nonce submitted"); -#endif - + applog( LOG_NOTICE, "Share submitted." ); } // prevent stale work in solo @@ -2915,20 +2931,20 @@ bool check_cpu_capability () bool sw_has_avx = false; bool sw_has_avx2 = false; bool sw_has_sha = false; - bool sw_has_4way = false; +// bool sw_has_4way = false; set_t algo_features = algo_gate.optimizations; bool algo_has_sse2 = set_incl( SSE2_OPT, algo_features ); bool algo_has_aes = set_incl( AES_OPT, algo_features ); bool algo_has_avx = set_incl( AVX_OPT, algo_features ); bool algo_has_avx2 = set_incl( AVX2_OPT, algo_features ); bool algo_has_sha = set_incl( SHA_OPT, algo_features ); - bool algo_has_4way = set_incl( FOUR_WAY_OPT, algo_features ); +// bool algo_has_4way = set_incl( FOUR_WAY_OPT, algo_features ); bool use_aes; bool use_sse2; bool use_avx; bool use_avx2; bool use_sha; - bool use_4way; +// bool use_4way; bool use_none; #ifdef __AES__ @@ -2946,9 +2962,9 @@ bool check_cpu_capability () #ifdef __SHA__ sw_has_sha = true; #endif - #ifdef HASH_4WAY - sw_has_4way = true; - #endif +// #ifdef HASH_4WAY +// sw_has_4way = true; +// #endif #if !((__AES__) || (__SSE2__)) printf("Neither __AES__ nor __SSE2__ defined.\n"); @@ -2978,7 +2994,7 @@ bool check_cpu_capability () if ( sw_has_aes ) printf( " AES" ); if ( sw_has_avx ) printf( " AVX" ); if ( sw_has_avx2 ) printf( " AVX2" ); - if ( sw_has_4way ) printf( " 4WAY" ); +// if ( sw_has_4way ) printf( " 4WAY" ); if ( sw_has_sha ) printf( " SHA" ); @@ -2990,7 +3006,7 @@ bool check_cpu_capability () if ( algo_has_aes ) printf( " AES" ); if ( algo_has_avx ) printf( " AVX" ); if ( algo_has_avx2 ) printf( " AVX2" ); - if ( algo_has_4way ) printf( " 4WAY" ); +// if ( algo_has_4way ) printf( " 4WAY" ); if ( algo_has_sha ) printf( " SHA" ); } printf(".\n"); @@ -3028,9 +3044,9 @@ bool check_cpu_capability () use_avx = cpu_has_avx && sw_has_avx && algo_has_avx; use_avx2 = cpu_has_avx2 && sw_has_avx2 && algo_has_avx2; use_sha = cpu_has_sha && sw_has_sha && algo_has_sha; - use_4way = cpu_has_avx2 && sw_has_4way && algo_has_4way; - use_none = !( use_sse2 || use_aes || use_avx || use_avx2 || use_sha - || use_4way ); +// use_4way = cpu_has_avx2 && sw_has_4way && algo_has_4way; + use_none = !( use_sse2 || use_aes || use_avx || use_avx2 || use_sha ); +// || use_4way ); // Display best options printf( "Start mining with" ); @@ -3041,7 +3057,7 @@ bool check_cpu_capability () if ( use_avx2 ) printf( " AVX2" ); else if ( use_avx ) printf( " AVX" ); else if ( use_sse2 ) printf( " SSE2" ); - if ( use_4way ) printf( " 4WAY" ); +// if ( use_4way ) printf( " 4WAY" ); if ( use_sha ) printf( " SHA" ); } printf( ".\n\n" ); diff --git a/miner.h b/miner.h index b753f68e..625772f5 100644 --- a/miner.h +++ b/miner.h @@ -12,9 +12,11 @@ #endif //#endif -#if defined(FOUR_WAY) && defined(__AVX2__) - #define HASH_4WAY -#endif +//#if defined(FOUR_WAY) && defined(__AVX2__) +// keep this until all algos remove reference to HASH_4WAY +//#if 
defined(__AVX2__) +// #define HASH_4WAY +//#endif #ifdef _MSC_VER @@ -481,6 +483,7 @@ uint32_t* get_stratum_job_ntime(); enum algos { ALGO_NULL, + ALGO_ANIME, ALGO_ARGON2, ALGO_AXIOM, ALGO_BASTION, @@ -543,6 +546,7 @@ enum algos { ALGO_X13SM3, ALGO_X14, ALGO_X15, + ALGO_X16R, ALGO_X17, ALGO_XEVAN, ALGO_YESCRYPT, @@ -553,6 +557,7 @@ enum algos { }; static const char* const algo_names[] = { NULL, + "anime", "argon2", "axiom", "bastion", @@ -615,6 +620,7 @@ static const char* const algo_names[] = { "x13sm3", "x14", "x15", + "x16r", "x17", "xevan", "yescrypt", @@ -680,6 +686,7 @@ static char const usage[] = "\ Usage: " PACKAGE_NAME " [OPTIONS]\n\ Options:\n\ -a, --algo=ALGO specify the algorithm to use\n\ + anime Animecoin (ANI)\n\ argon2\n\ axiom Shabal-256 MemoHash\n\ bastion\n\ @@ -742,6 +749,7 @@ Options:\n\ x13sm3 hsr (Hshare)\n\ x14 X14\n\ x15 X15\n\ + x16r Ravencoin (RVN)\n\ x17\n\ xevan Bitsend (BSD)\n\ yescrypt Globlboost-Y (BSTY)\n\ diff --git a/winbuild-cross.sh b/winbuild-cross.sh index b91f3651..65fa8a87 100755 --- a/winbuild-cross.sh +++ b/winbuild-cross.sh @@ -20,29 +20,29 @@ cp $LOCAL_LIB/curl/lib/.libs/libcurl-4.dll release/ make distclean || echo clean rm -f config.status ./autogen.sh || echo done -CFLAGS="-O3 -march=core-avx2 -msha -Wall -DFOUR_WAY" ./configure $F +CFLAGS="-O3 -march=core-avx2 -msha -Wall" ./configure $F make strip -s cpuminer.exe -mv cpuminer.exe release/cpuminer-4way-sha.exe +mv cpuminer.exe release/cpuminer-avx2-sha.exe -make clean || echo clean -rm -f config.status -CFLAGS="-O3 -march=core-avx2 -Wall -DFOUR_WAY" ./configure $F -make -mv cpuminer.exe release/cpuminer-4way.exe +#make clean || echo clean +#rm -f config.status +#CFLAGS="-O3 -march=core-avx2 -Wall -DFOUR_WAY" ./configure $F +#make +#mv cpuminer.exe release/cpuminer-4way.exe -make clean || echo clean -CFLAGS="-O3 -march=corei7-avx -msha -Wall" ./configure $F -make -strip -s cpuminer.exe -mv cpuminer.exe release/cpuminer-avx-sha.exe +#make clean || echo clean +#CFLAGS="-O3 -march=corei7-avx -msha -Wall" ./configure $F +#make +#strip -s cpuminer.exe +#mv cpuminer.exe release/cpuminer-avx-sha.exe make clean || echo clean rm -f config.status CFLAGS="-O3 -march=core-avx2 -Wall" ./configure $F make strip -s cpuminer.exe -mv cpuminer.exe release/cpuminer-aes-avx2.exe +mv cpuminer.exe release/cpuminer-avx2.exe #make clean || echo clean #rm -f config.status @@ -66,12 +66,12 @@ make strip -s cpuminer.exe mv cpuminer.exe release/cpuminer-aes-sse42.exe -make clean || echo clean -rm -f config.status -CFLAGS="-O3 -march=corei7 -Wall" ./configure $F -make -strip -s cpuminer.exe -mv cpuminer.exe release/cpuminer-sse42.exe +#make clean || echo clean +#rm -f config.status +#CFLAGS="-O3 -march=corei7 -Wall" ./configure $F +#make +#strip -s cpuminer.exe +#mv cpuminer.exe release/cpuminer-sse42.exe make clean || echo clean rm -f config.status
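
The hash order selection added in x16r-gate.c can be checked in isolation. The sketch below is a minimal stand-alone C program, using hypothetical prevblock bytes rather than real chain data, that reproduces the nibble walk of x16r_getAlgoString: byte (15-j)>>1 is read high nibble first, so the sample bytes 0x12..0xF0 yield the order string "F0DEBC9A78563412".

#include <stdio.h>
#include <stdint.h>

int main(void)
{
   // Hypothetical low 8 bytes of a previous block hash.
   const uint8_t prevblock[8] = { 0x12, 0x34, 0x56, 0x78,
                                  0x9A, 0xBC, 0xDE, 0xF0 };
   char order[17];
   for ( int j = 0; j < 16; j++ )
   {
      uint8_t b = (15 - j) >> 1;   // walk bytes 7..0, two hex digits each
      uint8_t d = (j & 1) ? prevblock[b] & 0xF : prevblock[b] >> 4;
      order[j] = d >= 10 ? 'A' + (d - 10) : '0' + d;
   }
   order[16] = '\0';
   printf( "%s\n", order );        // prints F0DEBC9A78563412
   return 0;
}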
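
The Hamsi, Shabal and Haval conversions above all funnel four lanes through mm_interleave_4x32 / mm_deinterleave_4x32 before and after each 4way 32 bit hash. Below is a scalar reference model of that packing, assuming the conventional layout where word i of lane k lands at dst[i*4 + k]; the real routines in avxdefs.h perform the same permutation with SSE2 vectors, so this is an illustrative sketch, not the miner's code.

#include <stdio.h>
#include <stdint.h>

static void interleave_4x32_ref( uint32_t *dst, const uint32_t *s0,
      const uint32_t *s1, const uint32_t *s2, const uint32_t *s3, int bits )
{
   for ( int i = 0; i < bits/32; i++ )
   {
      dst[ i*4     ] = s0[i];      // word i of lane 0
      dst[ i*4 + 1 ] = s1[i];      // word i of lane 1
      dst[ i*4 + 2 ] = s2[i];
      dst[ i*4 + 3 ] = s3[i];
   }
}

static void deinterleave_4x32_ref( uint32_t *d0, uint32_t *d1, uint32_t *d2,
      uint32_t *d3, const uint32_t *src, int bits )
{
   for ( int i = 0; i < bits/32; i++ )
   {
      d0[i] = src[ i*4     ];
      d1[i] = src[ i*4 + 1 ];
      d2[i] = src[ i*4 + 2 ];
      d3[i] = src[ i*4 + 3 ];
   }
}

int main(void)
{
   uint32_t h0[16], h1[16], h2[16], h3[16], v[64];
   for ( int i = 0; i < 16; i++ )
   {  h0[i] = i; h1[i] = 0x100+i; h2[i] = 0x200+i; h3[i] = 0x300+i; }
   interleave_4x32_ref( v, h0, h1, h2, h3, 512 );    // 512 bits per lane
   deinterleave_4x32_ref( h0, h1, h2, h3, v, 512 );  // round trip
   printf( "%08x %08x\n", v[0], v[1] );              // 00000000 00000100
   return 0;
}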
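
scanhash_x16r_4way checks one 8 word digest per lane, at hash, hash+8, hash+16 and hash+24, and each lane must be fulltest'ed against its own digest. The toy loop below shows that per-lane pattern with a stub standing in for the miner's fulltest() and fabricated digest values; it is a sketch of the control flow only.

#include <stdio.h>
#include <stdint.h>
#include <stdbool.h>

// Stub replacing the miner's fulltest(): compares the top word only.
static bool toy_fulltest( const uint32_t *h, const uint32_t *target )
{
   return h[7] <= target[7];
}

int main(void)
{
   uint32_t hash[4*8];                                  // 4 lanes x 8 words
   uint32_t target[8] = { 0, 0, 0, 0, 0, 0, 0, 0x0000ffff };
   for ( int i = 0; i < 32; i++ ) hash[i] = 0xffffffffu;   // all lanes miss
   hash[ 8*2 + 7 ] = 1;                  // pretend lane 2 found a share
   for ( int k = 0; k < 4; k++ )
      if ( toy_fulltest( hash + 8*k, target ) )
         printf( "lane %d: share at nonce n+%d\n", k, k );   // only lane 2
   return 0;
}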
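
The affinity rework in cpu-miner.c widens the mask to GCC's unsigned __int128 so a one-bit-per-CPU mask can cover up to 128 CPUs instead of 64. A stand-alone Linux/GCC sketch of the mask-to-cpu_set_t conversion follows; the 96 core box and thread id are hypothetical, and this is not the miner's actual thread setup.

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>

static void mask_to_cpuset( unsigned __int128 mask, int ncpus, cpu_set_t *set )
{
   CPU_ZERO( set );
   for ( int i = 0; i < ncpus; i++ )
      if ( mask & ( (unsigned __int128)1 << i ) )   // bit i selects cpu i
         CPU_SET( i, set );
}

int main(void)
{
   cpu_set_t set;
   const int ncpus  = 96;        // hypothetical core count above 64
   const int thr_id = 70;        // bind thread 70 to cpu 70
   mask_to_cpuset( (unsigned __int128)1 << ( thr_id % ncpus ), ncpus, &set );
   printf( "cpu %d bound: %d\n", thr_id, CPU_ISSET( thr_id, &set ) );
   return 0;
}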
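
alt_set_target, registered by the new x16r gate, scales the stratum job difficulty by 256 where scrypt_set_target uses 65536. A toy comparison of the two scalings, with a hypothetical difficulty of 8 and the default opt_diff_factor of 1:

#include <stdio.h>

int main(void)
{
   const double opt_diff_factor = 1.0;
   const double job_diff = 8.0;   // hypothetical stratum difficulty
   // Same division that work_set_target receives in each path.
   printf( "alt:    %g\n", job_diff / ( 256.0   * opt_diff_factor ) );
   printf( "scrypt: %g\n", job_diff / ( 65536.0 * opt_diff_factor ) );
   return 0;                      // prints 0.03125 and 0.00012207
}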