diff --git a/AUTHORS b/AUTHORS index ee336b2e..77055c25 100644 --- a/AUTHORS +++ b/AUTHORS @@ -29,6 +29,7 @@ List of Copyright holders Copyright (C) 1996-1999 Peter Gutmann, Paul Kendall, and Chris Wedgwood Copyright (C) 1996-2006 Peter Gutmann, Matt Thomlinson and Blake Coverett Copyright (C) 2003 Nikos Mavroyanopoulos + Copyright (c) 2006 CRYPTOGAMS Copyright (C) 2006-2007 NTT (Nippon Telegraph and Telephone Corporation) Copyright (C) 2012-2019 g10 Code GmbH Copyright (C) 2012 Simon Josefsson, Niels Möller diff --git a/LICENSES b/LICENSES index f6733a69..c19284e2 100644 --- a/LICENSES +++ b/LICENSES @@ -54,7 +54,6 @@ with any binary distributions derived from the GNU C Library. SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #+end_quote - For files: - random/jitterentropy-base.c - random/jitterentropy.h @@ -99,6 +98,48 @@ with any binary distributions derived from the GNU C Library. * DAMAGE. #+end_quote + For files: + - cipher/cipher-gcm-ppc.c + +#+begin_quote + Copyright (c) 2006, CRYPTOGAMS by + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + * Redistributions of source code must retain copyright notices, + this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials + provided with the distribution. + + * Neither the name of the CRYPTOGAMS nor the names of its + copyright holder and contributors may be used to endorse or + promote products derived from this software without specific + prior written permission. + + ALTERNATIVELY, provided that this notice is retained in full, this + product may be distributed under the terms of the GNU General Public + License (GPL), in which case the provisions of the GPL apply INSTEAD OF + those given above. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+#+end_quote + * X License For files: diff --git a/cipher/Makefile.am b/cipher/Makefile.am index 1728e9f9..ab5d2a38 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -66,6 +66,7 @@ blowfish.c blowfish-amd64.S blowfish-arm.S \ cast5.c cast5-amd64.S cast5-arm.S \ chacha20.c chacha20-sse2-amd64.S chacha20-ssse3-amd64.S chacha20-avx2-amd64.S \ chacha20-armv7-neon.S \ +cipher-gcm-ppc.c \ crc.c \ crc-intel-pclmul.c crc-ppc.c \ des.c des-amd64.S \ @@ -165,3 +166,9 @@ crc-ppc.o: $(srcdir)/crc-ppc.c Makefile crc-ppc.lo: $(srcdir)/crc-ppc.c Makefile `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< ` + +cipher-gcm-ppc.o: $(srcdir)/cipher-gcm-ppc.c Makefile + `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< ` + +cipher-gcm-ppc.lo: $(srcdir)/cipher-gcm-ppc.c Makefile + `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< ` diff --git a/cipher/cipher-gcm-ppc.c b/cipher/cipher-gcm-ppc.c new file mode 100644 index 00000000..ed27ef15 --- /dev/null +++ b/cipher/cipher-gcm-ppc.c @@ -0,0 +1,510 @@ +/* cipher-gcm-ppc.c - Power 8 vpmsum accelerated Galois Counter Mode + * implementation + * Copyright (C) 2019 Shawn Landden + * + * This file is part of Libgcrypt. + * + * Libgcrypt is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser general Public License as + * published by the Free Software Foundation; either version 2.1 of + * the License, or (at your option) any later version. + * + * Libgcrypt is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public + * License along with this program; if not, see . + * + * Based on GHASH implementation by Andy Polyakov from CRYPTOGAMS + * distribution (ppc/ghashp8-ppc.pl). Specifically, it uses his register + * allocation (which then defers to your compiler's register allocation), + * instead of re-implementing Gerald Estrin's Scheme of parallelized + * multiplication of polynomials, as I did not understand this algorithm at + * the time. + * + * Original copyright license follows: + * + * Copyright (c) 2006, CRYPTOGAMS by + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain copyright notices, + * this list of conditions and the following disclaimer. + * + * * Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * * Neither the name of the CRYPTOGAMS nor the names of its + * copyright holder and contributors may be used to endorse or + * promote products derived from this software without specific + * prior written permission. + * + * ALTERNATIVELY, provided that this notice is retained in full, this + * product may be distributed under the terms of the GNU General Public + * License (GPL), in which case the provisions of the GPL apply INSTEAD OF + * those given above. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * SPDX-License-Identifier: (BSD-3-Clause OR GPL-2.0-only)
+ */
+
+#include <config.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <stdint.h>
+
+#include "g10lib.h"
+#include "cipher.h"
+#include "bufhelp.h"
+#include "./cipher-internal.h"
+
+#ifdef GCM_USE_PPC_VPMSUM
+
+#include <altivec.h>
+
+#define ALWAYS_INLINE inline __attribute__((always_inline))
+#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
+
+#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION
+#define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE
+
+typedef vector unsigned char vector16x_u8;
+typedef vector signed char vector16x_s8;
+typedef vector unsigned long long vector2x_u64;
+typedef vector unsigned long long block;
+
+static ASM_FUNC_ATTR_INLINE block
+asm_vpmsumd(block a, block b)
+{
+  block r;
+  __asm__("vpmsumd %0, %1, %2"
+          : "=v" (r)
+          : "v" (a), "v" (b));
+  return r;
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_swap_u64(block a)
+{
+  __asm__("xxswapd %x0, %x1"
+          : "=wa" (a)
+          : "wa" (a));
+  return a;
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_rot_block_left(block a)
+{
+  block zero = {0, 0};
+  block mask = {2, 0};
+  return __builtin_shuffle(a, zero, mask);
+}
+
+static ASM_FUNC_ATTR_INLINE block
+asm_rot_block_right(block a)
+{
+  block zero = {0, 0};
+  block mask = {1, 2};
+  return __builtin_shuffle(a, zero, mask);
+}
+
+/* vsl is a slightly strange function in the way the shift is passed... */
+static ASM_FUNC_ATTR_INLINE block
+asm_ashl_128(block a, vector16x_u8 shift)
+{
+  block r;
+  __asm__("vsl %0, %1, %2"
+          : "=v" (r)
+          : "v" (a), "v" (shift));
+  return r;
+}
+
+#define ALIGNED_LOAD(in_ptr) \
+  (vec_aligned_ld (0, (const unsigned char *)(in_ptr)))
+
+static ASM_FUNC_ATTR_INLINE block
+vec_aligned_ld(unsigned long offset, const unsigned char *ptr)
+{
+#ifndef WORDS_BIGENDIAN
+  block vec;
+  __asm__ ("lvx %0,%1,%2\n\t"
+           : "=v" (vec)
+           : "r" (offset), "r" ((uintptr_t)ptr)
+           : "memory", "r0");
+  return vec;
+#else
+  return vec_vsx_ld (offset, ptr);
+#endif
+}
+
+#define STORE_TABLE(gcm_table, slot, vec) \
+  vec_aligned_st (((block)vec), slot * 16, (unsigned char *)(gcm_table));
+
+
+static ASM_FUNC_ATTR_INLINE void
+vec_aligned_st(block vec, unsigned long offset, unsigned char *ptr)
+{
+#ifndef WORDS_BIGENDIAN
+  __asm__ ("stvx %0,%1,%2\n\t"
+           :
+           : "v" (vec), "r" (offset), "r" ((uintptr_t)ptr)
+           : "memory", "r0");
+#else
+  vec_vsx_st ((vector16x_u8)vec, offset, ptr);
+#endif
+}
+
+#define VEC_LOAD_BE(in_ptr, bswap_const) \
+  (vec_load_be (0, (const unsigned char *)(in_ptr), bswap_const))
+
+static ASM_FUNC_ATTR_INLINE block
+vec_load_be(unsigned long offset, const unsigned char *ptr,
+            vector unsigned char be_bswap_const)
+{
+#ifndef WORDS_BIGENDIAN
+  block vec;
+  /* GCC vec_vsx_ld is generating two instructions on little-endian. Use
+   * lxvw4x directly instead. */
+  __asm__ ("lxvw4x %x0,%1,%2\n\t"
+           : "=wa" (vec)
+           : "r" (offset), "r" ((uintptr_t)ptr)
+           : "memory", "r0");
+  __asm__ ("vperm %0,%1,%1,%2\n\t"
+           : "=v" (vec)
+           : "v" (vec), "v" (be_bswap_const));
+  return vec;
+#else
+  (void)be_bswap_const;
+  return vec_vsx_ld (offset, ptr);
+#endif
+}
+
+/* Power ghash based on papers:
+   "The Galois/Counter Mode of Operation (GCM)"; David A. McGrew, John Viega
+   "Intel® Carry-Less Multiplication Instruction and its Usage for Computing
+    the GCM Mode - Rev 2.01"; Shay Gueron, Michael E. Kounavis.
+
+   After saving the magic c2 constant and pre-formatted version of the key,
+   we pre-process the key for parallel hashing. This takes advantage of the
+   identity of addition over a galois field being identical to XOR, and thus
+   can be parallelized (S 2.2, page 3). We multiply and add (galois field
+   versions) the key over multiple iterations and save the result. This can
+   later be galois added (XORed) with parallel processed input (Estrin's
+   Scheme).
+
+   The ghash "key" is a salt. */
+void ASM_FUNC_ATTR
+_gcry_ghash_setup_ppc_vpmsum (uint64_t *gcm_table, void *gcm_key)
+{
+  vector16x_u8 bswap_const =
+    { 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 };
+  vector16x_u8 c2 =
+    { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0b11000010 };
+  block T0, T1, T2;
+  block C2, H, H1, H1l, H1h, H2, H2l, H2h;
+  block H3l, H3, H3h, H4l, H4, H4h, T3, T4;
+  vector16x_s8 most_sig_of_H, t7, carry;
+  vector16x_u8 one = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
+
+  H = VEC_LOAD_BE(gcm_key, bswap_const);
+  most_sig_of_H = vec_splat((vector16x_s8)H, 15);
+  t7 = vec_splat_s8(7);
+  carry = most_sig_of_H >> t7;
+  carry &= c2; /* only interested in certain carries. */
+  H1 = asm_ashl_128(H, one);
+  H1 ^= (block)carry; /* complete the <<< 1 */
+
+  T1 = asm_swap_u64 (H1);
+  H1l = asm_rot_block_right (T1);
+  H1h = asm_rot_block_left (T1);
+  C2 = asm_rot_block_right ((block)c2);
+
+  STORE_TABLE (gcm_table, 0, C2);
+  STORE_TABLE (gcm_table, 1, H1l);
+  STORE_TABLE (gcm_table, 2, T1);
+  STORE_TABLE (gcm_table, 3, H1h);
+
+  /* pre-process coefficients for Gerald Estrin's scheme for parallel
+   * multiplication of polynomials
+   */
+  H2l = asm_vpmsumd (H1l, H1); /* do not need to mask in
+                                  because 0 * anything -> 0 */
+  H2 = asm_vpmsumd (T1, H1);
+  H2h = asm_vpmsumd (H1h, H1);
+
+  /* reduce 1 */
+  T0 = asm_vpmsumd (H2l, C2);
+
+  H2l ^= asm_rot_block_left (H2);;
+  H2h ^= asm_rot_block_right (H2);
+  H2l = asm_swap_u64 (H2l);
+  H2l ^= T0;
+  /* reduce 2 */
+  T0 = asm_swap_u64 (H2l);
+  H2l = asm_vpmsumd (H2l, C2);
+  H2 = H2l ^ H2h ^ T0;
+
+  T2 = asm_swap_u64 (H2);
+  H2l = asm_rot_block_right (T2);
+  H2h = asm_rot_block_left (T2);
+
+  STORE_TABLE (gcm_table, 4, H2l);
+  STORE_TABLE (gcm_table, 5, T2);
+  STORE_TABLE (gcm_table, 6, H2h);
+
+  H3l = asm_vpmsumd (H2l, H1);
+  H4l = asm_vpmsumd (H2l, H2);
+  H3 = asm_vpmsumd (T2, H1);
+  H4 = asm_vpmsumd (T2, H2);
+  H3h = asm_vpmsumd (H2h, H1);
+  H4h = asm_vpmsumd (H2h, H2);
+
+  T3 = asm_vpmsumd (H3l, C2);
+  T4 = asm_vpmsumd (H4l, C2);
+
+  H3l ^= asm_rot_block_left (H3);
+  H3h ^= asm_rot_block_right (H3);
+  H4l ^= asm_rot_block_left (H4);
+  H4h ^= asm_rot_block_right (H4);
+
+  H3 = asm_swap_u64 (H3l);
+  H4 = asm_swap_u64 (H4l);
+
+  H3 ^= T3;
+  H4 ^= T4;
+
+  /* We could have also b64 switched reduce and reduce2, however as we are
+     using the unrotated H and H2 above to vpmsum, this is marginally better.
*/ + T3 = asm_swap_u64 (H3); + T4 = asm_swap_u64 (H4); + + H3 = asm_vpmsumd (H3, C2); + H4 = asm_vpmsumd (H4, C2); + + T3 ^= H3h; + T4 ^= H4h; + H3 ^= T3; + H4 ^= T4; + H3 = asm_swap_u64 (H3); + H4 = asm_swap_u64 (H4); + + H3l = asm_rot_block_right (H3); + H3h = asm_rot_block_left (H3); + H4l = asm_rot_block_right (H4); + H4h = asm_rot_block_left (H4); + + STORE_TABLE (gcm_table, 7, H3l); + STORE_TABLE (gcm_table, 8, H3); + STORE_TABLE (gcm_table, 9, H3h); + STORE_TABLE (gcm_table, 10, H4l); + STORE_TABLE (gcm_table, 11, H4); + STORE_TABLE (gcm_table, 12, H4h); +} + +ASM_FUNC_ATTR_INLINE +block +vec_perm2(block l, block r, vector16x_u8 perm) { + block ret; + __asm__ ("vperm %0,%1,%2,%3\n\t" + : "=v" (ret) + : "v" (l), "v" (r), "v" (perm)); + return ret; +} + +void ASM_FUNC_ATTR +_gcry_ghash_ppc_vpmsum (const byte *result, const void *const gcm_table, + const byte *const buf, const size_t nblocks) +{ + /* This const is strange, it is reversing the bytes, and also reversing + the u32s that get switched by lxvw4 and it also addresses bytes big-endian, + and is here due to lack of proper peep-hole optimization. */ + vector16x_u8 bswap_const = + { 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 }; + vector16x_u8 bswap_8_const = + { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + block c2, H0l, H0m, H0h, H4l, H4m, H4h, H2m, H3l, H3m, H3h, Hl; + block Hm, Hh, in, in0, in1, in2, in3, Hm_right, Hl_rotate, cur; + size_t blocks_remaining = nblocks, off = 0; + size_t not_multiple_of_four; + block t0; + + cur = vec_load_be (0, result, bswap_const); + + c2 = vec_aligned_ld (0, gcm_table); + H0l = vec_aligned_ld (16, gcm_table); + H0m = vec_aligned_ld (32, gcm_table); + H0h = vec_aligned_ld (48, gcm_table); + + for (not_multiple_of_four = nblocks % 4; not_multiple_of_four; + not_multiple_of_four--) + { + in = vec_load_be (off, buf, bswap_const); + off += 16; + blocks_remaining--; + cur ^= in; + + Hl = asm_vpmsumd (cur, H0l); + Hm = asm_vpmsumd (cur, H0m); + Hh = asm_vpmsumd (cur, H0h); + + t0 = asm_vpmsumd (Hl, c2); + + Hl ^= asm_rot_block_left (Hm); + + Hm_right = asm_rot_block_right (Hm); + Hh ^= Hm_right; + Hl_rotate = asm_swap_u64 (Hl); + Hl_rotate ^= t0; + Hl = asm_swap_u64 (Hl_rotate); + Hl_rotate = asm_vpmsumd (Hl_rotate, c2); + Hl ^= Hh; + Hl ^= Hl_rotate; + + cur = Hl; + } + + if (blocks_remaining > 0) + { + vector16x_u8 hiperm = + { + 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, + 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 + }; + vector16x_u8 loperm = + { + 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, + 0xf, 0xe, 0xd, 0xc, 0xb, 0xa, 0x9, 0x8 + }; + block Xl, Xm, Xh, Xl1, Xm1, Xh1, Xm2, Xl3, Xm3, Xh3, Xl_rotate; + block H21l, H21h, merge_l, merge_h; + + H2m = vec_aligned_ld (48 + 32, gcm_table); + H3l = vec_aligned_ld (48 * 2 + 16, gcm_table); + H3m = vec_aligned_ld (48 * 2 + 32, gcm_table); + H3h = vec_aligned_ld (48 * 2 + 48, gcm_table); + H4l = vec_aligned_ld (48 * 3 + 16, gcm_table); + H4m = vec_aligned_ld (48 * 3 + 32, gcm_table); + H4h = vec_aligned_ld (48 * 3 + 48, gcm_table); + + in0 = vec_load_be (off, buf, bswap_const); + in1 = vec_load_be (off + 16, buf, bswap_const); + in2 = vec_load_be (off + 32, buf, bswap_const); + in3 = vec_load_be (off + 48, buf, bswap_const); + blocks_remaining -= 4; + off += 64; + + Xh = in0 ^ cur; + + Xl1 = asm_vpmsumd (in1, H3l); + Xm1 = asm_vpmsumd (in1, H3m); + Xh1 = asm_vpmsumd (in1, H3h); + + H21l = vec_perm2 (H2m, H0m, hiperm); + H21h = vec_perm2 (H2m, H0m, loperm); + merge_l = vec_perm2 (in2, in3, loperm); + merge_h = vec_perm2 (in2, 
in3, hiperm); + + Xm2 = asm_vpmsumd (in2, H2m); + Xl3 = asm_vpmsumd (merge_l, H21l); + Xm3 = asm_vpmsumd (in3, H0m); + Xh3 = asm_vpmsumd (merge_h, H21h); + + Xm2 ^= Xm1; + Xl3 ^= Xl1; + Xm3 ^= Xm2; + Xh3 ^= Xh1; + + /* Gerald Estrin's scheme for parallel multiplication of polynomials */ + for (;blocks_remaining > 0; blocks_remaining -= 4, off += 64) + { + in0 = vec_load_be (off, buf, bswap_const); + in1 = vec_load_be (off + 16, buf, bswap_const); + in2 = vec_load_be (off + 32, buf, bswap_const); + in3 = vec_load_be (off + 48, buf, bswap_const); + + Xl = asm_vpmsumd (Xh, H4l); + Xm = asm_vpmsumd (Xh, H4m); + Xh = asm_vpmsumd (Xh, H4h); + Xl1 = asm_vpmsumd (in1, H3l); + Xm1 = asm_vpmsumd (in1, H3m); + Xh1 = asm_vpmsumd (in1, H3h); + + Xl ^= Xl3; + Xm ^= Xm3; + Xh ^= Xh3; + merge_l = vec_perm2 (in2, in3, loperm); + merge_h = vec_perm2 (in2, in3, hiperm); + + t0 = asm_vpmsumd (Xl, c2); + Xl3 = asm_vpmsumd (merge_l, H21l); + Xh3 = asm_vpmsumd (merge_h, H21h); + + Xl ^= asm_rot_block_left (Xm); + Xh ^= asm_rot_block_right (Xm); + + Xl = asm_swap_u64 (Xl); + Xl ^= t0; + + Xl_rotate = asm_swap_u64 (Xl); + Xm2 = asm_vpmsumd (in2, H2m); + Xm3 = asm_vpmsumd (in3, H0m); + Xl = asm_vpmsumd (Xl, c2); + + Xl3 ^= Xl1; + Xh3 ^= Xh1; + Xh ^= in0; + Xm2 ^= Xm1; + Xh ^= Xl_rotate; + Xm3 ^= Xm2; + Xh ^= Xl; + } + + Xl = asm_vpmsumd (Xh, H4l); + Xm = asm_vpmsumd (Xh, H4m); + Xh = asm_vpmsumd (Xh, H4h); + + Xl ^= Xl3; + Xm ^= Xm3; + + t0 = asm_vpmsumd (Xl, c2); + + Xh ^= Xh3; + Xl ^= asm_rot_block_left (Xm); + Xh ^= asm_rot_block_right (Xm); + + Xl = asm_swap_u64 (Xl); + Xl ^= t0; + + Xl_rotate = asm_swap_u64 (Xl); + Xl = asm_vpmsumd (Xl, c2); + Xl_rotate ^= Xh; + Xl ^= Xl_rotate; + + cur = Xl; + } + + cur = (block)vec_perm ((vector16x_u8)cur, (vector16x_u8)cur, bswap_8_const); + STORE_TABLE (result, 0, cur); +} + +#endif /* GCM_USE_PPC_VPMSUM */ diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c index 32ec9fa0..b84a0698 100644 --- a/cipher/cipher-gcm.c +++ b/cipher/cipher-gcm.c @@ -61,6 +61,28 @@ ghash_armv8_ce_pmull (gcry_cipher_hd_t c, byte *result, const byte *buf, #endif +#ifdef GCM_USE_PPC_VPMSUM +extern void _gcry_ghash_setup_ppc_vpmsum (void *gcm_table, void *gcm_key); + +/* result is 128-bits */ +extern unsigned int _gcry_ghash_ppc_vpmsum (byte *result, void *gcm_table, + const byte *buf, size_t nblocks); + +static void +ghash_setup_ppc_vpmsum (gcry_cipher_hd_t c) +{ + _gcry_ghash_setup_ppc_vpmsum(c->u_mode.gcm.gcm_table, c->u_mode.gcm.u_ghash_key.key); +} + +static unsigned int +ghash_ppc_vpmsum (gcry_cipher_hd_t c, byte *result, const byte *buf, + size_t nblocks) +{ + _gcry_ghash_ppc_vpmsum(result, c->u_mode.gcm.gcm_table, buf, + nblocks); + return 0; +} +#endif /* GCM_USE_PPC_VPMSUM */ #ifdef GCM_USE_TABLES static const u16 gcmR[256] = { @@ -403,7 +425,8 @@ ghash_internal (gcry_cipher_hd_t c, byte *result, const byte *buf, static void setupM (gcry_cipher_hd_t c) { -#if defined(GCM_USE_INTEL_PCLMUL) || defined(GCM_USE_ARM_PMULL) +#if defined(GCM_USE_INTEL_PCLMUL) || defined(GCM_USE_ARM_PMULL) || \ + defined(GCM_USE_S390X_CRYPTO) || defined(GCM_USE_PPC_VPMSUM) unsigned int features = _gcry_get_hw_features (); #endif @@ -423,7 +446,24 @@ setupM (gcry_cipher_hd_t c) ghash_setup_armv8_ce_pmull (c); } #endif - else +#ifdef GCM_USE_PPC_VPMSUM + else if (features & HWF_PPC_VCRYPTO) + { + c->u_mode.gcm.ghash_fn = ghash_ppc_vpmsum; + ghash_setup_ppc_vpmsum (c); + } +#endif +#ifdef GCM_USE_S390X_CRYPTO + else if (features & HWF_S390X_MSA) + { + if (kimd_query () & km_function_to_mask (KMID_FUNCTION_GHASH)) 
+ { + c->u_mode.gcm.ghash_fn = ghash_s390x_kimd; + } + } +#endif + + if (c->u_mode.gcm.ghash_fn == NULL) { c->u_mode.gcm.ghash_fn = ghash_internal; fillM (c); diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h index a95e084b..a5fd3097 100644 --- a/cipher/cipher-internal.h +++ b/cipher/cipher-internal.h @@ -87,6 +87,18 @@ #endif /* GCM_USE_ARM_PMULL */ +/* GCM_USE_PPC_VPMSUM indicates whether to compile GCM with PPC Power 8 + * polynomial multiplication instruction. */ +#undef GCM_USE_PPC_VPMSUM +#if defined(GCM_USE_TABLES) +#if defined(ENABLE_PPC_CRYPTO_SUPPORT) && defined(__powerpc64__) && \ + !defined(WORDS_BIGENDIAN) && defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \ + defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && __GNUC__ >= 4 +# define GCM_USE_PPC_VPMSUM 1 +# define NEED_16BYTE_ALIGNED_CONTEXT 1 /* this also aligns gcm_table */ +#endif +#endif /* GCM_USE_PPC_VPMSUM */ + typedef unsigned int (*ghash_fn_t) (gcry_cipher_hd_t c, byte *result, const byte *buf, size_t nblocks); @@ -277,9 +289,6 @@ struct gcry_cipher_handle unsigned char key[MAX_BLOCKSIZE]; } u_ghash_key; - /* GHASH implementation in use. */ - ghash_fn_t ghash_fn; - /* Pre-calculated table for GCM. */ #ifdef GCM_USE_TABLES #if (SIZEOF_UNSIGNED_LONG == 8 || defined(__x86_64__)) @@ -290,6 +299,9 @@ struct gcry_cipher_handle u32 gcm_table[4 * 16]; #endif #endif + + /* GHASH implementation in use. */ + ghash_fn_t ghash_fn; } gcm; /* Mode specific storage for OCB mode. */ diff --git a/configure.ac b/configure.ac index be35ce42..202ac888 100644 --- a/configure.ac +++ b/configure.ac @@ -2752,6 +2752,25 @@ case "${host}" in ;; esac +# Arch specific GCM implementations +case "${host}" in + powerpc64le-*-*) + GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-ppc.lo" + ;; + powerpc64-*-*) + GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-ppc.lo" + ;; + powerpc-*-*) + GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-ppc.lo" + ;; +esac + +LIST_MEMBER(sm3, $enabled_digests) +if test "$found" = "1" ; then + GCRYPT_DIGESTS="$GCRYPT_DIGESTS sm3.lo" + AC_DEFINE(USE_SM3, 1, [Defined if this module should be included]) +fi + LIST_MEMBER(scrypt, $enabled_kdfs) if test "$found" = "1" ; then GCRYPT_KDFS="$GCRYPT_KDFS scrypt.lo" diff --git a/tests/basic.c b/tests/basic.c index 0bd80201..06808d4a 100644 --- a/tests/basic.c +++ b/tests/basic.c @@ -1553,6 +1553,22 @@ _check_gcm_cipher (unsigned int step) "\x0f\xc0\xc3\xb7\x80\xf2\x44\x45\x2d\xa3\xeb\xf1\xc5\xd8\x2c\xde" "\xa2\x41\x89\x97\x20\x0e\xf8\x2e\x44\xae\x7e\x3f", "\xa4\x4a\x82\x66\xee\x1c\x8e\xb0\xc8\xb5\xd4\xcf\x5a\xe9\xf1\x9a" }, + { GCRY_CIPHER_AES256, + "\xfe\xff\xe9\x92\x86\x65\x73\x1c\x6d\x6a\x8f\x94\x67\x30\x83\x08" + "\xfe\xff\xe9\x92\x86\x65\x73\x1c\x6d\x6a\x8f\x94\x67\x30\x83\x08", + "\xca\xfe\xba\xbe\xfa\xce\xdb\xad\xde\xca\xf8\x88", 12, + "\xfe\xed\xfa\xce\xde\xad\xbe\xef\xfe\xed\xfa\xce\xde\xad\xbe\xef" + "\xab\xad\xda\xd2", 20, + "\xd9\x31\x32\x25\xf8\x84\x06\xe5\xa5\x59\x09\xc5\xaf\xf5\x26\x9a" + "\x86\xa7\xa9\x53\x15\x34\xf7\xda\x2e\x4c\x30\x3d\x8a\x31\x8a\x72" + "\x1c\x3c\x0c\x95\x95\x68\x09\x53\x2f\xcf\x0e\x24\x49\xa6\xb5\x25" + "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57\xba\x63\x7b\x39", + 60, + "\x52\x2d\xc1\xf0\x99\x56\x7d\x07\xf4\x7f\x37\xa3\x2a\x84\x42\x7d" + "\x64\x3a\x8c\xdc\xbf\xe5\xc0\xc9\x75\x98\xa2\xbd\x25\x55\xd1\xaa" + "\x8c\xb0\x8e\x48\x59\x0d\xbb\x3d\xa7\xb0\x8b\x10\x56\x82\x88\x38" + "\xc5\xf6\x1e\x63\x93\xba\x7a\x0a\xbc\xc9\xf6\x62", + "\x76\xfc\x6e\xce\x0f\x4e\x17\x68\xcd\xdf\x88\x53\xbb\x2d\x55\x1b" }, /* Test vectors for overflowing CTR. 
*/ /* After setiv, ctr_low: 0xffffffff */ { GCRY_CIPHER_AES256, diff --git a/cipher/cipher-gcm-ppc.c b/cipher/cipher-gcm-ppc.c index ed27ef15..2f60c09d 100644 --- a/cipher/cipher-gcm-ppc.c +++ b/cipher/cipher-gcm-ppc.c @@ -93,112 +93,157 @@ typedef vector signed char vector16x_s8; typedef vector unsigned long long vector2x_u64; typedef vector unsigned long long block; +static ASM_FUNC_ATTR_INLINE block +asm_xor(block a, block b) +{ + block r; + __asm__ volatile ("xxlxor %x0, %x1, %x2" + : "=wa" (r) + : "wa" (a), "wa" (b)); + return r; +} + static ASM_FUNC_ATTR_INLINE block asm_vpmsumd(block a, block b) { block r; - __asm__("vpmsumd %0, %1, %2" - : "=v" (r) - : "v" (a), "v" (b)); + __asm__ volatile ("vpmsumd %0, %1, %2" + : "=v" (r) + : "v" (a), "v" (b)); return r; } static ASM_FUNC_ATTR_INLINE block asm_swap_u64(block a) { - __asm__("xxswapd %x0, %x1" - : "=wa" (a) - : "wa" (a)); - return a; + block r; + __asm__ volatile ("xxswapd %x0, %x1" + : "=wa" (r) + : "wa" (a)); + return r; } static ASM_FUNC_ATTR_INLINE block -asm_rot_block_left(block a) +asm_mergelo(block l, block r) { - block zero = {0, 0}; - block mask = {2, 0}; - return __builtin_shuffle(a, zero, mask); + block ret; + __asm__ volatile ("xxmrgld %x0, %x1, %x2\n\t" + : "=wa" (ret) + : "wa" (l), "wa" (r)); + return ret; } static ASM_FUNC_ATTR_INLINE block -asm_rot_block_right(block a) +asm_mergehi(block l, block r) { - block zero = {0, 0}; - block mask = {1, 2}; - return __builtin_shuffle(a, zero, mask); + block ret; + __asm__ volatile ("xxmrghd %x0, %x1, %x2\n\t" + : "=wa" (ret) + : "wa" (l), "wa" (r)); + return ret; } -/* vsl is a slightly strange function in the way the shift is passed... */ static ASM_FUNC_ATTR_INLINE block -asm_ashl_128(block a, vector16x_u8 shift) +asm_rot_block_left(block a) { block r; - __asm__("vsl %0, %1, %2" - : "=v" (r) - : "v" (a), "v" (shift)); + block zero = { 0, 0 }; + __asm__ volatile ("xxmrgld %x0, %x1, %x2" + : "=wa" (r) + : "wa" (a), "wa" (zero)); return r; } -#define ALIGNED_LOAD(in_ptr) \ - (vec_aligned_ld (0, (const unsigned char *)(in_ptr))) +static ASM_FUNC_ATTR_INLINE block +asm_rot_block_right(block a) +{ + block r; + block zero = { 0, 0 }; + __asm__ volatile ("xxsldwi %x0, %x2, %x1, 2" + : "=wa" (r) + : "wa" (a), "wa" (zero)); + return r; +} +/* vsl is a slightly strange function in the way the shift is passed... */ static ASM_FUNC_ATTR_INLINE block -vec_aligned_ld(unsigned long offset, const unsigned char *ptr) +asm_ashl_128(block a, vector16x_u8 shift) { -#ifndef WORDS_BIGENDIAN - block vec; - __asm__ ("lvx %0,%1,%2\n\t" - : "=v" (vec) - : "r" (offset), "r" ((uintptr_t)ptr) - : "memory", "r0"); - return vec; -#else - return vec_vsx_ld (offset, ptr); -#endif + block r; + __asm__ volatile ("vsl %0, %1, %2" + : "=v" (r) + : "v" (a), "v" (shift)); + return r; } #define STORE_TABLE(gcm_table, slot, vec) \ - vec_aligned_st (((block)vec), slot * 16, (unsigned char *)(gcm_table)); - + vec_store_he (((block)vec), slot * 16, (unsigned char *)(gcm_table)); static ASM_FUNC_ATTR_INLINE void -vec_aligned_st(block vec, unsigned long offset, unsigned char *ptr) +vec_store_he(block vec, unsigned long offset, unsigned char *ptr) { #ifndef WORDS_BIGENDIAN - __asm__ ("stvx %0,%1,%2\n\t" - : - : "v" (vec), "r" (offset), "r" ((uintptr_t)ptr) - : "memory", "r0"); + /* GCC vec_vsx_ld is generating two instructions on little-endian. Use + * lxvd2x directly instead. 
*/ +#if __GNUC__ >= 4 + if (__builtin_constant_p (offset) && offset == 0) + __asm__ volatile ("stxvd2x %x0, 0, %1\n\t" + : + : "wa" (vec), "r" ((uintptr_t)ptr) + : "memory", "r0"); + else +#endif + __asm__ volatile ("stxvd2x %x0, %1, %2\n\t" + : + : "wa" (vec), "r" (offset), "r" ((uintptr_t)ptr) + : "memory", "r0"); #else vec_vsx_st ((vector16x_u8)vec, offset, ptr); #endif } #define VEC_LOAD_BE(in_ptr, bswap_const) \ - (vec_load_be (0, (const unsigned char *)(in_ptr), bswap_const)) + vec_be_swap(vec_load_he (0, (const unsigned char *)(in_ptr)), bswap_const) static ASM_FUNC_ATTR_INLINE block -vec_load_be(unsigned long offset, const unsigned char *ptr, - vector unsigned char be_bswap_const) +vec_load_he(unsigned long offset, const unsigned char *ptr) { #ifndef WORDS_BIGENDIAN block vec; /* GCC vec_vsx_ld is generating two instructions on little-endian. Use - * lxvw4x directly instead. */ - __asm__ ("lxvw4x %x0,%1,%2\n\t" - : "=wa" (vec) - : "r" (offset), "r" ((uintptr_t)ptr) - : "memory", "r0"); - __asm__ ("vperm %0,%1,%1,%2\n\t" - : "=v" (vec) - : "v" (vec), "v" (be_bswap_const)); + * lxvd2x directly instead. */ +#if __GNUC__ >= 4 + if (__builtin_constant_p (offset) && offset == 0) + __asm__ volatile ("lxvd2x %x0, 0, %1\n\t" + : "=wa" (vec) + : "r" ((uintptr_t)ptr) + : "memory", "r0"); + else +#endif + __asm__ volatile ("lxvd2x %x0, %1, %2\n\t" + : "=wa" (vec) + : "r" (offset), "r" ((uintptr_t)ptr) + : "memory", "r0"); return vec; #else - (void)be_bswap_const; return vec_vsx_ld (offset, ptr); #endif } +static ASM_FUNC_ATTR_INLINE block +vec_be_swap(block vec, vector16x_u8 be_bswap_const) +{ +#ifndef WORDS_BIGENDIAN + __asm__ volatile ("vperm %0, %1, %1, %2\n\t" + : "=v" (vec) + : "v" (vec), "v" (be_bswap_const)); +#else + (void)be_bswap_const; +#endif + return vec; +} + + /* Power ghash based on papers: "The Galois/Counter Mode of Operation (GCM)"; David A. 
McGrew, John Viega "Intel® Carry-Less Multiplication Instruction and its Usage for Computing @@ -216,15 +261,16 @@ vec_load_be(unsigned long offset, const unsigned char *ptr, void ASM_FUNC_ATTR _gcry_ghash_setup_ppc_vpmsum (uint64_t *gcm_table, void *gcm_key) { - vector16x_u8 bswap_const = - { 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 }; - vector16x_u8 c2 = + static const vector16x_u8 bswap_const = + { ~7, ~6, ~5, ~4, ~3, ~2, ~1, ~0, ~15, ~14, ~13, ~12, ~11, ~10, ~9, ~8 }; + static const vector16x_u8 c2 = { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0b11000010 }; + static const vector16x_u8 one = + { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; block T0, T1, T2; block C2, H, H1, H1l, H1h, H2, H2l, H2h; block H3l, H3, H3h, H4l, H4, H4h, T3, T4; vector16x_s8 most_sig_of_H, t7, carry; - vector16x_u8 one = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }; H = VEC_LOAD_BE(gcm_key, bswap_const); most_sig_of_H = vec_splat((vector16x_s8)H, 15); @@ -255,7 +301,7 @@ _gcry_ghash_setup_ppc_vpmsum (uint64_t *gcm_table, void *gcm_key) /* reduce 1 */ T0 = asm_vpmsumd (H2l, C2); - H2l ^= asm_rot_block_left (H2);; + H2l ^= asm_rot_block_left (H2); H2h ^= asm_rot_block_right (H2); H2l = asm_swap_u64 (H2l); H2l ^= T0; @@ -321,45 +367,30 @@ _gcry_ghash_setup_ppc_vpmsum (uint64_t *gcm_table, void *gcm_key) STORE_TABLE (gcm_table, 12, H4h); } -ASM_FUNC_ATTR_INLINE -block -vec_perm2(block l, block r, vector16x_u8 perm) { - block ret; - __asm__ ("vperm %0,%1,%2,%3\n\t" - : "=v" (ret) - : "v" (l), "v" (r), "v" (perm)); - return ret; -} - void ASM_FUNC_ATTR -_gcry_ghash_ppc_vpmsum (const byte *result, const void *const gcm_table, - const byte *const buf, const size_t nblocks) +_gcry_ghash_ppc_vpmsum (byte *result, const void *const gcm_table, + const byte *buf, const size_t nblocks) { - /* This const is strange, it is reversing the bytes, and also reversing - the u32s that get switched by lxvw4 and it also addresses bytes big-endian, - and is here due to lack of proper peep-hole optimization. 
*/ - vector16x_u8 bswap_const = - { 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 }; - vector16x_u8 bswap_8_const = - { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; + static const vector16x_u8 bswap_const = + { ~7, ~6, ~5, ~4, ~3, ~2, ~1, ~0, ~15, ~14, ~13, ~12, ~11, ~10, ~9, ~8 }; block c2, H0l, H0m, H0h, H4l, H4m, H4h, H2m, H3l, H3m, H3h, Hl; block Hm, Hh, in, in0, in1, in2, in3, Hm_right, Hl_rotate, cur; - size_t blocks_remaining = nblocks, off = 0; + size_t blocks_remaining = nblocks; size_t not_multiple_of_four; block t0; - cur = vec_load_be (0, result, bswap_const); + cur = vec_be_swap (vec_load_he (0, result), bswap_const); - c2 = vec_aligned_ld (0, gcm_table); - H0l = vec_aligned_ld (16, gcm_table); - H0m = vec_aligned_ld (32, gcm_table); - H0h = vec_aligned_ld (48, gcm_table); + c2 = vec_load_he (0, gcm_table); + H0l = vec_load_he (16, gcm_table); + H0m = vec_load_he (32, gcm_table); + H0h = vec_load_he (48, gcm_table); for (not_multiple_of_four = nblocks % 4; not_multiple_of_four; not_multiple_of_four--) { - in = vec_load_be (off, buf, bswap_const); - off += 16; + in = vec_be_swap (vec_load_he (0, buf), bswap_const); + buf += 16; blocks_remaining--; cur ^= in; @@ -385,62 +416,64 @@ _gcry_ghash_ppc_vpmsum (const byte *result, const void *const gcm_table, if (blocks_remaining > 0) { - vector16x_u8 hiperm = - { - 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10, - 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0 - }; - vector16x_u8 loperm = - { - 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18, - 0xf, 0xe, 0xd, 0xc, 0xb, 0xa, 0x9, 0x8 - }; block Xl, Xm, Xh, Xl1, Xm1, Xh1, Xm2, Xl3, Xm3, Xh3, Xl_rotate; block H21l, H21h, merge_l, merge_h; - - H2m = vec_aligned_ld (48 + 32, gcm_table); - H3l = vec_aligned_ld (48 * 2 + 16, gcm_table); - H3m = vec_aligned_ld (48 * 2 + 32, gcm_table); - H3h = vec_aligned_ld (48 * 2 + 48, gcm_table); - H4l = vec_aligned_ld (48 * 3 + 16, gcm_table); - H4m = vec_aligned_ld (48 * 3 + 32, gcm_table); - H4h = vec_aligned_ld (48 * 3 + 48, gcm_table); - - in0 = vec_load_be (off, buf, bswap_const); - in1 = vec_load_be (off + 16, buf, bswap_const); - in2 = vec_load_be (off + 32, buf, bswap_const); - in3 = vec_load_be (off + 48, buf, bswap_const); - blocks_remaining -= 4; - off += 64; - - Xh = in0 ^ cur; + block t1, t2; + + H2m = vec_load_he (48 + 32, gcm_table); + H3l = vec_load_he (48 * 2 + 16, gcm_table); + H3m = vec_load_he (48 * 2 + 32, gcm_table); + H3h = vec_load_he (48 * 2 + 48, gcm_table); + H4l = vec_load_he (48 * 3 + 16, gcm_table); + H4m = vec_load_he (48 * 3 + 32, gcm_table); + H4h = vec_load_he (48 * 3 + 48, gcm_table); + + in0 = vec_load_he (0, buf); + in1 = vec_load_he (16, buf); + in2 = vec_load_he (32, buf); + in3 = vec_load_he (48, buf); + in0 = vec_be_swap(in0, bswap_const); + in1 = vec_be_swap(in1, bswap_const); + in2 = vec_be_swap(in2, bswap_const); + in3 = vec_be_swap(in3, bswap_const); + + Xh = asm_xor (in0, cur); Xl1 = asm_vpmsumd (in1, H3l); Xm1 = asm_vpmsumd (in1, H3m); Xh1 = asm_vpmsumd (in1, H3h); - H21l = vec_perm2 (H2m, H0m, hiperm); - H21h = vec_perm2 (H2m, H0m, loperm); - merge_l = vec_perm2 (in2, in3, loperm); - merge_h = vec_perm2 (in2, in3, hiperm); + H21l = asm_mergehi (H2m, H0m); + H21h = asm_mergelo (H2m, H0m); + merge_l = asm_mergelo (in2, in3); + merge_h = asm_mergehi (in2, in3); Xm2 = asm_vpmsumd (in2, H2m); Xl3 = asm_vpmsumd (merge_l, H21l); Xm3 = asm_vpmsumd (in3, H0m); Xh3 = asm_vpmsumd (merge_h, H21h); - Xm2 ^= Xm1; - Xl3 ^= Xl1; - Xm3 ^= Xm2; - Xh3 ^= Xh1; + Xm2 = asm_xor (Xm2, Xm1); + Xl3 = asm_xor (Xl3, Xl1); + 
Xm3 = asm_xor (Xm3, Xm2); + Xh3 = asm_xor (Xh3, Xh1); /* Gerald Estrin's scheme for parallel multiplication of polynomials */ - for (;blocks_remaining > 0; blocks_remaining -= 4, off += 64) + while (1) { - in0 = vec_load_be (off, buf, bswap_const); - in1 = vec_load_be (off + 16, buf, bswap_const); - in2 = vec_load_be (off + 32, buf, bswap_const); - in3 = vec_load_be (off + 48, buf, bswap_const); + buf += 64; + blocks_remaining -= 4; + if (!blocks_remaining) + break; + + in0 = vec_load_he (0, buf); + in1 = vec_load_he (16, buf); + in2 = vec_load_he (32, buf); + in3 = vec_load_he (48, buf); + in1 = vec_be_swap(in1, bswap_const); + in2 = vec_be_swap(in2, bswap_const); + in3 = vec_be_swap(in3, bswap_const); + in0 = vec_be_swap(in0, bswap_const); Xl = asm_vpmsumd (Xh, H4l); Xm = asm_vpmsumd (Xh, H4m); @@ -449,62 +482,63 @@ _gcry_ghash_ppc_vpmsum (const byte *result, const void *const gcm_table, Xm1 = asm_vpmsumd (in1, H3m); Xh1 = asm_vpmsumd (in1, H3h); - Xl ^= Xl3; - Xm ^= Xm3; - Xh ^= Xh3; - merge_l = vec_perm2 (in2, in3, loperm); - merge_h = vec_perm2 (in2, in3, hiperm); + Xl = asm_xor (Xl, Xl3); + Xm = asm_xor (Xm, Xm3); + Xh = asm_xor (Xh, Xh3); + merge_l = asm_mergelo (in2, in3); + merge_h = asm_mergehi (in2, in3); t0 = asm_vpmsumd (Xl, c2); Xl3 = asm_vpmsumd (merge_l, H21l); Xh3 = asm_vpmsumd (merge_h, H21h); - Xl ^= asm_rot_block_left (Xm); - Xh ^= asm_rot_block_right (Xm); + t1 = asm_rot_block_left (Xm); + t2 = asm_rot_block_right (Xm); + Xl = asm_xor(Xl, t1); + Xh = asm_xor(Xh, t2); Xl = asm_swap_u64 (Xl); - Xl ^= t0; + Xl = asm_xor(Xl, t0); Xl_rotate = asm_swap_u64 (Xl); Xm2 = asm_vpmsumd (in2, H2m); Xm3 = asm_vpmsumd (in3, H0m); Xl = asm_vpmsumd (Xl, c2); - Xl3 ^= Xl1; - Xh3 ^= Xh1; - Xh ^= in0; - Xm2 ^= Xm1; - Xh ^= Xl_rotate; - Xm3 ^= Xm2; - Xh ^= Xl; + Xl3 = asm_xor (Xl3, Xl1); + Xh3 = asm_xor (Xh3, Xh1); + Xh = asm_xor (Xh, in0); + Xm2 = asm_xor (Xm2, Xm1); + Xh = asm_xor (Xh, Xl_rotate); + Xm3 = asm_xor (Xm3, Xm2); + Xh = asm_xor (Xh, Xl); } Xl = asm_vpmsumd (Xh, H4l); Xm = asm_vpmsumd (Xh, H4m); Xh = asm_vpmsumd (Xh, H4h); - Xl ^= Xl3; - Xm ^= Xm3; + Xl = asm_xor (Xl, Xl3); + Xm = asm_xor (Xm, Xm3); t0 = asm_vpmsumd (Xl, c2); - Xh ^= Xh3; - Xl ^= asm_rot_block_left (Xm); - Xh ^= asm_rot_block_right (Xm); + Xh = asm_xor (Xh, Xh3); + t1 = asm_rot_block_left (Xm); + t2 = asm_rot_block_right (Xm); + Xl = asm_xor (Xl, t1); + Xh = asm_xor (Xh, t2); Xl = asm_swap_u64 (Xl); - Xl ^= t0; + Xl = asm_xor (Xl, t0); Xl_rotate = asm_swap_u64 (Xl); Xl = asm_vpmsumd (Xl, c2); - Xl_rotate ^= Xh; - Xl ^= Xl_rotate; - - cur = Xl; + Xh = asm_xor (Xh, Xl_rotate); + cur = asm_xor (Xh, Xl); } - cur = (block)vec_perm ((vector16x_u8)cur, (vector16x_u8)cur, bswap_8_const); - STORE_TABLE (result, 0, cur); + vec_store_he (vec_be_swap (cur, bswap_const), 0, result); } #endif /* GCM_USE_PPC_VPMSUM */ diff --git a/cipher/Makefile.am b/cipher/Makefile.am index ab5d2a38..7a777ef2 100644 --- a/cipher/Makefile.am +++ b/cipher/Makefile.am @@ -42,8 +42,7 @@ libcipher_la_LIBADD = $(GCRYPT_MODULES) libcipher_la_SOURCES = \ cipher.c cipher-internal.h \ cipher-cbc.c cipher-cfb.c cipher-ofb.c cipher-ctr.c cipher-aeswrap.c \ -cipher-ccm.c cipher-cmac.c cipher-gcm.c cipher-gcm-intel-pclmul.c \ - cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \ +cipher-ccm.c cipher-cmac.c cipher-gcm.c \ cipher-poly1305.c cipher-ocb.c cipher-xts.c \ cipher-selftest.c cipher-selftest.h \ pubkey.c pubkey-internal.h pubkey-util.c \ @@ -66,7 +65,8 @@ blowfish.c blowfish-amd64.S blowfish-arm.S \ cast5.c cast5-amd64.S cast5-arm.S \ 
chacha20.c chacha20-sse2-amd64.S chacha20-ssse3-amd64.S chacha20-avx2-amd64.S \ chacha20-armv7-neon.S \ -cipher-gcm-ppc.c \ +cipher-gcm-ppc.c cipher-gcm-intel-pclmul.c \ + cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \ crc.c \ crc-intel-pclmul.c crc-ppc.c \ des.c des-amd64.S \ diff --git a/configure.ac b/configure.ac index fd447906..9bcb1318 100644 --- a/configure.ac +++ b/configure.ac @@ -2754,14 +2754,18 @@ esac # Arch specific GCM implementations case "${host}" in - powerpc64le-*-*) - GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-ppc.lo" + i?86-*-* | x86_64-*-*) + GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-intel-pclmul.lo" ;; - powerpc64-*-*) - GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-ppc.lo" + arm*-*-*) + GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-armv7-neon.lo" + GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-armv8-aarch32-ce.lo" + ;; + aarch64-*-*) + GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-armv8-aarch64-ce.lo" ;; - powerpc-*-*) - GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-ppc.lo" + powerpc64le-*-* | powerpc64-*-* | powerpc-*-*) + GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-ppc.lo" ;; esac
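
Note on the table layout and the four-block loop: _gcry_ghash_setup_ppc_vpmsum stores the
reduction constant followed by three table entries (low/mid/high) for each of H, H^2, H^3 and
H^4, and _gcry_ghash_ppc_vpmsum folds four blocks per iteration using the GF(2^128) identity
((((X ^ B1)*H ^ B2)*H ^ B3)*H ^ B4)*H = (X ^ B1)*H^4 ^ B2*H^3 ^ B3*H^2 ^ B4*H, so the four
vpmsumd multiplications are independent and roughly one reduction is needed per four-block
group. The sketch below is a minimal, unoptimized reference GHASH following the bit-reflected
convention of the McGrew/Viega paper cited in cipher-gcm-ppc.c; the helper names gf128_mul and
ghash_ref are illustrative placeholders (not libgcrypt API) and the code is only meant as an
independent cross-check of the vectorized path.

/* Reference (unoptimized) GHASH over GF(2^128), bit-reflected as in GCM.
   gf128_mul and ghash_ref are placeholder names, not part of libgcrypt.  */
#include <stdint.h>
#include <string.h>

static void
gf128_mul (uint8_t r[16], const uint8_t x[16], const uint8_t y[16])
{
  uint8_t z[16] = { 0 };
  uint8_t v[16];
  int i, j, k;

  memcpy (v, y, 16);            /* V = Y */
  for (i = 0; i < 16; i++)
    for (j = 7; j >= 0; j--)    /* bit 0 of X is the MSB of x[0] */
      {
        int lsb;

        if ((x[i] >> j) & 1)
          for (k = 0; k < 16; k++)
            z[k] ^= v[k];       /* Z ^= V when the current bit of X is set */

        lsb = v[15] & 1;        /* V >>= 1; reduce by R = 0xe1||0^120 on carry */
        for (k = 15; k > 0; k--)
          v[k] = (uint8_t)((v[k] >> 1) | (v[k - 1] << 7));
        v[0] >>= 1;
        if (lsb)
          v[0] ^= 0xe1;
      }
  memcpy (r, z, 16);
}

/* state <- (...((state ^ B1)*H ^ B2)*H ... ^ Bn)*H, one block at a time.  */
static void
ghash_ref (uint8_t state[16], const uint8_t h[16],
           const uint8_t *buf, size_t nblocks)
{
  uint8_t tmp[16];
  size_t k;

  while (nblocks--)
    {
      for (k = 0; k < 16; k++)
        state[k] ^= buf[k];
      gf128_mul (tmp, state, h);
      memcpy (state, tmp, 16);
      buf += 16;
    }
}

Feeding the same key, initial state and data through ghash_ref and through
_gcry_ghash_ppc_vpmsum (after _gcry_ghash_setup_ppc_vpmsum) should yield identical 16-byte
digests, which is a convenient sanity check alongside the AES256 GCM vector added to
tests/basic.c above.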