Diffstat (limited to 'third_party/bearssl/src/aes_pwr8.c')
-rw-r--r--  third_party/bearssl/src/aes_pwr8.c  445
1 file changed, 445 insertions(+), 0 deletions(-)
diff --git a/third_party/bearssl/src/aes_pwr8.c b/third_party/bearssl/src/aes_pwr8.c
new file mode 100644
index 0000000..b2c63c3
--- /dev/null
+++ b/third_party/bearssl/src/aes_pwr8.c
@@ -0,0 +1,445 @@
+/*
+ * Copyright (c) 2017 Thomas Pornin <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining 
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be 
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#define BR_POWER_ASM_MACROS   1
+#include "inner.h"
+
+/*
+ * This code contains the AES key schedule implementation using the
+ * POWER8 opcodes.
+ */
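+
+/*
+ * For reference, the scalar key expansion vectorized below is the one
+ * from FIPS 197: with Nk words of key and 4*(Nr+1) words of output,
+ *
+ *    W[i] = W[i-Nk] ^ SubWord(RotWord(W[i-1])) ^ Rcon[i/Nk]   if i %% Nk == 0
+ *    W[i] = W[i-Nk] ^ SubWord(W[i-1])                         if Nk == 8 and i %% Nk == 4
+ *    W[i] = W[i-Nk] ^ W[i-1]                                  otherwise
+ *
+ * with Rcon[j] = {02}^(j-1) in GF(2^8).
+ */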
+
+#if BR_POWER8
+
+static void
+key_schedule_128(unsigned char *sk, const unsigned char *key)
+{
+	long cc;
+
+	static const uint32_t fmod[] = { 0x11B, 0x11B, 0x11B, 0x11B };
+#if BR_POWER8_LE
+	static const uint32_t idx2be[] = {
+		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+	};
+#endif
+
+	cc = 0;
+
+	/*
+	 * We use the VSX instructions for loading and storing the
+	 * key/subkeys, since they support unaligned accesses. The rest
+	 * of the computation is VMX only. VMX register 0 is VSX
+	 * register 32.
+	 */
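+	/*
+	 * (Hence the register numbers 34, 36, 39 and 40 in the
+	 * lxvw4x/stxvw4x lines below: they are the VSX names of the
+	 * VMX registers v2, v4, v7 and v8.)
+	 */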
+	asm volatile (
+
+		/*
+		 * v0 = all-zero word
+		 * v1 = constant -8 / +8, copied into four words
+		 * v2 = current subkey
+		 * v3 = Rcon (x4 words)
+		 * v6 = constant 8, copied into four words
+		 * v7 = constant 0x11B, copied into four words
+		 * v8 = constant for byteswapping words
+		 */
+		vspltisw(0, 0)
+#if BR_POWER8_LE
+		vspltisw(1, -8)
+#else
+		vspltisw(1, 8)
+#endif
+		lxvw4x(34, 0, %[key])
+		vspltisw(3, 1)
+		vspltisw(6, 8)
+		lxvw4x(39, 0, %[fmod])
+#if BR_POWER8_LE
+		lxvw4x(40, 0, %[idx2be])
+#endif
+
+		/*
+		 * First subkey is a copy of the key itself.
+		 */
+#if BR_POWER8_LE
+		vperm(4, 2, 2, 8)
+		stxvw4x(36, 0, %[sk])
+#else
+		stxvw4x(34, 0, %[sk])
+#endif
+
+		/*
+		 * Loop must run 10 times: one iteration per subkey after
+		 * the initial copy of the key, for 11 subkeys in total.
+		 */
+		li(%[cc], 10)
+		mtctr(%[cc])
+	label(loop)
+		/* Increment subkey address */
+		addi(%[sk], %[sk], 16)
+
+		/* Compute SubWord(RotWord(temp)) xor Rcon  (into v4, splat) */
+		vrlw(4, 2, 1)
+		vsbox(4, 4)
+#if BR_POWER8_LE
+		vxor(4, 4, 3)
+#else
+		vsldoi(5, 3, 0, 3)
+		vxor(4, 4, 5)
+#endif
+		vspltw(4, 4, 3)
+
+		/* XOR words for next subkey */
+		vsldoi(5, 0, 2, 12)
+		vxor(2, 2, 5)
+		vsldoi(5, 0, 2, 12)
+		vxor(2, 2, 5)
+		vsldoi(5, 0, 2, 12)
+		vxor(2, 2, 5)
+		vxor(2, 2, 4)
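+		/*
+		 * Note: the three shift/XOR pairs above turn v2 into the
+		 * running XOR of the previous subkey words (word i becomes
+		 * w0^...^wi); XORing in the splatted value from v4 then
+		 * yields all four new words at once, since the scalar
+		 * recurrence nw[i] = nw[i-1] ^ w[i] telescopes to
+		 * nw[i] = temp ^ w0 ^ ... ^ wi.
+		 */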
+
+		/* Store next subkey */
+#if BR_POWER8_LE
+		vperm(4, 2, 2, 8)
+		stxvw4x(36, 0, %[sk])
+#else
+		stxvw4x(34, 0, %[sk])
+#endif
+
+		/* Update Rcon */
+		vadduwm(3, 3, 3)
+		vsrw(4, 3, 6)
+		vsubuwm(4, 0, 4)
+		vand(4, 4, 7)
+		vxor(3, 3, 4)
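+		/*
+		 * (The four instructions above double Rcon and reduce it
+		 * modulo the field polynomial without a branch: shifting
+		 * the doubled value right by 8 isolates the overflow bit,
+		 * negating that bit gives an all-ones mask when Rcon
+		 * exceeded 0xFF, and the AND with 0x11B selects the value
+		 * to XOR back in.)
+		 */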
+
+		bdnz(loop)
+
+: [sk] "+b" (sk), [cc] "+b" (cc)
+: [key] "b" (key), [fmod] "b" (fmod)
+#if BR_POWER8_LE
+	, [idx2be] "b" (idx2be)
+#endif
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "ctr", "memory"
+	);
+}
+
+static void
+key_schedule_192(unsigned char *sk, const unsigned char *key)
+{
+	long cc;
+
+#if BR_POWER8_LE
+	static const uint32_t idx2be[] = {
+		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+	};
+#endif
+
+	cc = 0;
+
+	/*
+	 * We use the VSX instructions for loading and storing the
+	 * key/subkeys, since they support unaligned accesses. The rest
+	 * of the computation is VMX only. VMX register 0 is VSX
+	 * register 32.
+	 */
+	asm volatile (
+
+		/*
+		 * v0 = all-zero word
+		 * v1 = constant -8 / +8, copied into four words
+		 * v2, v3 = current subkey
+		 * v5 = Rcon (x4 words) (already shifted on big-endian)
+		 * v6 = constant 8, copied into four words
+		 * v8 = constant for byteswapping words
+		 *
+		 * Only the two leftmost words of v3 (v3l below) carry
+		 * subkey data; the rightmost two words are ignored.
+		 */
+		vspltisw(0, 0)
+#if BR_POWER8_LE
+		vspltisw(1, -8)
+#else
+		vspltisw(1, 8)
+#endif
+		li(%[cc], 8)
+		lxvw4x(34, 0, %[key])
+		lxvw4x(35, %[cc], %[key])
+		vsldoi(3, 3, 0, 8)
+		vspltisw(5, 1)
+#if !BR_POWER8_LE
+		vsldoi(5, 5, 0, 3)
+#endif
+		vspltisw(6, 8)
+#if BR_POWER8_LE
+		lxvw4x(40, 0, %[idx2be])
+#endif
+
+		/*
+		 * Loop must run 8 times. Each iteration produces 256
+		 * bits of subkeys, with a 64-bit overlap.
+		 */
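+		/*
+		 * (Each iteration stores two full 128-bit vectors but
+		 * advances the output pointer by only 24 bytes, so the
+		 * last two stored words are overwritten by the next
+		 * iteration; this is the 64-bit overlap noted above.)
+		 */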
+		li(%[cc], 8)
+		mtctr(%[cc])
+		li(%[cc], 16)
+	label(loop)
+
+		/*
+		 * Last 6 words in v2:v3l. Compute next 6 words into
+		 * v3r:v4.
+		 */
+		vrlw(10, 3, 1)
+		vsbox(10, 10)
+		vxor(10, 10, 5)
+		vspltw(10, 10, 1)
+		vsldoi(11, 0, 10, 8)
+
+		vsldoi(12, 0, 2, 12)
+		vxor(12, 2, 12)
+		vsldoi(13, 0, 12, 12)
+		vxor(12, 12, 13)
+		vsldoi(13, 0, 12, 12)
+		vxor(12, 12, 13)
+
+		vspltw(13, 12, 3)
+		vxor(13, 13, 3)
+		vsldoi(14, 0, 3, 12)
+		vxor(13, 13, 14)
+
+		vsldoi(4, 12, 13, 8)
+		vsldoi(14, 0, 3, 8)
+		vsldoi(3, 14, 12, 8)
+
+		vxor(3, 3, 11)
+		vxor(4, 4, 10)
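+		/*
+		 * At this point v12 holds the running XOR of w0..w3 and
+		 * v13 extends it over w4 and w5; combining these with the
+		 * splatted SubWord(RotWord(w5))^Rcon value yields the six
+		 * new words, left in the right half of v3 (first two) and
+		 * in v4 (last four), ready for the shift below.
+		 */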
+
+		/*
+		 * Update Rcon. Since for a 192-bit key, we use only 8
+		 * such constants, we will not hit the field modulus,
+		 * so a simple shift (addition) works well.
+		 */
+		vadduwm(5, 5, 5)
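+		/* (Rcon only takes the values 0x01..0x80 here, so the
+		   doubling never overflows past 8 bits.) */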
+
+		/*
+		 * Write out v2 and v3: the current six subkey words plus
+		 * the first two words of the next six (the overlap).
+		 */
+#if BR_POWER8_LE
+		vperm(10, 2, 2, 8)
+		vperm(11, 3, 3, 8)
+		stxvw4x(42, 0, %[sk])
+		stxvw4x(43, %[cc], %[sk])
+#else
+		stxvw4x(34, 0, %[sk])
+		stxvw4x(35, %[cc], %[sk])
+#endif
+		addi(%[sk], %[sk], 24)
+
+		/*
+		 * Shift words for next iteration.
+		 */
+		vsldoi(2, 3, 4, 8)
+		vsldoi(3, 4, 0, 8)
+
+		bdnz(loop)
+
+		/*
+		 * The loop wrote the first 50 subkey words, but we need
+		 * to produce 52, so we must do one last write.
+		 */
+#if BR_POWER8_LE
+		vperm(10, 2, 2, 8)
+		stxvw4x(42, 0, %[sk])
+#else
+		stxvw4x(34, 0, %[sk])
+#endif
+
+: [sk] "+b" (sk), [cc] "+b" (cc)
+: [key] "b" (key)
+#if BR_POWER8_LE
+	, [idx2be] "b" (idx2be)
+#endif
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+  "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory"
+	);
+}
+
+static void
+key_schedule_256(unsigned char *sk, const unsigned char *key)
+{
+	long cc;
+
+#if BR_POWER8_LE
+	static const uint32_t idx2be[] = {
+		0x03020100, 0x07060504, 0x0B0A0908, 0x0F0E0D0C
+	};
+#endif
+
+	cc = 0;
+
+	/*
+	 * We use the VSX instructions for loading and storing the
+	 * key/subkeys, since they support unaligned accesses. The rest
+	 * of the computation is VMX only. VMX register 0 is VSX
+	 * register 32.
+	 */
+	asm volatile (
+
+		/*
+		 * v0 = all-zero word
+		 * v1 = constant -8 / +8, copied into four words
+		 * v2, v3 = current subkey
+		 * v6 = Rcon (x4 words) (already shifted on big-endian)
+		 * v7 = constant 8, copied into four words
+		 * v8 = constant for byteswapping words
+		 *
+		 * Unlike the 192-bit schedule, all eight words of v2:v3
+		 * carry key material here.
+		 */
+		vspltisw(0, 0)
+#if BR_POWER8_LE
+		vspltisw(1, -8)
+#else
+		vspltisw(1, 8)
+#endif
+		li(%[cc], 16)
+		lxvw4x(34, 0, %[key])
+		lxvw4x(35, %[cc], %[key])
+		vspltisw(6, 1)
+#if !BR_POWER8_LE
+		vsldoi(6, 6, 0, 3)
+#endif
+		vspltisw(7, 8)
+#if BR_POWER8_LE
+		lxvw4x(40, 0, %[idx2be])
+#endif
+
+		/*
+		 * Loop must run 7 times. Each iteration produces two
+		 * subkeys.
+		 */
+		li(%[cc], 7)
+		mtctr(%[cc])
+		li(%[cc], 16)
+	label(loop)
+
+		/*
+		 * The current two subkeys are in v2:v3. Compute the next
+		 * subkey (four words) into v4.
+		 */
+		vrlw(10, 3, 1)
+		vsbox(10, 10)
+		vxor(10, 10, 6)
+		vspltw(10, 10, 3)
+
+		vsldoi(4, 0, 2, 12)
+		vxor(4, 2, 4)
+		vsldoi(5, 0, 4, 12)
+		vxor(4, 4, 5)
+		vsldoi(5, 0, 4, 12)
+		vxor(4, 4, 5)
+		vxor(4, 4, 10)
+
+		/*
+		 * Then the following subkey into v5.
+		 */
+		vsbox(10, 4)
+		vspltw(10, 10, 3)
+
+		vsldoi(5, 0, 3, 12)
+		vxor(5, 3, 5)
+		vsldoi(11, 0, 5, 12)
+		vxor(5, 5, 11)
+		vsldoi(11, 0, 5, 12)
+		vxor(5, 5, 11)
+		vxor(5, 5, 10)
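+		/*
+		 * (For 256-bit keys, the second subkey of each pair
+		 * applies SubWord to the previous word without RotWord
+		 * and without Rcon; that is what the vsbox/vspltw pair
+		 * above implements.)
+		 */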
+
+		/*
+		 * Update Rcon. Since for a 256-bit key, we use only 7
+		 * such constants, we will not hit the field modulus,
+		 * so a simple shift (addition) works well.
+		 */
+		vadduwm(6, 6, 6)
+
+		/*
+		 * Write out the two current subkeys (v2 and v3).
+		 */
+#if BR_POWER8_LE
+		vperm(10, 2, 2, 8)
+		vperm(11, 3, 3, 8)
+		stxvw4x(42, 0, %[sk])
+		stxvw4x(43, %[cc], %[sk])
+#else
+		stxvw4x(34, 0, %[sk])
+		stxvw4x(35, %[cc], %[sk])
+#endif
+		addi(%[sk], %[sk], 32)
+
+		/*
+		 * Replace v2:v3 with v4:v5.
+		 */
+		vxor(2, 0, 4)
+		vxor(3, 0, 5)
+
+		bdnz(loop)
+
+		/*
+		 * The loop wrote the first 14 subkeys, but we need 15,
+		 * so we must do an extra write.
+		 */
+#if BR_POWER8_LE
+		vperm(10, 2, 2, 8)
+		stxvw4x(42, 0, %[sk])
+#else
+		stxvw4x(34, 0, %[sk])
+#endif
+
+: [sk] "+b" (sk), [cc] "+b" (cc)
+: [key] "b" (key)
+#if BR_POWER8_LE
+	, [idx2be] "b" (idx2be)
+#endif
+: "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
+  "v8", "v9", "v10", "v11", "v12", "v13", "v14", "ctr", "memory"
+	);
+}
+
+/* see inner.h */
+int
+br_aes_pwr8_supported(void)
+{
+	return 1;
+}
+
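+/*
+ * The key schedule writes (rounds + 1) 16-byte subkeys into sk, i.e.
+ * 176, 208 or 240 bytes depending on the key size, and returns the
+ * number of rounds. Illustrative use (variable names are hypothetical):
+ *
+ *    unsigned char skey[240];   // enough for 15 subkeys (256-bit case)
+ *    unsigned num_rounds = br_aes_pwr8_keysched(skey, key, key_len);
+ *    // skey now holds (num_rounds + 1) 16-byte subkeys.
+ */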
+/* see inner.h */
+unsigned
+br_aes_pwr8_keysched(unsigned char *sk, const void *key, size_t len)
+{
+	switch (len) {
+	case 16:
+		key_schedule_128(sk, key);
+		return 10;
+	case 24:
+		key_schedule_192(sk, key);
+		return 12;
+	default:
+		key_schedule_256(sk, key);
+		return 14;
+	}
+}
+
+#endif