@@ -1519,5 +1519,184 @@ void MacroAssembler::sha512_AVX2(XMMRegister msg, XMMRegister state0, XMMRegiste
 }
 }
+// Implemented using Intel IpSec implementation (intel-ipsec-mb on github)
+void MacroAssembler::sha512_update_ni_x1(Register arg_hash, Register arg_msg, Register ofs, Register limit, bool multi_block) {
+  Label done_hash, block_loop;
+  address K512_W = StubRoutines::x86::k512_W_addr();
+
+  vbroadcasti128(xmm15, ExternalAddress(StubRoutines::x86::pshuffle_byte_flip_mask_addr_sha512()), Assembler::AVX_256bit, r10);
+
+  // load current hash value and transform
+  vmovdqu(xmm0, Address(arg_hash));
+  vmovdqu(xmm1, Address(arg_hash, 32));
+  // ymm0 = D C B A, ymm1 = H G F E
+  vperm2i128(xmm2, xmm0, xmm1, 0x20);
+  vperm2i128(xmm3, xmm0, xmm1, 0x31);
+  // ymm2 = F E B A, ymm3 = H G D C
+  vpermq(xmm13, xmm2, 0x1b, Assembler::AVX_256bit);
+  vpermq(xmm14, xmm3, 0x1b, Assembler::AVX_256bit);
+  // ymm13 = A B E F, ymm14 = C D G H
+
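
The shuffle above only reorders the eight state words into the operand layout the SHA-NI round instruction expects. A minimal scalar sketch of the same repacking (plain C++, not HotSpot code; names are illustrative and lane order within each register is elided):

```cpp
#include <array>
#include <cstdint>

using State = std::array<uint64_t, 8>;  // A B C D E F G H, as stored at arg_hash

// sha512rnds2 consumes the state as two halves: one register holding
// (A, B, E, F) and the other holding (C, D, G, H).
std::array<uint64_t, 4> abef(const State& s) { return {s[0], s[1], s[4], s[5]}; }
std::array<uint64_t, 4> cdgh(const State& s) { return {s[2], s[3], s[6], s[7]}; }
```
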
+  lea(rax, ExternalAddress(K512_W));
+  align(32);
+  bind(block_loop);
+  vmovdqu(xmm11, xmm13); // ABEF
+  vmovdqu(xmm12, xmm14); // CDGH
+
+  // R0 - R3
+  vmovdqu(xmm0, Address(arg_msg, 0 * 32));
+  vpshufb(xmm3, xmm0, xmm15, Assembler::AVX_256bit); // ymm0 / ymm3 = W[0..3]
+  vpaddq(xmm0, xmm3, Address(rax, 0 * 32), Assembler::AVX_256bit);
+  sha512rnds2(xmm12, xmm11, xmm0);
+  vperm2i128(xmm0, xmm0, xmm0, 0x01);
+  sha512rnds2(xmm11, xmm12, xmm0);
+
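
Each sha512rnds2 retires two rounds, consuming the two W+K sums in the low 128 bits of its third operand; the vperm2i128 with 0x01 swaps the halves of ymm0 so the second instruction sees the other two sums, giving four rounds per pair. For reference, one round in scalar C++ (a sketch of the FIPS 180-4 round function, not HotSpot code):

```cpp
#include <cstdint>

static inline uint64_t rotr(uint64_t x, int n) { return (x >> n) | (x << (64 - n)); }

void sha512_round(uint64_t s[8], uint64_t wk) {       // s = {a,b,c,d,e,f,g,h}
  uint64_t S1  = rotr(s[4], 14) ^ rotr(s[4], 18) ^ rotr(s[4], 41);
  uint64_t ch  = (s[4] & s[5]) ^ (~s[4] & s[6]);
  uint64_t t1  = s[7] + S1 + ch + wk;                 // wk = W[t] + K[t]
  uint64_t S0  = rotr(s[0], 28) ^ rotr(s[0], 34) ^ rotr(s[0], 39);
  uint64_t maj = (s[0] & s[1]) ^ (s[0] & s[2]) ^ (s[1] & s[2]);
  uint64_t t2  = S0 + maj;
  for (int i = 7; i > 0; i--) s[i] = s[i - 1];        // h<-g, g<-f, ..., b<-a
  s[4] += t1;                                         // e <- d + t1
  s[0] = t1 + t2;                                     // a <- t1 + t2
}
```
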
+  // R4 - R7
+  vmovdqu(xmm0, Address(arg_msg, 1 * 32));
+  vpshufb(xmm4, xmm0, xmm15, Assembler::AVX_256bit); // ymm0 / ymm4 = W[4..7]
+  vpaddq(xmm0, xmm4, Address(rax, 1 * 32), Assembler::AVX_256bit);
+  sha512rnds2(xmm12, xmm11, xmm0);
+  vperm2i128(xmm0, xmm0, xmm0, 0x01);
+  sha512rnds2(xmm11, xmm12, xmm0);
+  sha512msg1(xmm3, xmm4); // ymm3 = W[0..3] + S0(W[1..4])
+
+  // R8 - R11
+  vmovdqu(xmm0, Address(arg_msg, 2 * 32));
+  vpshufb(xmm5, xmm0, xmm15, Assembler::AVX_256bit); // ymm0 / ymm5 = W[8..11]
+  vpaddq(xmm0, xmm5, Address(rax, 2 * 32), Assembler::AVX_256bit);
+  sha512rnds2(xmm12, xmm11, xmm0);
+  vperm2i128(xmm0, xmm0, xmm0, 0x01);
+  sha512rnds2(xmm11, xmm12, xmm0);
+  sha512msg1(xmm4, xmm5); // ymm4 = W[4..7] + S0(W[5..8])
+
+  // R12 - R15
+  vmovdqu(xmm0, Address(arg_msg, 3 * 32));
+  vpshufb(xmm6, xmm0, xmm15, Assembler::AVX_256bit); // ymm0 / ymm6 = W[12..15]
+  vpaddq(xmm0, xmm6, Address(rax, 3 * 32), Assembler::AVX_256bit);
+  vpermq(xmm8, xmm6, 0x1b, Assembler::AVX_256bit);   // ymm8 = W[12] W[13] W[14] W[15]
+  vpermq(xmm9, xmm5, 0x39, Assembler::AVX_256bit);   // ymm9 = W[8] W[11] W[10] W[9]
+  vpblendd(xmm8, xmm8, xmm9, 0x3f, Assembler::AVX_256bit); // ymm8 = W[12] W[11] W[10] W[9]
+  vpaddq(xmm3, xmm3, xmm8, Assembler::AVX_256bit);
+  sha512msg2(xmm3, xmm6); // W[16..19] = xmm3 + W[9..12] + S1(W[14..17])
+  sha512rnds2(xmm12, xmm11, xmm0);
+  vperm2i128(xmm0, xmm0, xmm0, 0x01);
+  sha512rnds2(xmm11, xmm12, xmm0);
+  sha512msg1(xmm5, xmm6); // ymm5 = W[8..11] + S0(W[9..12])
+
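
The vpermq/vpblendd pair assembles the misaligned W[t-7] window, which the SHA-NI message instructions do not supply themselves: sha512msg1 contributes W[t-16] + s0(W[t-15]), the vpaddq splices in W[t-7], and sha512msg2 finishes with s1(W[t-2]). The scalar recurrence being computed four lanes at a time (same rotr helper as in the round sketch above):

```cpp
#include <cstdint>

static inline uint64_t rotr(uint64_t x, int n) { return (x >> n) | (x << (64 - n)); }

// W[t] = W[t-16] + s0(W[t-15]) + W[t-7] + s1(W[t-2]), valid for t >= 16
uint64_t sha512_w(const uint64_t w[], int t) {
  uint64_t s0 = rotr(w[t - 15], 1) ^ rotr(w[t - 15], 8) ^ (w[t - 15] >> 7);
  uint64_t s1 = rotr(w[t - 2], 19) ^ rotr(w[t - 2], 61) ^ (w[t - 2] >> 6);
  return w[t - 16] + s0 + w[t - 7] + s1;
}
```
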
+  // R16 - R19, R32 - R35, R48 - R51
+  for (int i = 4, j = 3; j > 0; j--) {
+    vpaddq(xmm0, xmm3, Address(rax, i * 32), Assembler::AVX_256bit);
+    vpermq(xmm8, xmm3, 0x1b, Assembler::AVX_256bit);  // ymm8 = W[16] W[17] W[18] W[19]
+    vpermq(xmm9, xmm6, 0x39, Assembler::AVX_256bit);  // ymm9 = W[12] W[15] W[14] W[13]
+    vpblendd(xmm7, xmm8, xmm9, 0x3f, Assembler::AVX_256bit); // ymm7 = W[16] W[15] W[14] W[13]
+    vpaddq(xmm4, xmm4, xmm7, Assembler::AVX_256bit);  // ymm4 = W[4..7] + S0(W[5..8]) + W[13..16]
+    sha512msg2(xmm4, xmm3);                           // ymm4 += S1(W[14..17])
+    sha512rnds2(xmm12, xmm11, xmm0);
+    vperm2i128(xmm0, xmm0, xmm0, 0x01);
+    sha512rnds2(xmm11, xmm12, xmm0);
+    sha512msg1(xmm6, xmm3);                           // ymm6 = W[12..15] + S0(W[13..16])
+    i += 1;
+    // R20 - R23, R36 - R39, R52 - R55
+    vpaddq(xmm0, xmm4, Address(rax, i * 32), Assembler::AVX_256bit);
+    vpermq(xmm8, xmm4, 0x1b, Assembler::AVX_256bit);  // ymm8 = W[20] W[21] W[22] W[23]
+    vpermq(xmm9, xmm3, 0x39, Assembler::AVX_256bit);  // ymm9 = W[16] W[19] W[18] W[17]
+    vpblendd(xmm7, xmm8, xmm9, 0x3f, Assembler::AVX_256bit); // ymm7 = W[20] W[19] W[18] W[17]
+    vpaddq(xmm5, xmm5, xmm7, Assembler::AVX_256bit);  // ymm5 = W[8..11] + S0(W[9..12]) + W[17..20]
+    sha512msg2(xmm5, xmm4);                           // ymm5 += S1(W[18..21])
+    sha512rnds2(xmm12, xmm11, xmm0);
+    vperm2i128(xmm0, xmm0, xmm0, 0x01);
+    sha512rnds2(xmm11, xmm12, xmm0);
+    sha512msg1(xmm3, xmm4);                           // ymm3 = W[16..19] + S0(W[17..20])
+    i += 1;
+    // R24 - R27, R40 - R43, R56 - R59
+    vpaddq(xmm0, xmm5, Address(rax, i * 32), Assembler::AVX_256bit);
+    vpermq(xmm8, xmm5, 0x1b, Assembler::AVX_256bit);  // ymm8 = W[24] W[25] W[26] W[27]
+    vpermq(xmm9, xmm4, 0x39, Assembler::AVX_256bit);  // ymm9 = W[20] W[23] W[22] W[21]
+    vpblendd(xmm7, xmm8, xmm9, 0x3f, Assembler::AVX_256bit); // ymm7 = W[24] W[23] W[22] W[21]
+    vpaddq(xmm6, xmm6, xmm7, Assembler::AVX_256bit);  // ymm6 = W[12..15] + S0(W[13..16]) + W[21..24]
+    sha512msg2(xmm6, xmm5);                           // ymm6 += S1(W[22..25])
+    sha512rnds2(xmm12, xmm11, xmm0);
+    vperm2i128(xmm0, xmm0, xmm0, 0x01);
+    sha512rnds2(xmm11, xmm12, xmm0);
+    sha512msg1(xmm4, xmm5);                           // ymm4 = W[20..23] + S0(W[21..24])
+    i += 1;
+    // R28 - R31, R44 - R47, R60 - R63
+    vpaddq(xmm0, xmm6, Address(rax, i * 32), Assembler::AVX_256bit);
+    vpermq(xmm8, xmm6, 0x1b, Assembler::AVX_256bit);  // ymm8 = W[28] W[29] W[30] W[31]
+    vpermq(xmm9, xmm5, 0x39, Assembler::AVX_256bit);  // ymm9 = W[24] W[27] W[26] W[25]
+    vpblendd(xmm7, xmm8, xmm9, 0x3f, Assembler::AVX_256bit); // ymm7 = W[28] W[27] W[26] W[25]
+    vpaddq(xmm3, xmm3, xmm7, Assembler::AVX_256bit);  // ymm3 = W[16..19] + S0(W[17..20]) + W[25..28]
+    sha512msg2(xmm3, xmm6);                           // ymm3 += S1(W[26..29])
+    sha512rnds2(xmm12, xmm11, xmm0);
+    vperm2i128(xmm0, xmm0, xmm0, 0x01);
+    sha512rnds2(xmm11, xmm12, xmm0);
+    sha512msg1(xmm5, xmm6);                           // ymm5 = W[24..27] + S0(W[25..28])
+    i += 1;
+  }
+  // R64 - R67
+  vpaddq(xmm0, xmm3, Address(rax, 16 * 32), Assembler::AVX_256bit);
+  vpermq(xmm8, xmm3, 0x1b, Assembler::AVX_256bit);    // ymm8 = W[64] W[65] W[66] W[67]
+  vpermq(xmm9, xmm6, 0x39, Assembler::AVX_256bit);    // ymm9 = W[60] W[63] W[62] W[61]
+  vpblendd(xmm7, xmm8, xmm9, 0x3f, Assembler::AVX_256bit); // ymm7 = W[64] W[63] W[62] W[61]
+  vpaddq(xmm4, xmm4, xmm7, Assembler::AVX_256bit);    // ymm4 = W[52..55] + S0(W[53..56]) + W[61..64]
+  sha512msg2(xmm4, xmm3);                             // ymm4 += S1(W[62..65])
+  sha512rnds2(xmm12, xmm11, xmm0);
+  vperm2i128(xmm0, xmm0, xmm0, 0x01);
+  sha512rnds2(xmm11, xmm12, xmm0);
+  sha512msg1(xmm6, xmm3);                             // ymm6 = W[60..63] + S0(W[61..64])
+
+  // R68 - R71
+  vpaddq(xmm0, xmm4, Address(rax, 17 * 32), Assembler::AVX_256bit);
+  vpermq(xmm8, xmm4, 0x1b, Assembler::AVX_256bit);    // ymm8 = W[68] W[69] W[70] W[71]
+  vpermq(xmm9, xmm3, 0x39, Assembler::AVX_256bit);    // ymm9 = W[64] W[67] W[66] W[65]
+  vpblendd(xmm7, xmm8, xmm9, 0x3f, Assembler::AVX_256bit); // ymm7 = W[68] W[67] W[66] W[65]
+  vpaddq(xmm5, xmm5, xmm7, Assembler::AVX_256bit);    // ymm5 = W[56..59] + S0(W[57..60]) + W[65..68]
+  sha512msg2(xmm5, xmm4);                             // ymm5 += S1(W[66..69])
+  sha512rnds2(xmm12, xmm11, xmm0);
+  vperm2i128(xmm0, xmm0, xmm0, 0x01);
+  sha512rnds2(xmm11, xmm12, xmm0);
+
+  // R72 - R75
+  vpaddq(xmm0, xmm5, Address(rax, 18 * 32), Assembler::AVX_256bit);
+  vpermq(xmm8, xmm5, 0x1b, Assembler::AVX_256bit);    // ymm8 = W[72] W[73] W[74] W[75]
+  vpermq(xmm9, xmm4, 0x39, Assembler::AVX_256bit);    // ymm9 = W[68] W[71] W[70] W[69]
+  vpblendd(xmm7, xmm8, xmm9, 0x3f, Assembler::AVX_256bit); // ymm7 = W[72] W[71] W[70] W[69]
+  vpaddq(xmm6, xmm6, xmm7, Assembler::AVX_256bit);    // ymm6 = W[60..63] + S0(W[61..64]) + W[69..72]
+  sha512msg2(xmm6, xmm5);                             // ymm6 += S1(W[70..73])
+  sha512rnds2(xmm12, xmm11, xmm0);
+  vperm2i128(xmm0, xmm0, xmm0, 0x01);
+  sha512rnds2(xmm11, xmm12, xmm0);
+
+  // R76 - R79
+  vpaddq(xmm0, xmm6, Address(rax, 19 * 32), Assembler::AVX_256bit);
+  sha512rnds2(xmm12, xmm11, xmm0);
+  vperm2i128(xmm0, xmm0, xmm0, 0x01);
+  sha512rnds2(xmm11, xmm12, xmm0);
+
+  // update hash value
+  vpaddq(xmm14, xmm14, xmm12, Assembler::AVX_256bit);
+  vpaddq(xmm13, xmm13, xmm11, Assembler::AVX_256bit);
+
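
Putting the two scalar sketches together gives the whole-block shape the unrolled code above implements: 80 rounds against a working copy of the state, then the vpaddq merge back into the running hash. In the sketch below, K512 stands in for the constant table behind k512_W_addr() (values omitted), and sha512_round/sha512_w are the hypothetical helpers from the earlier sketches:

```cpp
extern const uint64_t K512[80];  // SHA-512 round constants (assumed available)

void sha512_block(uint64_t state[8], const uint64_t msg_w[16]) {
  uint64_t w[80];
  for (int t = 0; t < 16; t++) w[t] = msg_w[t];
  for (int t = 16; t < 80; t++) w[t] = sha512_w(w, t);  // message schedule
  uint64_t s[8];
  for (int i = 0; i < 8; i++) s[i] = state[i];
  for (int t = 0; t < 80; t++) sha512_round(s, w[t] + K512[t]);
  for (int i = 0; i < 8; i++) state[i] += s[i];         // the vpaddq merge above
}
```
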
+  if (multi_block) {
+    addptr(arg_msg, 4 * 32);
+    addptr(ofs, 128);
+    cmpptr(ofs, limit);
+    jcc(Assembler::belowEqual, block_loop);
+    movptr(rax, ofs); // return ofs
+  }
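
The multi_block contract, as the block above implements it: at least one 128-byte block is processed, arg_msg and ofs advance by 128 per block, the loop continues while ofs <= limit, and the final offset is returned in rax. A caller-side equivalent in plain C++ (hypothetical names, building on sha512_block above; the byte swap is what the pshufb with the flip mask performs):

```cpp
#include <cstdint>

static inline uint64_t load_be64(const uint8_t* p) {  // SHA-512 words are big-endian
  uint64_t v = 0;
  for (int i = 0; i < 8; i++) v = (v << 8) | p[i];
  return v;
}

int sha512_multi(uint64_t hash[8], const uint8_t* msg, int ofs, int limit) {
  do {
    uint64_t w[16];
    for (int t = 0; t < 16; t++) w[t] = load_be64(msg + 8 * t);
    sha512_block(hash, w);
    msg += 128;             // addptr(arg_msg, 4 * 32)
    ofs += 128;             // addptr(ofs, 128)
  } while (ofs <= limit);   // jcc(belowEqual, block_loop)
  return ofs;               // movptr(rax, ofs)
}
```
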
+
+  // store the hash value back in memory
+  //   xmm13 = ABEF
+  //   xmm14 = CDGH
+  vperm2i128(xmm1, xmm13, xmm14, 0x31);
+  vperm2i128(xmm2, xmm13, xmm14, 0x20);
+  vpermq(xmm1, xmm1, 0xb1, Assembler::AVX_256bit); // ymm1 = D C B A
+  vpermq(xmm2, xmm2, 0xb1, Assembler::AVX_256bit); // ymm2 = H G F E
+  vmovdqu(Address(arg_hash, 0 * 32), xmm1);
+  vmovdqu(Address(arg_hash, 1 * 32), xmm2);
+
+  bind(done_hash);
+}
+
 #endif // #ifdef _LP64