From 8913ef20675bb7ca09221ac58883dde3a7b5672a Mon Sep 17 00:00:00 2001 From: Joshua Simmons Date: Sat, 18 Feb 2023 14:26:57 +0100 Subject: [PATCH] Add stripped down blake3 implementation Upstream URL: https://github.com/BLAKE3-team/BLAKE3/ --- narcissus-core/src/blake3/LICENSE | 330 +++++ narcissus-core/src/blake3/constant_time_eq.rs | 153 ++ narcissus-core/src/blake3/guts.rs | 103 ++ narcissus-core/src/blake3/join.rs | 62 + narcissus-core/src/blake3/mod.rs | 1225 +++++++++++++++++ narcissus-core/src/blake3/platform.rs | 296 ++++ narcissus-core/src/blake3/portable.rs | 195 +++ narcissus-core/src/blake3/reference_impl.rs | 384 ++++++ narcissus-core/src/blake3/rust_avx2.rs | 477 +++++++ narcissus-core/src/blake3/rust_sse2.rs | 774 +++++++++++ narcissus-core/src/blake3/rust_sse41.rs | 762 ++++++++++ narcissus-core/src/blake3/test.rs | 507 +++++++ narcissus-core/src/lib.rs | 1 + 13 files changed, 5269 insertions(+) create mode 100644 narcissus-core/src/blake3/LICENSE create mode 100644 narcissus-core/src/blake3/constant_time_eq.rs create mode 100644 narcissus-core/src/blake3/guts.rs create mode 100644 narcissus-core/src/blake3/join.rs create mode 100644 narcissus-core/src/blake3/mod.rs create mode 100644 narcissus-core/src/blake3/platform.rs create mode 100644 narcissus-core/src/blake3/portable.rs create mode 100644 narcissus-core/src/blake3/reference_impl.rs create mode 100644 narcissus-core/src/blake3/rust_avx2.rs create mode 100644 narcissus-core/src/blake3/rust_sse2.rs create mode 100644 narcissus-core/src/blake3/rust_sse41.rs create mode 100644 narcissus-core/src/blake3/test.rs diff --git a/narcissus-core/src/blake3/LICENSE b/narcissus-core/src/blake3/LICENSE new file mode 100644 index 0000000..4d38c4b --- /dev/null +++ b/narcissus-core/src/blake3/LICENSE @@ -0,0 +1,330 @@ +This work is released into the public domain with CC0 1.0. Alternatively, it is +licensed under the Apache License 2.0. + +------------------------------------------------------------------------------- + +Creative Commons Legal Code + +CC0 1.0 Universal + + CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE + LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN + ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS + INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES + REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS + PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM + THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED + HEREUNDER. + +Statement of Purpose + +The laws of most jurisdictions throughout the world automatically confer +exclusive Copyright and Related Rights (defined below) upon the creator +and subsequent owner(s) (each and all, an "owner") of an original work of +authorship and/or a database (each, a "Work"). + +Certain owners wish to permanently relinquish those rights to a Work for +the purpose of contributing to a commons of creative, cultural and +scientific works ("Commons") that the public can reliably and without fear +of later claims of infringement build upon, modify, incorporate in other +works, reuse and redistribute as freely as possible in any form whatsoever +and for any purposes, including without limitation commercial purposes. +These owners may contribute to the Commons to promote the ideal of a free +culture and the further production of creative, cultural and scientific +works, or to gain reputation or greater distribution for their Work in +part through the use and efforts of others. 
+ +For these and/or other purposes and motivations, and without any +expectation of additional consideration or compensation, the person +associating CC0 with a Work (the "Affirmer"), to the extent that he or she +is an owner of Copyright and Related Rights in the Work, voluntarily +elects to apply CC0 to the Work and publicly distribute the Work under its +terms, with knowledge of his or her Copyright and Related Rights in the +Work and the meaning and intended legal effect of CC0 on those rights. + +1. Copyright and Related Rights. A Work made available under CC0 may be +protected by copyright and related or neighboring rights ("Copyright and +Related Rights"). Copyright and Related Rights include, but are not +limited to, the following: + + i. the right to reproduce, adapt, distribute, perform, display, + communicate, and translate a Work; + ii. moral rights retained by the original author(s) and/or performer(s); +iii. publicity and privacy rights pertaining to a person's image or + likeness depicted in a Work; + iv. rights protecting against unfair competition in regards to a Work, + subject to the limitations in paragraph 4(a), below; + v. rights protecting the extraction, dissemination, use and reuse of data + in a Work; + vi. database rights (such as those arising under Directive 96/9/EC of the + European Parliament and of the Council of 11 March 1996 on the legal + protection of databases, and under any national implementation + thereof, including any amended or successor version of such + directive); and +vii. other similar, equivalent or corresponding rights throughout the + world based on applicable law or treaty, and any national + implementations thereof. + +2. Waiver. To the greatest extent permitted by, but not in contravention +of, applicable law, Affirmer hereby overtly, fully, permanently, +irrevocably and unconditionally waives, abandons, and surrenders all of +Affirmer's Copyright and Related Rights and associated claims and causes +of action, whether now known or unknown (including existing as well as +future claims and causes of action), in the Work (i) in all territories +worldwide, (ii) for the maximum duration provided by applicable law or +treaty (including future time extensions), (iii) in any current or future +medium and for any number of copies, and (iv) for any purpose whatsoever, +including without limitation commercial, advertising or promotional +purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each +member of the public at large and to the detriment of Affirmer's heirs and +successors, fully intending that such Waiver shall not be subject to +revocation, rescission, cancellation, termination, or any other legal or +equitable action to disrupt the quiet enjoyment of the Work by the public +as contemplated by Affirmer's express Statement of Purpose. + +3. Public License Fallback. Should any part of the Waiver for any reason +be judged legally invalid or ineffective under applicable law, then the +Waiver shall be preserved to the maximum extent permitted taking into +account Affirmer's express Statement of Purpose. 
In addition, to the +extent the Waiver is so judged Affirmer hereby grants to each affected +person a royalty-free, non transferable, non sublicensable, non exclusive, +irrevocable and unconditional license to exercise Affirmer's Copyright and +Related Rights in the Work (i) in all territories worldwide, (ii) for the +maximum duration provided by applicable law or treaty (including future +time extensions), (iii) in any current or future medium and for any number +of copies, and (iv) for any purpose whatsoever, including without +limitation commercial, advertising or promotional purposes (the +"License"). The License shall be deemed effective as of the date CC0 was +applied by Affirmer to the Work. Should any part of the License for any +reason be judged legally invalid or ineffective under applicable law, such +partial invalidity or ineffectiveness shall not invalidate the remainder +of the License, and in such case Affirmer hereby affirms that he or she +will not (i) exercise any of his or her remaining Copyright and Related +Rights in the Work or (ii) assert any associated claims and causes of +action with respect to the Work, in either case contrary to Affirmer's +express Statement of Purpose. + +4. Limitations and Disclaimers. + + a. No trademark or patent rights held by Affirmer are waived, abandoned, + surrendered, licensed or otherwise affected by this document. + b. Affirmer offers the Work as-is and makes no representations or + warranties of any kind concerning the Work, express, implied, + statutory or otherwise, including without limitation warranties of + title, merchantability, fitness for a particular purpose, non + infringement, or the absence of latent or other defects, accuracy, or + the present or absence of errors, whether or not discoverable, all to + the greatest extent permissible under applicable law. + c. Affirmer disclaims responsibility for clearing rights of other persons + that may apply to the Work or any use thereof, including without + limitation any person's Copyright and Related Rights in the Work. + Further, Affirmer disclaims responsibility for obtaining any necessary + consents, permissions or other rights required for any use of the + Work. + d. Affirmer understands and acknowledges that Creative Commons is not a + party to this document and has no duty or obligation with respect to + this CC0 or use of the Work. + +------------------------------------------------------------------------------- + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. 
+ + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. 
Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2019 Jack O'Connor and Samuel Neves + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
\ No newline at end of file diff --git a/narcissus-core/src/blake3/constant_time_eq.rs b/narcissus-core/src/blake3/constant_time_eq.rs new file mode 100644 index 0000000..7710a18 --- /dev/null +++ b/narcissus-core/src/blake3/constant_time_eq.rs @@ -0,0 +1,153 @@ +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[inline] +fn optimizer_hide(mut value: u8) -> u8 { + // SAFETY: the input value is passed unchanged to the output, the inline assembly does nothing. + unsafe { + core::arch::asm!("/* {0} */", inout(reg_byte) value, options(pure, nomem, nostack, preserves_flags)); + value + } +} + +#[cfg(any( + target_arch = "arm", + target_arch = "aarch64", + target_arch = "riscv32", + target_arch = "riscv64" +))] +#[allow(asm_sub_register)] +#[inline] +fn optimizer_hide(mut value: u8) -> u8 { + // SAFETY: the input value is passed unchanged to the output, the inline assembly does nothing. + unsafe { + core::arch::asm!("/* {0} */", inout(reg) value, options(pure, nomem, nostack, preserves_flags)); + value + } +} + +#[cfg(not(any( + target_arch = "x86", + target_arch = "x86_64", + target_arch = "arm", + target_arch = "aarch64", + target_arch = "riscv32", + target_arch = "riscv64" +)))] +#[inline(never)] // This function is non-inline to prevent the optimizer from looking inside it. +fn optimizer_hide(value: u8) -> u8 { + // SAFETY: the result of casting a reference to a pointer is valid; the type is Copy. + unsafe { core::ptr::read_volatile(&value) } +} + +#[inline] +fn constant_time_ne(a: &[u8], b: &[u8]) -> u8 { + assert!(a.len() == b.len()); + + // These useless slices make the optimizer elide the bounds checks. + // See the comment in clone_from_slice() added on Rust commit 6a7bc47. + let len = a.len(); + let a = &a[..len]; + let b = &b[..len]; + + let mut tmp = 0; + for i in 0..len { + tmp |= a[i] ^ b[i]; + } + + // The compare with 0 must happen outside this function. + optimizer_hide(tmp) +} + +/// Compares two equal-sized byte strings in constant time. +/// +/// # Examples +/// +/// ``` +/// use constant_time_eq::constant_time_eq; +/// +/// assert!(constant_time_eq(b"foo", b"foo")); +/// assert!(!constant_time_eq(b"foo", b"bar")); +/// assert!(!constant_time_eq(b"bar", b"baz")); +/// # assert!(constant_time_eq(b"", b"")); +/// +/// // Not equal-sized, so won't take constant time. +/// assert!(!constant_time_eq(b"foo", b"")); +/// assert!(!constant_time_eq(b"foo", b"quux")); +/// ``` +pub fn constant_time_eq(a: &[u8], b: &[u8]) -> bool { + a.len() == b.len() && constant_time_ne(a, b) == 0 +} + +// Fixed-size array variant. + +#[inline] +fn constant_time_ne_n(a: &[u8; N], b: &[u8; N]) -> u8 { + let mut tmp = 0; + for i in 0..N { + tmp |= a[i] ^ b[i]; + } + + // The compare with 0 must happen outside this function. + optimizer_hide(tmp) +} + +/// Compares two fixed-size byte strings in constant time. +/// +/// # Examples +/// +/// ``` +/// use constant_time_eq::constant_time_eq_n; +/// +/// assert!(constant_time_eq_n(&[3; 20], &[3; 20])); +/// assert!(!constant_time_eq_n(&[3; 20], &[7; 20])); +/// ``` +#[inline] +pub fn constant_time_eq_n(a: &[u8; N], b: &[u8; N]) -> bool { + constant_time_ne_n(a, b) == 0 +} + +// Fixed-size variants for the most common sizes. + +/// Compares two 128-bit byte strings in constant time. 
+/// +/// # Examples +/// +/// ``` +/// use constant_time_eq::constant_time_eq_16; +/// +/// assert!(constant_time_eq_16(&[3; 16], &[3; 16])); +/// assert!(!constant_time_eq_16(&[3; 16], &[7; 16])); +/// ``` +#[inline] +pub fn constant_time_eq_16(a: &[u8; 16], b: &[u8; 16]) -> bool { + constant_time_eq_n(a, b) +} + +/// Compares two 256-bit byte strings in constant time. +/// +/// # Examples +/// +/// ``` +/// use constant_time_eq::constant_time_eq_32; +/// +/// assert!(constant_time_eq_32(&[3; 32], &[3; 32])); +/// assert!(!constant_time_eq_32(&[3; 32], &[7; 32])); +/// ``` +#[inline] +pub fn constant_time_eq_32(a: &[u8; 32], b: &[u8; 32]) -> bool { + constant_time_eq_n(a, b) +} + +/// Compares two 512-bit byte strings in constant time. +/// +/// # Examples +/// +/// ``` +/// use constant_time_eq::constant_time_eq_64; +/// +/// assert!(constant_time_eq_64(&[3; 64], &[3; 64])); +/// assert!(!constant_time_eq_64(&[3; 64], &[7; 64])); +/// ``` +#[inline] +pub fn constant_time_eq_64(a: &[u8; 64], b: &[u8; 64]) -> bool { + constant_time_eq_n(a, b) +} diff --git a/narcissus-core/src/blake3/guts.rs b/narcissus-core/src/blake3/guts.rs new file mode 100644 index 0000000..939287a --- /dev/null +++ b/narcissus-core/src/blake3/guts.rs @@ -0,0 +1,103 @@ +//! This undocumented and unstable module is for use cases like the `bao` crate, +//! which need to traverse the BLAKE3 Merkle tree and work with chunk and parent +//! chaining values directly. There might be breaking changes to this module +//! between patch versions. +//! +//! We could stabilize something like this module in the future. If you have a +//! use case for it, please let us know by filing a GitHub issue. + +pub const BLOCK_LEN: usize = 64; +pub const CHUNK_LEN: usize = 1024; + +#[derive(Clone, Debug)] +pub struct ChunkState(super::ChunkState); + +impl ChunkState { + // Currently this type only supports the regular hash mode. If an + // incremental user needs keyed_hash or derive_key, we can add that. + pub fn new(chunk_counter: u64) -> Self { + Self(super::ChunkState::new( + super::IV, + chunk_counter, + 0, + super::platform::Platform::detect(), + )) + } + + #[inline] + pub fn len(&self) -> usize { + self.0.len() + } + + #[inline] + pub fn update(&mut self, input: &[u8]) -> &mut Self { + self.0.update(input); + self + } + + pub fn finalize(&self, is_root: bool) -> super::Hash { + let output = self.0.output(); + if is_root { + output.root_hash() + } else { + output.chaining_value().into() + } + } +} + +// As above, this currently assumes the regular hash mode. If an incremental +// user needs keyed_hash or derive_key, we can add that. 
+pub fn parent_cv( + left_child: &super::Hash, + right_child: &super::Hash, + is_root: bool, +) -> super::Hash { + let output = super::parent_node_output( + left_child.as_bytes(), + right_child.as_bytes(), + super::IV, + 0, + super::platform::Platform::detect(), + ); + if is_root { + output.root_hash() + } else { + output.chaining_value().into() + } +} + +#[cfg(test)] +mod test { + use crate::blake3::{hash, Hasher}; + + use super::*; + + #[test] + fn test_chunk() { + assert_eq!( + hash(b"foo"), + ChunkState::new(0).update(b"foo").finalize(true) + ); + } + + #[test] + fn test_parents() { + let mut hasher = Hasher::new(); + let mut buf = [0; super::CHUNK_LEN]; + + buf[0] = 'a' as u8; + hasher.update(&buf); + let chunk0_cv = ChunkState::new(0).update(&buf).finalize(false); + + buf[0] = 'b' as u8; + hasher.update(&buf); + let chunk1_cv = ChunkState::new(1).update(&buf).finalize(false); + + hasher.update(b"c"); + let chunk2_cv = ChunkState::new(2).update(b"c").finalize(false); + + let parent = parent_cv(&chunk0_cv, &chunk1_cv, false); + let root = parent_cv(&parent, &chunk2_cv, true); + assert_eq!(hasher.finalize(), root); + } +} diff --git a/narcissus-core/src/blake3/join.rs b/narcissus-core/src/blake3/join.rs new file mode 100644 index 0000000..aa17159 --- /dev/null +++ b/narcissus-core/src/blake3/join.rs @@ -0,0 +1,62 @@ +//! The multi-threading abstractions used by `Hasher::update_with_join`. +//! +//! Different implementations of the `Join` trait determine whether +//! `Hasher::update_with_join` performs multi-threading on sufficiently large +//! inputs. The `SerialJoin` implementation is single-threaded, and the +//! `RayonJoin` implementation (gated by the `rayon` feature) is multi-threaded. +//! Interfaces other than `Hasher::update_with_join`, like [`hash`](crate::hash) +//! and [`Hasher::update`](crate::Hasher::update), always use `SerialJoin` +//! internally. +//! +//! The `Join` trait is an almost exact copy of the [`rayon::join`] API, and +//! `RayonJoin` is the only non-trivial implementation. Previously this trait +//! was public, but currently it's been re-privatized, as it's both 1) of no +//! value to most callers and 2) a pretty big implementation detail to commit +//! to. +//! +//! [`rayon::join`]: https://docs.rs/rayon/1.3.0/rayon/fn.join.html + +/// The trait that abstracts over single-threaded and multi-threaded recursion. +/// +/// See the [`join` module docs](index.html) for more details. +pub trait Join { + fn join(oper_a: A, oper_b: B) -> (RA, RB) + where + A: FnOnce() -> RA + Send, + B: FnOnce() -> RB + Send, + RA: Send, + RB: Send; +} + +/// The trivial, serial implementation of `Join`. The left and right sides are +/// executed one after the other, on the calling thread. The standalone hashing +/// functions and the `Hasher::update` method use this implementation +/// internally. +/// +/// See the [`join` module docs](index.html) for more details. +pub enum SerialJoin {} + +impl Join for SerialJoin { + #[inline] + fn join(oper_a: A, oper_b: B) -> (RA, RB) + where + A: FnOnce() -> RA + Send, + B: FnOnce() -> RB + Send, + RA: Send, + RB: Send, + { + (oper_a(), oper_b()) + } +} + +#[cfg(test)] +mod test { + use super::*; + + #[test] + fn test_serial_join() { + let oper_a = || 1 + 1; + let oper_b = || 2 + 2; + assert_eq!((2, 4), SerialJoin::join(oper_a, oper_b)); + } +} diff --git a/narcissus-core/src/blake3/mod.rs b/narcissus-core/src/blake3/mod.rs new file mode 100644 index 0000000..6971b00 --- /dev/null +++ b/narcissus-core/src/blake3/mod.rs @@ -0,0 +1,1225 @@ +//! 
The official Rust implementation of the [BLAKE3] cryptographic hash +//! function. +//! +//! # Examples +//! +//! ``` +//! # fn main() -> Result<(), Box> { +//! // Hash an input all at once. +//! let hash1 = blake3::hash(b"foobarbaz"); +//! +//! // Hash an input incrementally. +//! let mut hasher = blake3::Hasher::new(); +//! hasher.update(b"foo"); +//! hasher.update(b"bar"); +//! hasher.update(b"baz"); +//! let hash2 = hasher.finalize(); +//! assert_eq!(hash1, hash2); +//! +//! // Print a hash as hex. +//! println!("{}", hash1); +//! # Ok(()) +//! # } +//! ``` +//! +//! [BLAKE3]: https://blake3.io +//! [docs.rs]: https://docs.rs/ +//! [`Write`]: https://doc.rust-lang.org/std/io/trait.Write.html +//! [`Seek`]: https://doc.rust-lang.org/std/io/trait.Seek.html + +#[cfg(test)] +pub(crate) mod test; + +#[cfg(test)] +pub(crate) mod reference_impl; + +// The guts module is for incremental use cases like the `bao` crate that need +// to explicitly compute chunk and parent chaining values. It is semi-stable +// and likely to keep working, but largely undocumented and not intended for +// widespread use. +#[doc(hidden)] +pub mod guts; + +/// Undocumented and unstable, for benchmarks only. +#[doc(hidden)] +pub mod platform; + +mod portable; + +// Platform-specific implementations of the compression function. These +// BLAKE3-specific cfg flags are set in build.rs. +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[path = "rust_avx2.rs"] +mod avx2; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[path = "rust_sse2.rs"] +mod sse2; +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[path = "rust_sse41.rs"] +mod sse41; + +mod join; + +use core::cmp; +use core::fmt; +use platform::{Platform, MAX_SIMD_DEGREE, MAX_SIMD_DEGREE_OR_2}; + +/// The number of bytes in a [`Hash`](struct.Hash.html), 32. +pub const OUT_LEN: usize = 32; + +/// The number of bytes in a key, 32. +pub const KEY_LEN: usize = 32; + +const MAX_DEPTH: usize = 54; // 2^54 * CHUNK_LEN = 2^64 +use guts::{BLOCK_LEN, CHUNK_LEN}; + +pub mod constant_time_eq; + +use crate::slice::array_chunks; +use crate::slice::split_array_ref; +use crate::FixedVec; + +// While iterating the compression function within a chunk, the CV is +// represented as words, to avoid doing two extra endianness conversions for +// each compression in the portable implementation. But the hash_many interface +// needs to hash both input bytes and parent nodes, so its better for its +// output CVs to be represented as bytes. +type CVWords = [u32; 8]; +type CVBytes = [u8; 32]; // little-endian + +const IV: &CVWords = &[ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, +]; + +const MSG_SCHEDULE: [[usize; 16]; 7] = [ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], + [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8], + [3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1], + [10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6], + [12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4], + [9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7], + [11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13], +]; + +// These are the internal flags that we use to domain separate root/non-root, +// chunk/parent, and chunk beginning/middle/end. These get set at the high end +// of the block flags word in the compression function, so their values start +// high and go down. 
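An aside on the MSG_SCHEDULE table above: if I am reading the BLAKE3 spec correctly, each row is the previous row pushed through a single fixed word permutation, so the whole table can be regenerated from one 16-entry array. A standalone sketch of that check (illustrative only):

```rust
// The fixed BLAKE3 message-word permutation (row 1 of MSG_SCHEDULE).
const SIGMA: [usize; 16] = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8];

fn main() {
    let mut schedule = [[0usize; 16]; 7];
    // Round 0 reads the message words in order; each later round applies the
    // permutation once more to the previous round's indices.
    schedule[0] = core::array::from_fn(|i| i);
    for r in 1..7 {
        for i in 0..16 {
            schedule[r][i] = SIGMA[schedule[r - 1][i]];
        }
    }
    assert_eq!(schedule[1], SIGMA);
    assert_eq!(schedule[2], [3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1]);
}
```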
+const CHUNK_START: u8 = 1 << 0; +const CHUNK_END: u8 = 1 << 1; +const PARENT: u8 = 1 << 2; +const ROOT: u8 = 1 << 3; +const KEYED_HASH: u8 = 1 << 4; +const DERIVE_KEY_CONTEXT: u8 = 1 << 5; +const DERIVE_KEY_MATERIAL: u8 = 1 << 6; + +#[inline] +fn counter_low(counter: u64) -> u32 { + counter as u32 +} + +#[inline] +fn counter_high(counter: u64) -> u32 { + (counter >> 32) as u32 +} + +/// An output of the default size, 32 bytes, which provides constant-time +/// equality checking. +/// +/// `Hash` implements [`From`] and [`Into`] for `[u8; 32]`, and it provides an +/// explicit [`as_bytes`] method returning `&[u8; 32]`. However, byte arrays +/// and slices don't provide constant-time equality checking, which is often a +/// security requirement in software that handles private data. `Hash` doesn't +/// implement [`Deref`] or [`AsRef`], to avoid situations where a type +/// conversion happens implicitly and the constant-time property is +/// accidentally lost. +/// +/// [`From`]: https://doc.rust-lang.org/std/convert/trait.From.html +/// [`Into`]: https://doc.rust-lang.org/std/convert/trait.Into.html +/// [`as_bytes`]: #method.as_bytes +/// [`Deref`]: https://doc.rust-lang.org/stable/std/ops/trait.Deref.html +/// [`AsRef`]: https://doc.rust-lang.org/std/convert/trait.AsRef.html +#[derive(Clone, Copy)] +pub struct Hash([u8; OUT_LEN]); + +impl Hash { + /// The raw bytes of the `Hash`. Note that byte arrays don't provide + /// constant-time equality checking, so if you need to compare hashes, + /// prefer the `Hash` type. + #[inline] + pub fn as_bytes(&self) -> &[u8; OUT_LEN] { + &self.0 + } +} + +impl From<[u8; OUT_LEN]> for Hash { + #[inline] + fn from(bytes: [u8; OUT_LEN]) -> Self { + Self(bytes) + } +} + +impl From for [u8; OUT_LEN] { + #[inline] + fn from(hash: Hash) -> Self { + hash.0 + } +} + +/// This implementation is constant-time. +impl PartialEq for Hash { + #[inline] + fn eq(&self, other: &Hash) -> bool { + constant_time_eq::constant_time_eq_32(&self.0, &other.0) + } +} + +impl std::hash::Hash for Hash { + fn hash(&self, state: &mut H) { + self.0.hash(state); + } +} + +/// This implementation is constant-time. +impl PartialEq<[u8; OUT_LEN]> for Hash { + #[inline] + fn eq(&self, other: &[u8; OUT_LEN]) -> bool { + constant_time_eq::constant_time_eq_32(&self.0, other) + } +} + +/// This implementation is constant-time if the target is 32 bytes long. +impl PartialEq<[u8]> for Hash { + #[inline] + fn eq(&self, other: &[u8]) -> bool { + constant_time_eq::constant_time_eq(&self.0, other) + } +} + +impl Eq for Hash {} + +impl fmt::Debug for Hash { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_tuple("Hash").field(&self.0).finish() + } +} + +// Each chunk or parent node can produce either a 32-byte chaining value or, by +// setting the ROOT flag, any number of final output bytes. The Output struct +// captures the state just prior to choosing between those two possibilities. 
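The constant-time `PartialEq` impls above are what make `Hash` safe to use directly as a MAC tag. A minimal sketch of that intended usage, assuming in-crate code can reach the module as `crate::blake3` (`keyed_hash` is defined further down in this file):

```rust
use crate::blake3;

/// Verify a 32-byte tag against a freshly computed keyed hash. Because
/// `Hash == [u8; 32]` routes through constant_time_eq_32, the comparison time
/// does not depend on how many leading bytes of the tag are correct.
fn verify_tag(key: &[u8; 32], message: &[u8], received_tag: &[u8; 32]) -> bool {
    blake3::keyed_hash(key, message) == *received_tag
}
```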
+#[derive(Clone)] +struct Output { + input_chaining_value: CVWords, + block: [u8; 64], + block_len: u8, + counter: u64, + flags: u8, + platform: Platform, +} + +impl Output { + fn chaining_value(&self) -> CVBytes { + let mut cv = self.input_chaining_value; + self.platform.compress_in_place( + &mut cv, + &self.block, + self.block_len, + self.counter, + self.flags, + ); + platform::le_bytes_from_words_32(&cv) + } + + fn root_hash(&self) -> Hash { + debug_assert_eq!(self.counter, 0); + let mut cv = self.input_chaining_value; + self.platform + .compress_in_place(&mut cv, &self.block, self.block_len, 0, self.flags | ROOT); + Hash(platform::le_bytes_from_words_32(&cv)) + } + + fn root_output_block(&self) -> [u8; 2 * OUT_LEN] { + self.platform.compress_xof( + &self.input_chaining_value, + &self.block, + self.block_len, + self.counter, + self.flags | ROOT, + ) + } +} + +#[derive(Clone)] +struct ChunkState { + cv: CVWords, + chunk_counter: u64, + buf: [u8; BLOCK_LEN], + buf_len: u8, + blocks_compressed: u8, + flags: u8, + platform: Platform, +} + +impl ChunkState { + fn new(key: &CVWords, chunk_counter: u64, flags: u8, platform: Platform) -> Self { + Self { + cv: *key, + chunk_counter, + buf: [0; BLOCK_LEN], + buf_len: 0, + blocks_compressed: 0, + flags, + platform, + } + } + + fn len(&self) -> usize { + BLOCK_LEN * self.blocks_compressed as usize + self.buf_len as usize + } + + fn fill_buf(&mut self, input: &mut &[u8]) { + let want = BLOCK_LEN - self.buf_len as usize; + let take = cmp::min(want, input.len()); + self.buf[self.buf_len as usize..][..take].copy_from_slice(&input[..take]); + self.buf_len += take as u8; + *input = &input[take..]; + } + + fn start_flag(&self) -> u8 { + if self.blocks_compressed == 0 { + CHUNK_START + } else { + 0 + } + } + + // Try to avoid buffering as much as possible, by compressing directly from + // the input slice when full blocks are available. + fn update(&mut self, mut input: &[u8]) -> &mut Self { + if self.buf_len > 0 { + self.fill_buf(&mut input); + if !input.is_empty() { + debug_assert_eq!(self.buf_len as usize, BLOCK_LEN); + let block_flags = self.flags | self.start_flag(); // borrowck + self.platform.compress_in_place( + &mut self.cv, + &self.buf, + BLOCK_LEN as u8, + self.chunk_counter, + block_flags, + ); + self.buf_len = 0; + self.buf = [0; BLOCK_LEN]; + self.blocks_compressed += 1; + } + } + + while input.len() > BLOCK_LEN { + debug_assert_eq!(self.buf_len, 0); + let block_flags = self.flags | self.start_flag(); // borrowck + let (block, rem) = split_array_ref(input); + self.platform.compress_in_place( + &mut self.cv, + block, + BLOCK_LEN as u8, + self.chunk_counter, + block_flags, + ); + self.blocks_compressed += 1; + input = rem; + } + + self.fill_buf(&mut input); + debug_assert!(input.is_empty()); + debug_assert!(self.len() <= CHUNK_LEN); + self + } + + fn output(&self) -> Output { + let block_flags = self.flags | self.start_flag() | CHUNK_END; + Output { + input_chaining_value: self.cv, + block: self.buf, + block_len: self.buf_len, + counter: self.chunk_counter, + flags: block_flags, + platform: self.platform, + } + } +} + +// Don't derive(Debug), because the state may be secret. 
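One detail of `ChunkState::update` above that is easy to miss: the compression loop runs only while strictly more than one block of input remains, so the last block of every chunk stays in `buf` until `output()` can tag it with `CHUNK_END`. A rough standalone model of that behavior, assuming a fresh chunk fed in a single call:

```rust
const BLOCK_LEN: usize = 64;

// How many 64-byte blocks update() compresses eagerly for a chunk of the
// given length; the final block is always held back in the buffer.
fn eagerly_compressed_blocks(chunk_len: usize) -> usize {
    assert!((1..=1024).contains(&chunk_len));
    (chunk_len - 1) / BLOCK_LEN
}

fn main() {
    assert_eq!(eagerly_compressed_blocks(64), 0);    // a single full block is buffered, not compressed
    assert_eq!(eagerly_compressed_blocks(65), 1);    // one block compressed, one byte buffered
    assert_eq!(eagerly_compressed_blocks(1024), 15); // a full chunk: 15 compressed, last one buffered
}
```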
+impl fmt::Debug for ChunkState { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("ChunkState") + .field("len", &self.len()) + .field("chunk_counter", &self.chunk_counter) + .field("flags", &self.flags) + .field("platform", &self.platform) + .finish() + } +} + +// IMPLEMENTATION NOTE +// =================== +// The recursive function compress_subtree_wide(), implemented below, is the +// basis of high-performance BLAKE3. We use it both for all-at-once hashing, +// and for the incremental input with Hasher (though we have to be careful with +// subtree boundaries in the incremental case). compress_subtree_wide() applies +// several optimizations at the same time: +// - Multithreading with Rayon. +// - Parallel chunk hashing with SIMD. +// - Parallel parent hashing with SIMD. Note that while SIMD chunk hashing +// maxes out at MAX_SIMD_DEGREE*CHUNK_LEN, parallel parent hashing continues +// to benefit from larger inputs, because more levels of the tree benefit can +// use full-width SIMD vectors for parent hashing. Without parallel parent +// hashing, we lose about 10% of overall throughput on AVX2 and AVX-512. + +/// Undocumented and unstable, for benchmarks only. +#[doc(hidden)] +#[derive(Clone, Copy)] +pub enum IncrementCounter { + Yes, + No, +} + +impl IncrementCounter { + #[inline] + fn yes(&self) -> bool { + match self { + IncrementCounter::Yes => true, + IncrementCounter::No => false, + } + } +} + +// The largest power of two less than or equal to `n`, used for left_len() +// immediately below, and also directly in Hasher::update(). +fn largest_power_of_two_leq(n: usize) -> usize { + ((n / 2) + 1).next_power_of_two() +} + +// Given some input larger than one chunk, return the number of bytes that +// should go in the left subtree. This is the largest power-of-2 number of +// chunks that leaves at least 1 byte for the right subtree. +fn left_len(content_len: usize) -> usize { + debug_assert!(content_len > CHUNK_LEN); + // Subtract 1 to reserve at least one byte for the right side. + let full_chunks = (content_len - 1) / CHUNK_LEN; + largest_power_of_two_leq(full_chunks) * CHUNK_LEN +} + +// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time +// on a single thread. Write out the chunk chaining values and return the +// number of chunks hashed. These chunks are never the root and never empty; +// those cases use a different codepath. +fn compress_chunks_parallel( + input: &[u8], + key: &CVWords, + chunk_counter: u64, + flags: u8, + platform: Platform, + out: &mut [u8], +) -> usize { + debug_assert!(!input.is_empty(), "empty chunks below the root"); + debug_assert!(input.len() <= MAX_SIMD_DEGREE * CHUNK_LEN); + + let mut chunks_exact = array_chunks(input); + let mut chunks_array = FixedVec::<&[u8; CHUNK_LEN], MAX_SIMD_DEGREE>::new(); + for chunk in &mut chunks_exact { + chunks_array.push(chunk); + } + platform.hash_many( + &chunks_array, + key, + chunk_counter, + IncrementCounter::Yes, + flags, + CHUNK_START, + CHUNK_END, + out, + ); + + // Hash the remaining partial chunk, if there is one. Note that the empty + // chunk (meaning the empty message) is a different codepath. 
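The `left_len` rule above is the heart of the tree layout: the left subtree always takes the largest power-of-two number of whole chunks that still leaves at least one byte on the right. A standalone worked example of that arithmetic (the helpers are copied here so the numbers can be checked in isolation):

```rust
const CHUNK_LEN: usize = 1024;

fn largest_power_of_two_leq(n: usize) -> usize {
    ((n / 2) + 1).next_power_of_two()
}

fn left_len(content_len: usize) -> usize {
    let full_chunks = (content_len - 1) / CHUNK_LEN;
    largest_power_of_two_leq(full_chunks) * CHUNK_LEN
}

fn main() {
    assert_eq!(left_len(CHUNK_LEN + 1), CHUNK_LEN);           // 1 chunk + 1 byte -> 1 chunk | 1 byte
    assert_eq!(left_len(3 * CHUNK_LEN + 512), 2 * CHUNK_LEN); // 3.5 chunks -> 2 chunks | 1.5 chunks
    assert_eq!(left_len(8 * CHUNK_LEN), 4 * CHUNK_LEN);       // 8 whole chunks -> 4 | 4
}
```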
+ let chunks_so_far = chunks_array.len(); + if !chunks_exact.remainder().is_empty() { + let counter = chunk_counter + chunks_so_far as u64; + let mut chunk_state = ChunkState::new(key, counter, flags, platform); + chunk_state.update(chunks_exact.remainder()); + out[chunks_so_far * OUT_LEN..chunks_so_far * OUT_LEN + OUT_LEN] + .copy_from_slice(&chunk_state.output().chaining_value()); + chunks_so_far + 1 + } else { + chunks_so_far + } +} + +// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time +// on a single thread. Write out the parent chaining values and return the +// number of parents hashed. (If there's an odd input chaining value left over, +// return it as an additional output.) These parents are never the root and +// never empty; those cases use a different codepath. +fn compress_parents_parallel( + child_chaining_values: &[u8], + key: &CVWords, + flags: u8, + platform: Platform, + out: &mut [u8], +) -> usize { + debug_assert_eq!(child_chaining_values.len() % OUT_LEN, 0, "wacky hash bytes"); + let num_children = child_chaining_values.len() / OUT_LEN; + debug_assert!(num_children >= 2, "not enough children"); + debug_assert!(num_children <= 2 * MAX_SIMD_DEGREE_OR_2, "too many"); + + let mut parents_exact = array_chunks(child_chaining_values); + // Use MAX_SIMD_DEGREE_OR_2 rather than MAX_SIMD_DEGREE here, because of + // the requirements of compress_subtree_wide(). + let mut parents_array = FixedVec::<&[u8; BLOCK_LEN], MAX_SIMD_DEGREE_OR_2>::new(); + for parent in &mut parents_exact { + parents_array.push(parent); + } + platform.hash_many( + &parents_array, + key, + 0, // Parents always use counter 0. + IncrementCounter::No, + flags | PARENT, + 0, // Parents have no start flags. + 0, // Parents have no end flags. + out, + ); + + // If there's an odd child left over, it becomes an output. + let parents_so_far = parents_array.len(); + if !parents_exact.remainder().is_empty() { + out[parents_so_far * OUT_LEN..][..OUT_LEN].copy_from_slice(parents_exact.remainder()); + parents_so_far + 1 + } else { + parents_so_far + } +} + +// The wide helper function returns (writes out) an array of chaining values +// and returns the length of that array. The number of chaining values returned +// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, +// if the input is shorter than that many chunks. The reason for maintaining a +// wide array of chaining values going back up the tree, is to allow the +// implementation to hash as many parents in parallel as possible. +// +// As a special case when the SIMD degree is 1, this function will still return +// at least 2 outputs. This guarantees that this function doesn't perform the +// root compression. (If it did, it would use the wrong flags, and also we +// wouldn't be able to implement exendable output.) Note that this function is +// not used when the whole input is only 1 chunk long; that's a different +// codepath. +// +// Why not just have the caller split the input on the first update(), instead +// of implementing this special rule? Because we don't want to limit SIMD or +// multithreading parallelism for that update(). +fn compress_subtree_wide( + input: &[u8], + key: &CVWords, + chunk_counter: u64, + flags: u8, + platform: Platform, + out: &mut [u8], +) -> usize { + // Note that the single chunk case does *not* bump the SIMD degree up to 2 + // when it is 1. This allows Rayon the option of multithreading even the + // 2-chunk case, which can help performance on smaller platforms. 
+ if input.len() <= platform.simd_degree() * CHUNK_LEN { + return compress_chunks_parallel(input, key, chunk_counter, flags, platform, out); + } + + // With more than simd_degree chunks, we need to recurse. Start by dividing + // the input into left and right subtrees. (Note that this is only optimal + // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree + // of 3 or something, we'll need a more complicated strategy.) + debug_assert_eq!(platform.simd_degree().count_ones(), 1, "power of 2"); + let (left, right) = input.split_at(left_len(input.len())); + let right_chunk_counter = chunk_counter + (left.len() / CHUNK_LEN) as u64; + + // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to + // account for the special case of returning 2 outputs when the SIMD degree + // is 1. + let mut cv_array = [0; 2 * MAX_SIMD_DEGREE_OR_2 * OUT_LEN]; + let degree = if left.len() == CHUNK_LEN { + // The "simd_degree=1 and we're at the leaf nodes" case. + debug_assert_eq!(platform.simd_degree(), 1); + 1 + } else { + cmp::max(platform.simd_degree(), 2) + }; + let (left_out, right_out) = cv_array.split_at_mut(degree * OUT_LEN); + + let (left_n, right_n) = J::join( + || compress_subtree_wide::(left, key, chunk_counter, flags, platform, left_out), + || compress_subtree_wide::(right, key, right_chunk_counter, flags, platform, right_out), + ); + + // The special case again. If simd_degree=1, then we'll have left_n=1 and + // right_n=1. Rather than compressing them into a single output, return + // them directly, to make sure we always have at least two outputs. + debug_assert_eq!(left_n, degree); + debug_assert!(right_n >= 1 && right_n <= left_n); + if left_n == 1 { + out[..2 * OUT_LEN].copy_from_slice(&cv_array[..2 * OUT_LEN]); + return 2; + } + + // Otherwise, do one layer of parent node compression. + let num_children = left_n + right_n; + compress_parents_parallel( + &cv_array[..num_children * OUT_LEN], + key, + flags, + platform, + out, + ) +} + +// Hash a subtree with compress_subtree_wide(), and then condense the resulting +// list of chaining values down to a single parent node. Don't compress that +// last parent node, however. Instead, return its message bytes (the +// concatenated chaining values of its children). This is necessary when the +// first call to update() supplies a complete subtree, because the topmost +// parent node of that subtree could end up being the root. It's also necessary +// for extended output in the general case. +// +// As with compress_subtree_wide(), this function is not used on inputs of 1 +// chunk or less. That's a different codepath. +fn compress_subtree_to_parent_node( + input: &[u8], + key: &CVWords, + chunk_counter: u64, + flags: u8, + platform: Platform, +) -> [u8; BLOCK_LEN] { + debug_assert!(input.len() > CHUNK_LEN); + let mut cv_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN]; + let mut num_cvs = + compress_subtree_wide::(input, key, chunk_counter, flags, platform, &mut cv_array); + debug_assert!(num_cvs >= 2); + + // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, + // compress_subtree_wide() returns more than 2 chaining values. Condense + // them into 2 by forming parent nodes repeatedly. 
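To make the condensing loop that follows concrete: each pass of `compress_parents_parallel` turns N chaining values into N/2 parents plus a possible odd leftover, so the CV count reaches 2 after a handful of passes. A small illustrative model:

```rust
// One layer of parent compression: pairs become parent CVs, an odd leftover
// child is passed through unchanged.
fn cvs_after_one_layer(num_cvs: usize) -> usize {
    num_cvs / 2 + num_cvs % 2
}

fn condense_layers(mut num_cvs: usize) -> usize {
    let mut layers = 0;
    while num_cvs > 2 {
        num_cvs = cvs_after_one_layer(num_cvs);
        layers += 1;
    }
    layers
}

fn main() {
    assert_eq!(condense_layers(8), 2);  // 8 -> 4 -> 2
    assert_eq!(condense_layers(16), 3); // 16 -> 8 -> 4 -> 2
    assert_eq!(condense_layers(2), 0);  // already a single parent node's worth of CVs
}
```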
+ let mut out_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN / 2]; + while num_cvs > 2 { + let cv_slice = &cv_array[..num_cvs * OUT_LEN]; + num_cvs = compress_parents_parallel(cv_slice, key, flags, platform, &mut out_array); + cv_array[..num_cvs * OUT_LEN].copy_from_slice(&out_array[..num_cvs * OUT_LEN]); + } + cv_array[0..2 * OUT_LEN].try_into().unwrap() +} + +// Hash a complete input all at once. Unlike compress_subtree_wide() and +// compress_subtree_to_parent_node(), this function handles the 1 chunk case. +fn hash_all_at_once(input: &[u8], key: &CVWords, flags: u8) -> Output { + let platform = Platform::detect(); + + // If the whole subtree is one chunk, hash it directly with a ChunkState. + if input.len() <= CHUNK_LEN { + return ChunkState::new(key, 0, flags, platform) + .update(input) + .output(); + } + + // Otherwise construct an Output object from the parent node returned by + // compress_subtree_to_parent_node(). + Output { + input_chaining_value: *key, + block: compress_subtree_to_parent_node::(input, key, 0, flags, platform), + block_len: BLOCK_LEN as u8, + counter: 0, + flags: flags | PARENT, + platform, + } +} + +/// The default hash function. +/// +/// For an incremental version that accepts multiple writes, see +/// [`Hasher::update`]. +/// +/// For output sizes other than 32 bytes, see [`Hasher::finalize_xof`] and +/// [`OutputReader`]. +/// +/// This function is always single-threaded. +pub fn hash(input: &[u8]) -> Hash { + hash_all_at_once::(input, IV, 0).root_hash() +} + +/// The keyed hash function. +/// +/// This is suitable for use as a message authentication code, for example to +/// replace an HMAC instance. In that use case, the constant-time equality +/// checking provided by [`Hash`](struct.Hash.html) is almost always a security +/// requirement, and callers need to be careful not to compare MACs as raw +/// bytes. +/// +/// For output sizes other than 32 bytes, see [`Hasher::new_keyed`], +/// [`Hasher::finalize_xof`], and [`OutputReader`]. +/// +/// This function is always single-threaded. For multithreading support, see +/// [`Hasher::new_keyed`] +pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash { + let key_words = platform::words_from_le_bytes_32(key); + hash_all_at_once::(input, &key_words, KEYED_HASH).root_hash() +} + +/// The key derivation function. +/// +/// Given cryptographic key material of any length and a context string of any +/// length, this function outputs a 32-byte derived subkey. **The context string +/// should be hardcoded, globally unique, and application-specific.** A good +/// default format for such strings is `"[application] [commit timestamp] +/// [purpose]"`, e.g., `"example.com 2019-12-25 16:18:03 session tokens v1"`. +/// +/// Key derivation is important when you want to use the same key in multiple +/// algorithms or use cases. Using the same key with different cryptographic +/// algorithms is generally forbidden, and deriving a separate subkey for each +/// use case protects you from bad interactions. Derived keys also mitigate the +/// damage from one part of your application accidentally leaking its key. +/// +/// As a rare exception to that general rule, however, it is possible to use +/// `derive_key` itself with key material that you are already using with +/// another algorithm. You might need to do this if you're adding features to +/// an existing application, which does not yet use key derivation internally. 
+/// However, you still must not share key material with algorithms that forbid +/// key reuse entirely, like a one-time pad. For more on this, see sections 6.2 +/// and 7.8 of the [BLAKE3 paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). +/// +/// Note that BLAKE3 is not a password hash, and **`derive_key` should never be +/// used with passwords.** Instead, use a dedicated password hash like +/// [Argon2]. Password hashes are entirely different from generic hash +/// functions, with opposite design requirements. +/// +/// For output sizes other than 32 bytes, see [`Hasher::new_derive_key`], +/// [`Hasher::finalize_xof`], and [`OutputReader`]. +/// +/// This function is always single-threaded. For multithreading support, see +/// [`Hasher::new_derive_key`] +/// +/// [Argon2]: https://en.wikipedia.org/wiki/Argon2 +pub fn derive_key(context: &str, key_material: &[u8]) -> [u8; OUT_LEN] { + let context_key = + hash_all_at_once::(context.as_bytes(), IV, DERIVE_KEY_CONTEXT) + .root_hash(); + let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes()); + hash_all_at_once::(key_material, &context_key_words, DERIVE_KEY_MATERIAL) + .root_hash() + .0 +} + +fn parent_node_output( + left_child: &CVBytes, + right_child: &CVBytes, + key: &CVWords, + flags: u8, + platform: Platform, +) -> Output { + let mut block = [0; BLOCK_LEN]; + block[..32].copy_from_slice(left_child); + block[32..].copy_from_slice(right_child); + Output { + input_chaining_value: *key, + block, + block_len: BLOCK_LEN as u8, + counter: 0, + flags: flags | PARENT, + platform, + } +} + +/// An incremental hash state that can accept any number of writes. +/// +/// When the `traits-preview` Cargo feature is enabled, this type implements +/// several commonly used traits from the +/// [`digest`](https://crates.io/crates/digest) crate. However, those +/// traits aren't stable, and they're expected to change in incompatible ways +/// before that crate reaches 1.0. For that reason, this crate makes no SemVer +/// guarantees for this feature, and callers who use it should expect breaking +/// changes between patch versions. +/// +/// **Performance note:** The [`update`](#method.update) method can't take full +/// advantage of SIMD optimizations if its input buffer is too small or oddly +/// sized. Using a 16 KiB buffer, or any multiple of that, enables all currently +/// supported SIMD instruction sets. +/// +/// # Examples +/// +/// ``` +/// # fn main() -> Result<(), Box> { +/// // Hash an input incrementally. +/// let mut hasher = blake3::Hasher::new(); +/// hasher.update(b"foo"); +/// hasher.update(b"bar"); +/// hasher.update(b"baz"); +/// assert_eq!(hasher.finalize(), blake3::hash(b"foobarbaz")); +/// +/// # Ok(()) +/// # } +/// ``` +#[derive(Clone)] +pub struct Hasher { + key: CVWords, + chunk_state: ChunkState, + // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, + // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk + // requires a 4th entry, rather than merging everything down to 1, because + // we don't know whether more input is coming. This is different from how + // the reference implementation does things. + cv_stack: FixedVec, +} + +impl Hasher { + fn new_internal(key: &CVWords, flags: u8) -> Self { + Self { + key: *key, + chunk_state: ChunkState::new(key, 0, flags, Platform::detect()), + cv_stack: FixedVec::new(), + } + } + + /// Construct a new `Hasher` for the regular hash function. 
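A hedged sketch of `derive_key` in use, following the context-string guidance in its docs above. The module path and the context string are illustrative placeholders, not values taken from this codebase:

```rust
use crate::blake3; // assumes in-crate use; the exact path/re-export may differ

/// Derive a purpose-specific 32-byte subkey from long-lived key material.
/// The context string follows the recommended
/// "[application] [commit timestamp] [purpose]" format and must be hardcoded.
fn session_token_key(master_key_material: &[u8]) -> [u8; 32] {
    blake3::derive_key(
        "example.com 2023-02-18 narcissus session tokens v1",
        master_key_material,
    )
}
```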
+ pub fn new() -> Self { + Self::new_internal(IV, 0) + } + + /// Construct a new `Hasher` for the keyed hash function. See + /// [`keyed_hash`]. + /// + /// [`keyed_hash`]: fn.keyed_hash.html + pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self { + let key_words = platform::words_from_le_bytes_32(key); + Self::new_internal(&key_words, KEYED_HASH) + } + + /// Construct a new `Hasher` for the key derivation function. See + /// [`derive_key`]. The context string should be hardcoded, globally + /// unique, and application-specific. + /// + /// [`derive_key`]: fn.derive_key.html + pub fn new_derive_key(context: &str) -> Self { + let context_key = + hash_all_at_once::(context.as_bytes(), IV, DERIVE_KEY_CONTEXT) + .root_hash(); + let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes()); + Self::new_internal(&context_key_words, DERIVE_KEY_MATERIAL) + } + + /// Reset the `Hasher` to its initial state. + /// + /// This is functionally the same as overwriting the `Hasher` with a new + /// one, using the same key or context string if any. + pub fn reset(&mut self) -> &mut Self { + self.chunk_state = ChunkState::new( + &self.key, + 0, + self.chunk_state.flags, + self.chunk_state.platform, + ); + self.cv_stack.clear(); + self + } + + // As described in push_cv() below, we do "lazy merging", delaying merges + // until right before the next CV is about to be added. This is different + // from the reference implementation. Another difference is that we aren't + // always merging 1 chunk at a time. Instead, each CV might represent any + // power-of-two number of chunks, as long as the smaller-above-larger stack + // order is maintained. Instead of the "count the trailing 0-bits" + // algorithm described in the spec, we use a "count the total number of + // 1-bits" variant that doesn't require us to retain the subtree size of + // the CV on top of the stack. The principle is the same: each CV that + // should remain in the stack is represented by a 1-bit in the total number + // of chunks (or bytes) so far. + fn merge_cv_stack(&mut self, total_len: u64) { + let post_merge_stack_len = total_len.count_ones() as usize; + while self.cv_stack.len() > post_merge_stack_len { + let right_child = self.cv_stack.pop().unwrap(); + let left_child = self.cv_stack.pop().unwrap(); + let parent_output = parent_node_output( + &left_child, + &right_child, + &self.key, + self.chunk_state.flags, + self.chunk_state.platform, + ); + self.cv_stack.push(parent_output.chaining_value()); + } + } + + // In reference_impl.rs, we merge the new CV with existing CVs from the + // stack before pushing it. We can do that because we know more input is + // coming, so we know none of the merges are root. + // + // This setting is different. We want to feed as much input as possible to + // compress_subtree_wide(), without setting aside anything for the + // chunk_state. If the user gives us 64 KiB, we want to parallelize over + // all 64 KiB at once as a single subtree, if at all possible. + // + // This leads to two problems: + // 1) This 64 KiB input might be the only call that ever gets made to + // update. In this case, the root node of the 64 KiB subtree would be + // the root node of the whole tree, and it would need to be ROOT + // finalized. We can't compress it until we know. + // 2) This 64 KiB input might complete a larger tree, whose root node is + // similarly going to be the the root of the whole tree. For example, + // maybe we have 196 KiB (that is, 128 + 64) hashed so far. 
We can't + // compress the node at the root of the 256 KiB subtree until we know + // how to finalize it. + // + // The second problem is solved with "lazy merging". That is, when we're + // about to add a CV to the stack, we don't merge it with anything first, + // as the reference impl does. Instead we do merges using the *previous* CV + // that was added, which is sitting on top of the stack, and we put the new + // CV (unmerged) on top of the stack afterwards. This guarantees that we + // never merge the root node until finalize(). + // + // Solving the first problem requires an additional tool, + // compress_subtree_to_parent_node(). That function always returns the top + // *two* chaining values of the subtree it's compressing. We then do lazy + // merging with each of them separately, so that the second CV will always + // remain unmerged. (That also helps us support extendable output when + // we're hashing an input all-at-once.) + fn push_cv(&mut self, new_cv: &CVBytes, chunk_counter: u64) { + self.merge_cv_stack(chunk_counter); + self.cv_stack.push(*new_cv); + } + + /// Add input bytes to the hash state. You can call this any number of + /// times. + /// + /// This method is always single-threaded. + /// + /// Note that the degree of SIMD parallelism that `update` can use is + /// limited by the size of this input buffer. The 8 KiB buffer currently + /// used by [`std::io::copy`] is enough to leverage AVX2, for example, but + /// not enough to leverage AVX-512. A 16 KiB buffer is large enough to + /// leverage all currently supported SIMD instruction sets. + /// + /// [`std::io::copy`]: https://doc.rust-lang.org/std/io/fn.copy.html + pub fn update(&mut self, input: &[u8]) -> &mut Self { + self.update_with_join::(input) + } + + fn update_with_join(&mut self, mut input: &[u8]) -> &mut Self { + // If we have some partial chunk bytes in the internal chunk_state, we + // need to finish that chunk first. + if self.chunk_state.len() > 0 { + let want = CHUNK_LEN - self.chunk_state.len(); + let take = cmp::min(want, input.len()); + self.chunk_state.update(&input[..take]); + input = &input[take..]; + if !input.is_empty() { + // We've filled the current chunk, and there's more input + // coming, so we know it's not the root and we can finalize it. + // Then we'll proceed to hashing whole chunks below. + debug_assert_eq!(self.chunk_state.len(), CHUNK_LEN); + let chunk_cv = self.chunk_state.output().chaining_value(); + self.push_cv(&chunk_cv, self.chunk_state.chunk_counter); + self.chunk_state = ChunkState::new( + &self.key, + self.chunk_state.chunk_counter + 1, + self.chunk_state.flags, + self.chunk_state.platform, + ); + } else { + return self; + } + } + + // Now the chunk_state is clear, and we have more input. If there's + // more than a single chunk (so, definitely not the root chunk), hash + // the largest whole subtree we can, with the full benefits of SIMD and + // multithreading parallelism. Two restrictions: + // - The subtree has to be a power-of-2 number of chunks. Only subtrees + // along the right edge can be incomplete, and we don't know where + // the right edge is going to be until we get to finalize(). + // - The subtree must evenly divide the total number of chunks up until + // this point (if total is not 0). If the current incomplete subtree + // is only waiting for 1 more chunk, we can't hash a subtree of 4 + // chunks. We have to complete the current subtree first. 
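+ //
+ // (Worked example, assuming plenty of input: with 5 chunks hashed so far,
+ // the largest subtree we can hash next is 1 chunk, because 1 * CHUNK_LEN
+ // is the largest power-of-2 byte count that evenly divides 5 * CHUNK_LEN.
+ // That brings the total to 6 chunks, which allows a 2-chunk subtree next,
+ // and then an 8-chunk subtree once the total reaches 8.)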
+ // Because we might need to break up the input to form powers of 2, or + // to evenly divide what we already have, this part runs in a loop. + while input.len() > CHUNK_LEN { + debug_assert_eq!(self.chunk_state.len(), 0, "no partial chunk data"); + debug_assert_eq!(CHUNK_LEN.count_ones(), 1, "power of 2 chunk len"); + let mut subtree_len = largest_power_of_two_leq(input.len()); + let count_so_far = self.chunk_state.chunk_counter * CHUNK_LEN as u64; + // Shrink the subtree_len until it evenly divides the count so far. + // We know that subtree_len itself is a power of 2, so we can use a + // bitmasking trick instead of an actual remainder operation. (Note + // that if the caller consistently passes power-of-2 inputs of the + // same size, as is hopefully typical, this loop condition will + // always fail, and subtree_len will always be the full length of + // the input.) + // + // An aside: We don't have to shrink subtree_len quite this much. + // For example, if count_so_far is 1, we could pass 2 chunks to + // compress_subtree_to_parent_node. Since we'll get 2 CVs back, + // we'll still get the right answer in the end, and we might get to + // use 2-way SIMD parallelism. The problem with this optimization, + // is that it gets us stuck always hashing 2 chunks. The total + // number of chunks will remain odd, and we'll never graduate to + // higher degrees of parallelism. See + // https://github.com/BLAKE3-team/BLAKE3/issues/69. + while (subtree_len - 1) as u64 & count_so_far != 0 { + subtree_len /= 2; + } + // The shrunken subtree_len might now be 1 chunk long. If so, hash + // that one chunk by itself. Otherwise, compress the subtree into a + // pair of CVs. + let subtree_chunks = (subtree_len / CHUNK_LEN) as u64; + if subtree_len <= CHUNK_LEN { + debug_assert_eq!(subtree_len, CHUNK_LEN); + self.push_cv( + &ChunkState::new( + &self.key, + self.chunk_state.chunk_counter, + self.chunk_state.flags, + self.chunk_state.platform, + ) + .update(&input[..subtree_len]) + .output() + .chaining_value(), + self.chunk_state.chunk_counter, + ); + } else { + // This is the high-performance happy path, though getting here + // depends on the caller giving us a long enough input. + let cv_pair = compress_subtree_to_parent_node::( + &input[..subtree_len], + &self.key, + self.chunk_state.chunk_counter, + self.chunk_state.flags, + self.chunk_state.platform, + ); + let left_cv = cv_pair[0..32].try_into().unwrap(); + let right_cv = cv_pair[32..64].try_into().unwrap(); + // Push the two CVs we received into the CV stack in order. Because + // the stack merges lazily, this guarantees we aren't merging the + // root. + self.push_cv(left_cv, self.chunk_state.chunk_counter); + self.push_cv( + right_cv, + self.chunk_state.chunk_counter + (subtree_chunks / 2), + ); + } + self.chunk_state.chunk_counter += subtree_chunks; + input = &input[subtree_len..]; + } + + // What remains is 1 chunk or less. Add it to the chunk state. + debug_assert!(input.len() <= CHUNK_LEN); + if !input.is_empty() { + self.chunk_state.update(input); + // Having added some input to the chunk_state, we know what's in + // the CV stack won't become the root node, and we can do an extra + // merge. This simplifies finalize(). + self.merge_cv_stack(self.chunk_state.chunk_counter); + } + + self + } + + fn final_output(&self) -> Output { + // If the current chunk is the only chunk, that makes it the root node + // also. Convert it directly into an Output. Otherwise, we need to + // merge subtrees below. 
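+ // (For example, an input of at most CHUNK_LEN bytes never pushes anything
+ // onto the CV stack, so that single chunk is the root.)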
+ if self.cv_stack.is_empty() { + debug_assert_eq!(self.chunk_state.chunk_counter, 0); + return self.chunk_state.output(); + } + + // If there are any bytes in the ChunkState, finalize that chunk and + // merge its CV with everything in the CV stack. In that case, the work + // we did at the end of update() above guarantees that the stack + // doesn't contain any unmerged subtrees that need to be merged first. + // (This is important, because if there were two chunk hashes sitting + // on top of the stack, they would need to merge with each other, and + // merging a new chunk hash into them would be incorrect.) + // + // If there are no bytes in the ChunkState, we'll merge what's already + // in the stack. In this case it's fine if there are unmerged chunks on + // top, because we'll merge them with each other. Note that the case of + // the empty chunk is taken care of above. + let mut output: Output; + let mut num_cvs_remaining = self.cv_stack.len(); + if self.chunk_state.len() > 0 { + debug_assert_eq!( + self.cv_stack.len(), + self.chunk_state.chunk_counter.count_ones() as usize, + "cv stack does not need a merge" + ); + output = self.chunk_state.output(); + } else { + debug_assert!(self.cv_stack.len() >= 2); + output = parent_node_output( + &self.cv_stack[num_cvs_remaining - 2], + &self.cv_stack[num_cvs_remaining - 1], + &self.key, + self.chunk_state.flags, + self.chunk_state.platform, + ); + num_cvs_remaining -= 2; + } + while num_cvs_remaining > 0 { + output = parent_node_output( + &self.cv_stack[num_cvs_remaining - 1], + &output.chaining_value(), + &self.key, + self.chunk_state.flags, + self.chunk_state.platform, + ); + num_cvs_remaining -= 1; + } + output + } + + /// Finalize the hash state and return the [`Hash`](struct.Hash.html) of + /// the input. + /// + /// This method is idempotent. Calling it twice will give the same result. + /// You can also add more input and finalize again. + pub fn finalize(&self) -> Hash { + self.final_output().root_hash() + } + + /// Finalize the hash state and return an [`OutputReader`], which can + /// supply any number of output bytes. + /// + /// This method is idempotent. Calling it twice will give the same result. + /// You can also add more input and finalize again. + /// + /// [`OutputReader`]: struct.OutputReader.html + pub fn finalize_xof(&self) -> OutputReader { + OutputReader::new(self.final_output()) + } + + /// Return the total number of bytes hashed so far. + pub fn count(&self) -> u64 { + self.chunk_state.chunk_counter * CHUNK_LEN as u64 + self.chunk_state.len() as u64 + } +} + +// Don't derive(Debug), because the state may be secret. +impl fmt::Debug for Hasher { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("Hasher") + .field("flags", &self.chunk_state.flags) + .field("platform", &self.chunk_state.platform) + .finish() + } +} + +impl Default for Hasher { + #[inline] + fn default() -> Self { + Self::new() + } +} + +/// An incremental reader for extended output, returned by +/// [`Hasher::finalize_xof`](struct.Hasher.html#method.finalize_xof). +/// +/// Shorter BLAKE3 outputs are prefixes of longer ones, and explicitly requesting a short output is +/// equivalent to truncating the default-length output. Note that this is a difference between +/// BLAKE2 and BLAKE3. +/// +/// # Security notes +/// +/// Outputs shorter than the default length of 32 bytes (256 bits) provide less security. 
An N-bit
+/// BLAKE3 output is intended to provide N bits of first and second preimage resistance and N/2
+/// bits of collision resistance, for any N up to 256. Longer outputs don't provide any additional
+/// security.
+///
+/// Avoid relying on the secrecy of the output offset, that is, the number of output bytes read or
+/// the arguments to [`seek`](struct.OutputReader.html#method.seek) or
+/// [`set_position`](struct.OutputReader.html#method.set_position). [_Block-Cipher-Based Tree
+/// Hashing_ by Aldo Gunsing](https://eprint.iacr.org/2022/283) shows that an attacker who knows
+/// both the message and the key (if any) can easily determine the offset of an extended output.
+/// For comparison, AES-CTR has a similar property: if you know the key, you can decrypt a block
+/// from an unknown position in the output stream to recover its block index. Callers with strong
+/// secret keys aren't affected in practice, but secret offsets are a [design
+/// smell](https://en.wikipedia.org/wiki/Design_smell) in any case.
+#[derive(Clone)]
+pub struct OutputReader {
+ inner: Output,
+ position_within_block: u8,
+}
+
+impl OutputReader {
+ fn new(inner: Output) -> Self {
+ Self {
+ inner,
+ position_within_block: 0,
+ }
+ }
+
+ /// Fill a buffer with output bytes and advance the position of the
+ /// `OutputReader`. This is equivalent to [`Read::read`], except that it
+ /// doesn't return a `Result`. Both methods always fill the entire buffer.
+ ///
+ /// Note that `OutputReader` doesn't buffer output bytes internally, so
+ /// calling `fill` repeatedly with a short-length or odd-length slice will
+ /// end up performing the same compression multiple times. If you're
+ /// reading output in a loop, prefer a slice length that's a multiple of
+ /// 64.
+ ///
+ /// The maximum output size of BLAKE3 is 2^64 - 1 bytes. If you try
+ /// to extract more than that, for example by seeking near the end and
+ /// reading further, the behavior is unspecified.
+ ///
+ /// [`Read::read`]: #method.read
+ pub fn fill(&mut self, mut buf: &mut [u8]) {
+ while !buf.is_empty() {
+ let block: [u8; BLOCK_LEN] = self.inner.root_output_block();
+ let output_bytes = &block[self.position_within_block as usize..];
+ let take = cmp::min(buf.len(), output_bytes.len());
+ buf[..take].copy_from_slice(&output_bytes[..take]);
+ buf = &mut buf[take..];
+ self.position_within_block += take as u8;
+ if self.position_within_block == BLOCK_LEN as u8 {
+ self.inner.counter += 1;
+ self.position_within_block = 0;
+ }
+ }
+ }
+
+ /// Return the current read position in the output stream. This is
+ /// equivalent to [`Seek::stream_position`], except that it doesn't return
+ /// a `Result`. The position of a new `OutputReader` starts at 0, and each
+ /// call to [`fill`] or [`Read::read`] moves the position forward by the
+ /// number of bytes read.
+ ///
+ /// [`Seek::stream_position`]: #method.stream_position
+ /// [`fill`]: #method.fill
+ /// [`Read::read`]: #method.read
+ pub fn position(&self) -> u64 {
+ self.inner.counter * BLOCK_LEN as u64 + self.position_within_block as u64
+ }
+
+ /// Seek to a new read position in the output stream. This is equivalent to
+ /// calling [`Seek::seek`] with [`SeekFrom::Start`], except that it doesn't
+ /// return a `Result`.
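+ ///
+ /// A small sketch of the seek behavior (paths follow the upstream `blake3`
+ /// crate): reading 32 bytes, seeking back to 0, and reading again yields
+ /// the same bytes.
+ ///
+ /// ```
+ /// let mut reader = blake3::Hasher::new().finalize_xof();
+ /// let mut first = [0u8; 32];
+ /// reader.fill(&mut first);
+ /// reader.set_position(0);
+ /// let mut second = [0u8; 32];
+ /// reader.fill(&mut second);
+ /// assert_eq!(first, second);
+ /// ```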
+ /// + /// [`Seek::seek`]: #method.seek + /// [`SeekFrom::Start`]: https://doc.rust-lang.org/std/io/enum.SeekFrom.html + pub fn set_position(&mut self, position: u64) { + self.position_within_block = (position % BLOCK_LEN as u64) as u8; + self.inner.counter = position / BLOCK_LEN as u64; + } +} + +// Don't derive(Debug), because the state may be secret. +impl fmt::Debug for OutputReader { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + f.debug_struct("OutputReader") + .field("position", &self.position()) + .finish() + } +} diff --git a/narcissus-core/src/blake3/platform.rs b/narcissus-core/src/blake3/platform.rs new file mode 100644 index 0000000..d71086d --- /dev/null +++ b/narcissus-core/src/blake3/platform.rs @@ -0,0 +1,296 @@ +use crate::slice::{array_chunks, array_chunks_mut}; + +use super::{portable, CVWords, IncrementCounter, BLOCK_LEN}; + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub const MAX_SIMD_DEGREE: usize = 8; + +#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] +pub const MAX_SIMD_DEGREE: usize = 1; + +// There are some places where we want a static size that's equal to the +// MAX_SIMD_DEGREE, but also at least 2. Constant contexts aren't currently +// allowed to use cmp::max, so we have to hardcode this additional constant +// value. Get rid of this once cmp::max is a const fn. +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +pub const MAX_SIMD_DEGREE_OR_2: usize = MAX_SIMD_DEGREE; + +#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] +pub const MAX_SIMD_DEGREE_OR_2: usize = 2; + +#[derive(Clone, Copy, Debug)] +pub enum Platform { + Portable, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + SSE2, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + SSE41, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + AVX2, +} + +impl Platform { + #[allow(unreachable_code)] + pub fn detect() -> Self { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + { + if avx2_detected() { + return Platform::AVX2; + } + if sse41_detected() { + return Platform::SSE41; + } + if sse2_detected() { + return Platform::SSE2; + } + } + + Platform::Portable + } + + pub fn simd_degree(&self) -> usize { + let degree = match self { + Platform::Portable => 1, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE2 => 4, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE41 => 4, + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX2 => 8, + }; + debug_assert!(degree <= MAX_SIMD_DEGREE); + degree + } + + pub fn compress_in_place( + &self, + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, + ) { + match self { + Platform::Portable => portable::compress_in_place(cv, block, block_len, counter, flags), + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE2 => unsafe { + super::sse2::compress_in_place(cv, block, block_len, counter, flags) + }, + // Safe because detect() checked for platform support. 
+ #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE41 | Platform::AVX2 => unsafe { + super::sse41::compress_in_place(cv, block, block_len, counter, flags) + }, + } + } + + pub fn compress_xof( + &self, + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, + ) -> [u8; 64] { + match self { + Platform::Portable => portable::compress_xof(cv, block, block_len, counter, flags), + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE2 => unsafe { + super::sse2::compress_xof(cv, block, block_len, counter, flags) + }, + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE41 | Platform::AVX2 => unsafe { + super::sse41::compress_xof(cv, block, block_len, counter, flags) + }, + } + } + + // IMPLEMENTATION NOTE + // =================== + // hash_many() applies two optimizations. The critically important + // optimization is the high-performance parallel SIMD hashing mode, + // described in detail in the spec. This more than doubles throughput per + // thread. Another optimization is keeping the state vectors transposed + // from block to block within a chunk. When state vectors are transposed + // after every block, there's a small but measurable performance loss. + // Compressing chunks with a dedicated loop avoids this. + + pub fn hash_many( + &self, + inputs: &[&[u8; N]], + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], + ) { + match self { + Platform::Portable => portable::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ), + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE2 => unsafe { + super::sse2::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ) + }, + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::SSE41 => unsafe { + super::sse41::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ) + }, + // Safe because detect() checked for platform support. + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + Platform::AVX2 => unsafe { + super::avx2::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ) + }, + } + } + + // Explicit platform constructors, for benchmarks. + + pub fn portable() -> Self { + Self::Portable + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn sse2() -> Option { + if sse2_detected() { + Some(Self::SSE2) + } else { + None + } + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn sse41() -> Option { + if sse41_detected() { + Some(Self::SSE41) + } else { + None + } + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + pub fn avx2() -> Option { + if avx2_detected() { + Some(Self::AVX2) + } else { + None + } + } +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[inline(always)] +pub fn avx2_detected() -> bool { + // Static check, e.g. for building with target-cpu=native. 
+ #[cfg(target_feature = "avx2")] + { + return true; + } + + #[cfg(not(target_feature = "avx2"))] + return is_x86_feature_detected!("avx2"); +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[inline(always)] +pub fn sse41_detected() -> bool { + // Static check, e.g. for building with target-cpu=native. + #[cfg(target_feature = "sse4.1")] + { + return true; + } + + #[cfg(not(target_feature = "sse4.1"))] + return is_x86_feature_detected!("sse4.1"); +} + +#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +#[inline(always)] +#[allow(unreachable_code)] +pub fn sse2_detected() -> bool { + // Static check, e.g. for building with target-cpu=native. + #[cfg(target_feature = "sse2")] + { + return true; + } + + #[cfg(not(target_feature = "sse2"))] + return is_x86_feature_detected!("sse2"); +} + +#[inline(always)] +pub fn words_from_le_bytes_32(bytes: &[u8; 32]) -> [u32; 8] { + let mut out = [0; 8]; + for (chunk, word) in array_chunks(bytes).zip(out.iter_mut()) { + *word = u32::from_le_bytes(*chunk) + } + out +} + +#[inline(always)] +pub fn words_from_le_bytes_64(bytes: &[u8; 64]) -> [u32; 16] { + let mut out = [0; 16]; + for (chunk, word) in array_chunks(bytes).zip(out.iter_mut()) { + *word = u32::from_le_bytes(*chunk) + } + out +} + +#[inline(always)] +pub fn le_bytes_from_words_32(words: &[u32; 8]) -> [u8; 32] { + let mut out = [0; 32]; + for (word, chunk) in words.iter().zip(array_chunks_mut(&mut out)) { + *chunk = word.to_le_bytes(); + } + out +} + +#[inline(always)] +pub fn le_bytes_from_words_64(words: &[u32; 16]) -> [u8; 64] { + let mut out = [0; 64]; + for (word, chunk) in words.iter().zip(array_chunks_mut(&mut out)) { + *chunk = word.to_le_bytes(); + } + out +} diff --git a/narcissus-core/src/blake3/portable.rs b/narcissus-core/src/blake3/portable.rs new file mode 100644 index 0000000..4a762c0 --- /dev/null +++ b/narcissus-core/src/blake3/portable.rs @@ -0,0 +1,195 @@ +use crate::slice::array_chunks_mut; + +use super::{ + counter_high, counter_low, platform, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, + MSG_SCHEDULE, OUT_LEN, +}; + +#[inline(always)] +fn g(state: &mut [u32; 16], a: usize, b: usize, c: usize, d: usize, x: u32, y: u32) { + state[a] = state[a].wrapping_add(state[b]).wrapping_add(x); + state[d] = (state[d] ^ state[a]).rotate_right(16); + state[c] = state[c].wrapping_add(state[d]); + state[b] = (state[b] ^ state[c]).rotate_right(12); + state[a] = state[a].wrapping_add(state[b]).wrapping_add(y); + state[d] = (state[d] ^ state[a]).rotate_right(8); + state[c] = state[c].wrapping_add(state[d]); + state[b] = (state[b] ^ state[c]).rotate_right(7); +} + +#[inline(always)] +fn round(state: &mut [u32; 16], msg: &[u32; 16], round: usize) { + // Select the message schedule based on the round. + let schedule = MSG_SCHEDULE[round]; + + // Mix the columns. + g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); + g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); + g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); + g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); + + // Mix the diagonals. 
+ g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); + g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); + g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); + g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); +} + +#[inline(always)] +fn compress_pre( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u32; 16] { + let block_words = platform::words_from_le_bytes_64(block); + + let mut state = [ + cv[0], + cv[1], + cv[2], + cv[3], + cv[4], + cv[5], + cv[6], + cv[7], + IV[0], + IV[1], + IV[2], + IV[3], + counter_low(counter), + counter_high(counter), + block_len as u32, + flags as u32, + ]; + + round(&mut state, &block_words, 0); + round(&mut state, &block_words, 1); + round(&mut state, &block_words, 2); + round(&mut state, &block_words, 3); + round(&mut state, &block_words, 4); + round(&mut state, &block_words, 5); + round(&mut state, &block_words, 6); + + state +} + +pub fn compress_in_place( + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) { + let state = compress_pre(cv, block, block_len, counter, flags); + + cv[0] = state[0] ^ state[8]; + cv[1] = state[1] ^ state[9]; + cv[2] = state[2] ^ state[10]; + cv[3] = state[3] ^ state[11]; + cv[4] = state[4] ^ state[12]; + cv[5] = state[5] ^ state[13]; + cv[6] = state[6] ^ state[14]; + cv[7] = state[7] ^ state[15]; +} + +pub fn compress_xof( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u8; 64] { + let mut state = compress_pre(cv, block, block_len, counter, flags); + state[0] ^= state[8]; + state[1] ^= state[9]; + state[2] ^= state[10]; + state[3] ^= state[11]; + state[4] ^= state[12]; + state[5] ^= state[13]; + state[6] ^= state[14]; + state[7] ^= state[15]; + state[8] ^= cv[0]; + state[9] ^= cv[1]; + state[10] ^= cv[2]; + state[11] ^= cv[3]; + state[12] ^= cv[4]; + state[13] ^= cv[5]; + state[14] ^= cv[6]; + state[15] ^= cv[7]; + platform::le_bytes_from_words_64(&state) +} + +pub fn hash1( + input: &[u8; N], + key: &CVWords, + counter: u64, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut CVBytes, +) { + debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks"); + let mut cv = *key; + let mut block_flags = flags | flags_start; + let mut slice = &input[..]; + while slice.len() >= BLOCK_LEN { + if slice.len() == BLOCK_LEN { + block_flags |= flags_end; + } + //let (block, rem) = split_array_ref(slice); + + compress_in_place( + &mut cv, + &slice[0..BLOCK_LEN].try_into().unwrap(), + BLOCK_LEN as u8, + counter, + block_flags, + ); + block_flags = flags; + //slice = rem; + slice = &slice[BLOCK_LEN..]; + } + *out = platform::le_bytes_from_words_32(&cv); +} + +pub fn hash_many( + inputs: &[&[u8; N]], + key: &CVWords, + mut counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8], +) { + debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); + for (&input, output) in inputs.iter().zip(array_chunks_mut(out)) { + hash1(input, key, counter, flags, flags_start, flags_end, output); + if increment_counter.yes() { + counter += 1; + } + } +} + +#[cfg(test)] +pub mod test { + use super::super::test; + use super::*; + + // This is basically testing the portable implementation against itself, + // but it also checks that compress_in_place and compress_xof are + // consistent. And there are tests against the reference implementation and + // against hardcoded test vectors elsewhere. 
+ #[test] + fn test_compress() { + test::test_compress_fn(compress_in_place, compress_xof); + } + + // Ditto. + #[test] + fn test_hash_many() { + test::test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/narcissus-core/src/blake3/reference_impl.rs b/narcissus-core/src/blake3/reference_impl.rs new file mode 100644 index 0000000..83ac795 --- /dev/null +++ b/narcissus-core/src/blake3/reference_impl.rs @@ -0,0 +1,384 @@ +//! This is the reference implementation of BLAKE3. It is used for testing and +//! as a readable example of the algorithms involved. Section 5.1 of [the BLAKE3 +//! spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf) +//! discusses this implementation. You can render docs for this implementation +//! by running `cargo doc --open` in this directory. +//! +//! # Example +//! +//! ``` +//! let mut hasher = reference_impl::Hasher::new(); +//! hasher.update(b"abc"); +//! hasher.update(b"def"); +//! let mut hash = [0; 32]; +//! hasher.finalize(&mut hash); +//! let mut extended_hash = [0; 500]; +//! hasher.finalize(&mut extended_hash); +//! assert_eq!(hash, extended_hash[..32]); +//! ``` + +use core::cmp::min; +use core::convert::TryInto; + +const OUT_LEN: usize = 32; +const KEY_LEN: usize = 32; +const BLOCK_LEN: usize = 64; +const CHUNK_LEN: usize = 1024; + +const CHUNK_START: u32 = 1 << 0; +const CHUNK_END: u32 = 1 << 1; +const PARENT: u32 = 1 << 2; +const ROOT: u32 = 1 << 3; +const KEYED_HASH: u32 = 1 << 4; +const DERIVE_KEY_CONTEXT: u32 = 1 << 5; +const DERIVE_KEY_MATERIAL: u32 = 1 << 6; + +const IV: [u32; 8] = [ + 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, +]; + +const MSG_PERMUTATION: [usize; 16] = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8]; + +// The mixing function, G, which mixes either a column or a diagonal. +fn g(state: &mut [u32; 16], a: usize, b: usize, c: usize, d: usize, mx: u32, my: u32) { + state[a] = state[a].wrapping_add(state[b]).wrapping_add(mx); + state[d] = (state[d] ^ state[a]).rotate_right(16); + state[c] = state[c].wrapping_add(state[d]); + state[b] = (state[b] ^ state[c]).rotate_right(12); + state[a] = state[a].wrapping_add(state[b]).wrapping_add(my); + state[d] = (state[d] ^ state[a]).rotate_right(8); + state[c] = state[c].wrapping_add(state[d]); + state[b] = (state[b] ^ state[c]).rotate_right(7); +} + +fn round(state: &mut [u32; 16], m: &[u32; 16]) { + // Mix the columns. + g(state, 0, 4, 8, 12, m[0], m[1]); + g(state, 1, 5, 9, 13, m[2], m[3]); + g(state, 2, 6, 10, 14, m[4], m[5]); + g(state, 3, 7, 11, 15, m[6], m[7]); + // Mix the diagonals. 
+ g(state, 0, 5, 10, 15, m[8], m[9]); + g(state, 1, 6, 11, 12, m[10], m[11]); + g(state, 2, 7, 8, 13, m[12], m[13]); + g(state, 3, 4, 9, 14, m[14], m[15]); +} + +fn permute(m: &mut [u32; 16]) { + let mut permuted = [0; 16]; + for i in 0..16 { + permuted[i] = m[MSG_PERMUTATION[i]]; + } + *m = permuted; +} + +fn compress( + chaining_value: &[u32; 8], + block_words: &[u32; 16], + counter: u64, + block_len: u32, + flags: u32, +) -> [u32; 16] { + let mut state = [ + chaining_value[0], + chaining_value[1], + chaining_value[2], + chaining_value[3], + chaining_value[4], + chaining_value[5], + chaining_value[6], + chaining_value[7], + IV[0], + IV[1], + IV[2], + IV[3], + counter as u32, + (counter >> 32) as u32, + block_len, + flags, + ]; + let mut block = *block_words; + + round(&mut state, &block); // round 1 + permute(&mut block); + round(&mut state, &block); // round 2 + permute(&mut block); + round(&mut state, &block); // round 3 + permute(&mut block); + round(&mut state, &block); // round 4 + permute(&mut block); + round(&mut state, &block); // round 5 + permute(&mut block); + round(&mut state, &block); // round 6 + permute(&mut block); + round(&mut state, &block); // round 7 + + for i in 0..8 { + state[i] ^= state[i + 8]; + state[i + 8] ^= chaining_value[i]; + } + state +} + +fn first_8_words(compression_output: [u32; 16]) -> [u32; 8] { + compression_output[0..8].try_into().unwrap() +} + +fn words_from_little_endian_bytes(bytes: &[u8], words: &mut [u32]) { + debug_assert_eq!(bytes.len(), 4 * words.len()); + for (four_bytes, word) in bytes.chunks_exact(4).zip(words) { + *word = u32::from_le_bytes(four_bytes.try_into().unwrap()); + } +} + +// Each chunk or parent node can produce either an 8-word chaining value or, by +// setting the ROOT flag, any number of final output bytes. The Output struct +// captures the state just prior to choosing between those two possibilities. +struct Output { + input_chaining_value: [u32; 8], + block_words: [u32; 16], + counter: u64, + block_len: u32, + flags: u32, +} + +impl Output { + fn chaining_value(&self) -> [u32; 8] { + first_8_words(compress( + &self.input_chaining_value, + &self.block_words, + self.counter, + self.block_len, + self.flags, + )) + } + + fn root_output_bytes(&self, out_slice: &mut [u8]) { + let mut output_block_counter = 0; + for out_block in out_slice.chunks_mut(2 * OUT_LEN) { + let words = compress( + &self.input_chaining_value, + &self.block_words, + output_block_counter, + self.block_len, + self.flags | ROOT, + ); + // The output length might not be a multiple of 4. + for (word, out_word) in words.iter().zip(out_block.chunks_mut(4)) { + out_word.copy_from_slice(&word.to_le_bytes()[..out_word.len()]); + } + output_block_counter += 1; + } + } +} + +struct ChunkState { + chaining_value: [u32; 8], + chunk_counter: u64, + block: [u8; BLOCK_LEN], + block_len: u8, + blocks_compressed: u8, + flags: u32, +} + +impl ChunkState { + fn new(key_words: [u32; 8], chunk_counter: u64, flags: u32) -> Self { + Self { + chaining_value: key_words, + chunk_counter, + block: [0; BLOCK_LEN], + block_len: 0, + blocks_compressed: 0, + flags, + } + } + + fn len(&self) -> usize { + BLOCK_LEN * self.blocks_compressed as usize + self.block_len as usize + } + + fn start_flag(&self) -> u32 { + if self.blocks_compressed == 0 { + CHUNK_START + } else { + 0 + } + } + + fn update(&mut self, mut input: &[u8]) { + while !input.is_empty() { + // If the block buffer is full, compress it and clear it. More + // input is coming, so this compression is not CHUNK_END. 
+ if self.block_len as usize == BLOCK_LEN { + let mut block_words = [0; 16]; + words_from_little_endian_bytes(&self.block, &mut block_words); + self.chaining_value = first_8_words(compress( + &self.chaining_value, + &block_words, + self.chunk_counter, + BLOCK_LEN as u32, + self.flags | self.start_flag(), + )); + self.blocks_compressed += 1; + self.block = [0; BLOCK_LEN]; + self.block_len = 0; + } + + // Copy input bytes into the block buffer. + let want = BLOCK_LEN - self.block_len as usize; + let take = min(want, input.len()); + self.block[self.block_len as usize..][..take].copy_from_slice(&input[..take]); + self.block_len += take as u8; + input = &input[take..]; + } + } + + fn output(&self) -> Output { + let mut block_words = [0; 16]; + words_from_little_endian_bytes(&self.block, &mut block_words); + Output { + input_chaining_value: self.chaining_value, + block_words, + counter: self.chunk_counter, + block_len: self.block_len as u32, + flags: self.flags | self.start_flag() | CHUNK_END, + } + } +} + +fn parent_output( + left_child_cv: [u32; 8], + right_child_cv: [u32; 8], + key_words: [u32; 8], + flags: u32, +) -> Output { + let mut block_words = [0; 16]; + block_words[..8].copy_from_slice(&left_child_cv); + block_words[8..].copy_from_slice(&right_child_cv); + Output { + input_chaining_value: key_words, + block_words, + counter: 0, // Always 0 for parent nodes. + block_len: BLOCK_LEN as u32, // Always BLOCK_LEN (64) for parent nodes. + flags: PARENT | flags, + } +} + +fn parent_cv( + left_child_cv: [u32; 8], + right_child_cv: [u32; 8], + key_words: [u32; 8], + flags: u32, +) -> [u32; 8] { + parent_output(left_child_cv, right_child_cv, key_words, flags).chaining_value() +} + +/// An incremental hasher that can accept any number of writes. +pub struct Hasher { + chunk_state: ChunkState, + key_words: [u32; 8], + cv_stack: [[u32; 8]; 54], // Space for 54 subtree chaining values: + cv_stack_len: u8, // 2^54 * CHUNK_LEN = 2^64 + flags: u32, +} + +impl Hasher { + fn new_internal(key_words: [u32; 8], flags: u32) -> Self { + Self { + chunk_state: ChunkState::new(key_words, 0, flags), + key_words, + cv_stack: [[0; 8]; 54], + cv_stack_len: 0, + flags, + } + } + + /// Construct a new `Hasher` for the regular hash function. + pub fn new() -> Self { + Self::new_internal(IV, 0) + } + + /// Construct a new `Hasher` for the keyed hash function. + pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self { + let mut key_words = [0; 8]; + words_from_little_endian_bytes(key, &mut key_words); + Self::new_internal(key_words, KEYED_HASH) + } + + /// Construct a new `Hasher` for the key derivation function. The context + /// string should be hardcoded, globally unique, and application-specific. + pub fn new_derive_key(context: &str) -> Self { + let mut context_hasher = Self::new_internal(IV, DERIVE_KEY_CONTEXT); + context_hasher.update(context.as_bytes()); + let mut context_key = [0; KEY_LEN]; + context_hasher.finalize(&mut context_key); + let mut context_key_words = [0; 8]; + words_from_little_endian_bytes(&context_key, &mut context_key_words); + Self::new_internal(context_key_words, DERIVE_KEY_MATERIAL) + } + + fn push_stack(&mut self, cv: [u32; 8]) { + self.cv_stack[self.cv_stack_len as usize] = cv; + self.cv_stack_len += 1; + } + + fn pop_stack(&mut self) -> [u32; 8] { + self.cv_stack_len -= 1; + self.cv_stack[self.cv_stack_len as usize] + } + + // Section 5.1.2 of the BLAKE3 spec explains this algorithm in more detail. 
+ fn add_chunk_chaining_value(&mut self, mut new_cv: [u32; 8], mut total_chunks: u64) { + // This chunk might complete some subtrees. For each completed subtree, + // its left child will be the current top entry in the CV stack, and + // its right child will be the current value of `new_cv`. Pop each left + // child off the stack, merge it with `new_cv`, and overwrite `new_cv` + // with the result. After all these merges, push the final value of + // `new_cv` onto the stack. The number of completed subtrees is given + // by the number of trailing 0-bits in the new total number of chunks. + while total_chunks & 1 == 0 { + new_cv = parent_cv(self.pop_stack(), new_cv, self.key_words, self.flags); + total_chunks >>= 1; + } + self.push_stack(new_cv); + } + + /// Add input to the hash state. This can be called any number of times. + pub fn update(&mut self, mut input: &[u8]) { + while !input.is_empty() { + // If the current chunk is complete, finalize it and reset the + // chunk state. More input is coming, so this chunk is not ROOT. + if self.chunk_state.len() == CHUNK_LEN { + let chunk_cv = self.chunk_state.output().chaining_value(); + let total_chunks = self.chunk_state.chunk_counter + 1; + self.add_chunk_chaining_value(chunk_cv, total_chunks); + self.chunk_state = ChunkState::new(self.key_words, total_chunks, self.flags); + } + + // Compress input bytes into the current chunk state. + let want = CHUNK_LEN - self.chunk_state.len(); + let take = min(want, input.len()); + self.chunk_state.update(&input[..take]); + input = &input[take..]; + } + } + + /// Finalize the hash and write any number of output bytes. + pub fn finalize(&self, out_slice: &mut [u8]) { + // Starting with the Output from the current chunk, compute all the + // parent chaining values along the right edge of the tree, until we + // have the root Output. + let mut output = self.chunk_state.output(); + let mut parent_nodes_remaining = self.cv_stack_len as usize; + while parent_nodes_remaining > 0 { + parent_nodes_remaining -= 1; + output = parent_output( + self.cv_stack[parent_nodes_remaining], + output.chaining_value(), + self.key_words, + self.flags, + ); + } + output.root_output_bytes(out_slice); + } +} diff --git a/narcissus-core/src/blake3/rust_avx2.rs b/narcissus-core/src/blake3/rust_avx2.rs new file mode 100644 index 0000000..3826b37 --- /dev/null +++ b/narcissus-core/src/blake3/rust_avx2.rs @@ -0,0 +1,477 @@ +#[cfg(target_arch = "x86")] +use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; + +use crate::slice::{array_chunks_mut, split_array_mut}; + +use super::{ + counter_high, counter_low, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN, +}; + +pub const DEGREE: usize = 8; + +#[inline(always)] +unsafe fn loadu(src: *const u8) -> __m256i { + // This is an unaligned load, so the pointer cast is allowed. + _mm256_loadu_si256(src as *const __m256i) +} + +#[inline(always)] +unsafe fn storeu(src: __m256i, dest: *mut u8) { + // This is an unaligned store, so the pointer cast is allowed. 
+ _mm256_storeu_si256(dest as *mut __m256i, src) +} + +#[inline(always)] +unsafe fn add(a: __m256i, b: __m256i) -> __m256i { + _mm256_add_epi32(a, b) +} + +#[inline(always)] +unsafe fn xor(a: __m256i, b: __m256i) -> __m256i { + _mm256_xor_si256(a, b) +} + +#[inline(always)] +unsafe fn set1(x: u32) -> __m256i { + _mm256_set1_epi32(x as i32) +} + +#[inline(always)] +unsafe fn set8(a: u32, b: u32, c: u32, d: u32, e: u32, f: u32, g: u32, h: u32) -> __m256i { + _mm256_setr_epi32( + a as i32, b as i32, c as i32, d as i32, e as i32, f as i32, g as i32, h as i32, + ) +} + +// These rotations are the "simple/shifts version". For the +// "complicated/shuffles version", see +// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. +// For a discussion of the tradeoffs, see +// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug +// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better +// on recent x86 chips. + +#[inline(always)] +unsafe fn rot16(x: __m256i) -> __m256i { + _mm256_or_si256(_mm256_srli_epi32(x, 16), _mm256_slli_epi32(x, 32 - 16)) +} + +#[inline(always)] +unsafe fn rot12(x: __m256i) -> __m256i { + _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12)) +} + +#[inline(always)] +unsafe fn rot8(x: __m256i) -> __m256i { + _mm256_or_si256(_mm256_srli_epi32(x, 8), _mm256_slli_epi32(x, 32 - 8)) +} + +#[inline(always)] +unsafe fn rot7(x: __m256i) -> __m256i { + _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7)) +} + +#[inline(always)] +unsafe fn round(v: &mut [__m256i; 16], m: &[__m256i; 16], r: usize) { + v[0] = add(v[0], m[MSG_SCHEDULE[r][0]]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][2]]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][4]]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][6]]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot12(v[4]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][1]]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][3]]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][5]]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][7]]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot7(v[4]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + + v[0] = add(v[0], m[MSG_SCHEDULE[r][8]]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][10]]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][12]]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][14]]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], 
v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[4] = rot12(v[4]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][9]]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][11]]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][13]]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][15]]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + v[4] = rot7(v[4]); +} + +#[inline(always)] +unsafe fn interleave128(a: __m256i, b: __m256i) -> (__m256i, __m256i) { + ( + _mm256_permute2x128_si256(a, b, 0x20), + _mm256_permute2x128_si256(a, b, 0x31), + ) +} + +// There are several ways to do a transposition. We could do it naively, with 8 separate +// _mm256_set_epi32 instructions, referencing each of the 32 words explicitly. Or we could copy +// the vecs into contiguous storage and then use gather instructions. This third approach is to use +// a series of unpack instructions to interleave the vectors. In my benchmarks, interleaving is the +// fastest approach. To test this, run `cargo +nightly bench --bench libtest load_8` in the +// https://github.com/oconnor663/bao_experiments repo. +#[inline(always)] +unsafe fn transpose_vecs(vecs: &mut [__m256i; DEGREE]) { + // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high is 22/33/66/77. + let ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); + let ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); + let cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); + let cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); + let ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); + let ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); + let gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); + let gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); + + // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is 11/33. + let abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); + let abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); + let abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); + let abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); + let efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); + let efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); + let efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); + let efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); + + // Interleave 128-bit lanes. 
+ let (abcdefgh_0, abcdefgh_4) = interleave128(abcd_04, efgh_04); + let (abcdefgh_1, abcdefgh_5) = interleave128(abcd_15, efgh_15); + let (abcdefgh_2, abcdefgh_6) = interleave128(abcd_26, efgh_26); + let (abcdefgh_3, abcdefgh_7) = interleave128(abcd_37, efgh_37); + + vecs[0] = abcdefgh_0; + vecs[1] = abcdefgh_1; + vecs[2] = abcdefgh_2; + vecs[3] = abcdefgh_3; + vecs[4] = abcdefgh_4; + vecs[5] = abcdefgh_5; + vecs[6] = abcdefgh_6; + vecs[7] = abcdefgh_7; +} + +#[inline(always)] +unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m256i; 16] { + let mut vecs = [ + loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[4].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[5].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[6].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[7].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[4].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[5].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[6].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[7].add(block_offset + 1 * 4 * DEGREE)), + ]; + for prefetch in inputs.iter().take(DEGREE) { + _mm_prefetch(prefetch.add(block_offset + 256) as *const i8, _MM_HINT_T0); + } + for square in array_chunks_mut(&mut vecs) { + transpose_vecs(square); + } + vecs +} + +#[inline(always)] +unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m256i, __m256i) { + let mask = if increment_counter.yes() { !0 } else { 0 }; + ( + set8( + counter_low(counter + (mask & 0)), + counter_low(counter + (mask & 1)), + counter_low(counter + (mask & 2)), + counter_low(counter + (mask & 3)), + counter_low(counter + (mask & 4)), + counter_low(counter + (mask & 5)), + counter_low(counter + (mask & 6)), + counter_low(counter + (mask & 7)), + ), + set8( + counter_high(counter + (mask & 0)), + counter_high(counter + (mask & 1)), + counter_high(counter + (mask & 2)), + counter_high(counter + (mask & 3)), + counter_high(counter + (mask & 4)), + counter_high(counter + (mask & 5)), + counter_high(counter + (mask & 6)), + counter_high(counter + (mask & 7)), + ), + ) +} + +#[target_feature(enable = "avx2")] +pub unsafe fn hash8( + inputs: &[*const u8; DEGREE], + blocks: usize, + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8; DEGREE * OUT_LEN], +) { + let mut h_vecs = [ + set1(key[0]), + set1(key[1]), + set1(key[2]), + set1(key[3]), + set1(key[4]), + set1(key[5]), + set1(key[6]), + set1(key[7]), + ]; + let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); + let mut block_flags = flags | flags_start; + + for block in 0..blocks { + if block + 1 == blocks { + block_flags |= flags_end; + } + let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only + let block_flags_vec = set1(block_flags as u32); + let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); + + // The transposed compression function. Note that inlining this + // manually here improves compile times by a lot, compared to factoring + // it out into its own function and making it #[inline(always)]. 
Just + // guessing, it might have something to do with loop unrolling. + let mut v = [ + h_vecs[0], + h_vecs[1], + h_vecs[2], + h_vecs[3], + h_vecs[4], + h_vecs[5], + h_vecs[6], + h_vecs[7], + set1(IV[0]), + set1(IV[1]), + set1(IV[2]), + set1(IV[3]), + counter_low_vec, + counter_high_vec, + block_len_vec, + block_flags_vec, + ]; + round(&mut v, &msg_vecs, 0); + round(&mut v, &msg_vecs, 1); + round(&mut v, &msg_vecs, 2); + round(&mut v, &msg_vecs, 3); + round(&mut v, &msg_vecs, 4); + round(&mut v, &msg_vecs, 5); + round(&mut v, &msg_vecs, 6); + h_vecs[0] = xor(v[0], v[8]); + h_vecs[1] = xor(v[1], v[9]); + h_vecs[2] = xor(v[2], v[10]); + h_vecs[3] = xor(v[3], v[11]); + h_vecs[4] = xor(v[4], v[12]); + h_vecs[5] = xor(v[5], v[13]); + h_vecs[6] = xor(v[6], v[14]); + h_vecs[7] = xor(v[7], v[15]); + + block_flags = flags; + } + + transpose_vecs(&mut h_vecs); + storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); + storeu(h_vecs[1], out.as_mut_ptr().add(1 * 4 * DEGREE)); + storeu(h_vecs[2], out.as_mut_ptr().add(2 * 4 * DEGREE)); + storeu(h_vecs[3], out.as_mut_ptr().add(3 * 4 * DEGREE)); + storeu(h_vecs[4], out.as_mut_ptr().add(4 * 4 * DEGREE)); + storeu(h_vecs[5], out.as_mut_ptr().add(5 * 4 * DEGREE)); + storeu(h_vecs[6], out.as_mut_ptr().add(6 * 4 * DEGREE)); + storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); +} + +#[target_feature(enable = "avx2")] +pub unsafe fn hash_many( + mut inputs: &[&[u8; N]], + key: &CVWords, + mut counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + mut output: &mut [u8], +) { + debug_assert!(output.len() >= inputs.len() * OUT_LEN, "out too short"); + while inputs.len() >= DEGREE && output.len() >= DEGREE * OUT_LEN { + // Safe because the layout of arrays is guaranteed, and because the + // `blocks` count is determined statically from the argument type. + let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); + let blocks = N / BLOCK_LEN; + let (out, rem) = split_array_mut(output); + hash8( + input_ptrs, + blocks, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ); + if increment_counter.yes() { + counter += DEGREE as u64; + } + inputs = &inputs[DEGREE..]; + output = rem; + } + super::sse41::hash_many( + inputs, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + output, + ); +} + +#[cfg(test)] +mod test { + use super::*; + use crate::blake3::{avx2::transpose_vecs, platform, test::test_hash_many_fn}; + + #[test] + fn test_transpose() { + if !platform::avx2_detected() { + return; + } + + #[target_feature(enable = "avx2")] + unsafe fn transpose_wrapper(vecs: &mut [__m256i; DEGREE]) { + transpose_vecs(vecs); + } + + let mut matrix = [[0 as u32; DEGREE]; DEGREE]; + for i in 0..DEGREE { + for j in 0..DEGREE { + matrix[i][j] = (i * DEGREE + j) as u32; + } + } + + unsafe { + let mut vecs: [__m256i; DEGREE] = core::mem::transmute(matrix); + transpose_wrapper(&mut vecs); + matrix = core::mem::transmute(vecs); + } + + for i in 0..DEGREE { + for j in 0..DEGREE { + // Reversed indexes from above. 
+ assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); + } + } + } + + #[test] + fn test_hash_many() { + if !platform::avx2_detected() { + return; + } + test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/narcissus-core/src/blake3/rust_sse2.rs b/narcissus-core/src/blake3/rust_sse2.rs new file mode 100644 index 0000000..a49fa1b --- /dev/null +++ b/narcissus-core/src/blake3/rust_sse2.rs @@ -0,0 +1,774 @@ +#[cfg(target_arch = "x86")] +use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; + +use crate::slice::{array_chunks_mut, split_array_mut}; + +use super::{ + counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, + OUT_LEN, +}; + +pub const DEGREE: usize = 4; + +#[inline(always)] +unsafe fn loadu(src: *const u8) -> __m128i { + // This is an unaligned load, so the pointer cast is allowed. + _mm_loadu_si128(src as *const __m128i) +} + +#[inline(always)] +unsafe fn storeu(src: __m128i, dest: *mut u8) { + // This is an unaligned store, so the pointer cast is allowed. + _mm_storeu_si128(dest as *mut __m128i, src) +} + +#[inline(always)] +unsafe fn add(a: __m128i, b: __m128i) -> __m128i { + _mm_add_epi32(a, b) +} + +#[inline(always)] +unsafe fn xor(a: __m128i, b: __m128i) -> __m128i { + _mm_xor_si128(a, b) +} + +#[inline(always)] +unsafe fn set1(x: u32) -> __m128i { + _mm_set1_epi32(x as i32) +} + +#[inline(always)] +unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i { + _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32) +} + +// These rotations are the "simple/shifts version". For the +// "complicated/shuffles version", see +// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. +// For a discussion of the tradeoffs, see +// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug +// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better +// on recent x86 chips. + +#[inline(always)] +unsafe fn rot16(a: __m128i) -> __m128i { + _mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16)) +} + +#[inline(always)] +unsafe fn rot12(a: __m128i) -> __m128i { + _mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12)) +} + +#[inline(always)] +unsafe fn rot8(a: __m128i) -> __m128i { + _mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8)) +} + +#[inline(always)] +unsafe fn rot7(a: __m128i) -> __m128i { + _mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7)) +} + +#[inline(always)] +unsafe fn g1( + row0: &mut __m128i, + row1: &mut __m128i, + row2: &mut __m128i, + row3: &mut __m128i, + m: __m128i, +) { + *row0 = add(add(*row0, m), *row1); + *row3 = xor(*row3, *row0); + *row3 = rot16(*row3); + *row2 = add(*row2, *row3); + *row1 = xor(*row1, *row2); + *row1 = rot12(*row1); +} + +#[inline(always)] +unsafe fn g2( + row0: &mut __m128i, + row1: &mut __m128i, + row2: &mut __m128i, + row3: &mut __m128i, + m: __m128i, +) { + *row0 = add(add(*row0, m), *row1); + *row3 = xor(*row3, *row0); + *row3 = rot8(*row3); + *row2 = add(*row2, *row3); + *row1 = xor(*row1, *row2); + *row1 = rot7(*row1); +} + +// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479. +macro_rules! _MM_SHUFFLE { + ($z:expr, $y:expr, $x:expr, $w:expr) => { + ($z << 6) | ($y << 4) | ($x << 2) | $w + }; +} + +macro_rules! 
shuffle2 { + ($a:expr, $b:expr, $c:expr) => { + _mm_castps_si128(_mm_shuffle_ps( + _mm_castsi128_ps($a), + _mm_castsi128_ps($b), + $c, + )) + }; +} + +// Note the optimization here of leaving row1 as the unrotated row, rather than +// row0. All the message loads below are adjusted to compensate for this. See +// discussion at https://github.com/sneves/blake2-avx2/pull/4 +#[inline(always)] +unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1)); +} + +#[inline(always)] +unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3)); +} + +#[inline(always)] +unsafe fn blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i { + let bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); + let mut mask = _mm_set1_epi16(imm8 as i16); + mask = _mm_and_si128(mask, bits); + mask = _mm_cmpeq_epi16(mask, bits); + _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a)) +} + +#[inline(always)] +unsafe fn compress_pre( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [__m128i; 4] { + let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8); + let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8); + let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]); + let row3 = &mut set4( + counter_low(counter), + counter_high(counter), + block_len as u32, + flags as u32, + ); + + let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE)); + let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE)); + let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE)); + let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE)); + + let mut t0; + let mut t1; + let mut t2; + let mut t3; + let mut tt; + + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel. + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0 + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1 + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8 + t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14 + g1(row0, row1, row2, row3, t2); + t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9 + t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15 + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. 
+ t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, 
row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + + [*row0, *row1, *row2, *row3] +} + +#[target_feature(enable = "sse2")] +pub unsafe fn compress_in_place( + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) { + let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags); + storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8); + storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8); +} + +#[target_feature(enable = "sse2")] +pub unsafe fn compress_xof( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u8; 64] { + let [mut row0, mut row1, mut row2, mut row3] = + compress_pre(cv, block, block_len, counter, flags); + row0 = xor(row0, row2); + row1 = xor(row1, row3); + row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8)); + row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8)); + core::mem::transmute([row0, row1, row2, row3]) +} + +#[inline(always)] +unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) { + v[0] = add(v[0], m[MSG_SCHEDULE[r][0]]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][2]]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][4]]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][6]]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot12(v[4]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][1]]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][3]]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][5]]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][7]]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot7(v[4]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + 
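The first half of `round` above is the column step of a BLAKE3 round; the second half, which follows, is the diagonal step. Viewed as a 4x4 matrix of state words, the index groups are as in this small illustrative sketch (in the transposed form used by `hash4`, each `v[i]` holds state word i for four inputs at once):

// Sketch: the 4x4 state index groups mixed by the column step and the diagonal step.
const COLUMN_GROUPS: [[usize; 4]; 4] = [
    [0, 4, 8, 12],
    [1, 5, 9, 13],
    [2, 6, 10, 14],
    [3, 7, 11, 15],
];
const DIAGONAL_GROUPS: [[usize; 4]; 4] = [
    [0, 5, 10, 15],
    [1, 6, 11, 12],
    [2, 7, 8, 13],
    [3, 4, 9, 14],
];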
+ v[0] = add(v[0], m[MSG_SCHEDULE[r][8]]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][10]]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][12]]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][14]]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[4] = rot12(v[4]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][9]]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][11]]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][13]]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][15]]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + v[4] = rot7(v[4]); +} + +#[inline(always)] +unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) { + // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is + // 22/33. Note that this doesn't split the vector into two lanes, as the + // AVX2 counterparts do. + let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); + let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); + let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); + let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); + + // Interleave 64-bit lanes. 
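Taken together, the two unpack stages in `transpose_vecs` amount to a 4x4 transpose of 32-bit words: element (i, j) moves to (j, i), which is what the `test_transpose` tests further down verify. A plain scalar sketch of the same net effect (illustration only):

// Sketch: the net effect of transpose_vecs on a 4x4 matrix of u32.
fn transpose4(m: [[u32; 4]; 4]) -> [[u32; 4]; 4] {
    let mut out = [[0u32; 4]; 4];
    for i in 0..4 {
        for j in 0..4 {
            out[j][i] = m[i][j];
        }
    }
    out
}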
+ let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); + let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); + let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); + let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); + + vecs[0] = abcd_0; + vecs[1] = abcd_1; + vecs[2] = abcd_2; + vecs[3] = abcd_3; +} + +#[inline(always)] +unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] { + let mut vecs = [ + loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)), + ]; + for prefetch in inputs.iter().take(DEGREE) { + _mm_prefetch(prefetch.add(block_offset + 256) as *const i8, _MM_HINT_T0); + } + for square in array_chunks_mut(&mut vecs) { + transpose_vecs(square); + } + vecs +} + +#[inline(always)] +unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) { + let mask = if increment_counter.yes() { !0 } else { 0 }; + ( + set4( + counter_low(counter + (mask & 0)), + counter_low(counter + (mask & 1)), + counter_low(counter + (mask & 2)), + counter_low(counter + (mask & 3)), + ), + set4( + counter_high(counter + (mask & 0)), + counter_high(counter + (mask & 1)), + counter_high(counter + (mask & 2)), + counter_high(counter + (mask & 3)), + ), + ) +} + +#[target_feature(enable = "sse2")] +pub unsafe fn hash4( + inputs: &[*const u8; DEGREE], + blocks: usize, + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8; DEGREE * OUT_LEN], +) { + let mut h_vecs = [ + set1(key[0]), + set1(key[1]), + set1(key[2]), + set1(key[3]), + set1(key[4]), + set1(key[5]), + set1(key[6]), + set1(key[7]), + ]; + let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); + let mut block_flags = flags | flags_start; + + for block in 0..blocks { + if block + 1 == blocks { + block_flags |= flags_end; + } + let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only + let block_flags_vec = set1(block_flags as u32); + let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); + + // The transposed compression function. Note that inlining this + // manually here improves compile times by a lot, compared to factoring + // it out into its own function and making it #[inline(always)]. Just + // guessing, it might have something to do with loop unrolling. 
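The array assembled just below is the 16-word BLAKE3 compression state, with each vector holding one word for all four inputs. Per input, the layout is eight chaining-value words, four IV words, the split 64-bit counter, the block length, and the flags. A scalar sketch of that per-input layout (the parameter names here are illustrative stand-ins):

// Sketch: per-input layout of the 16-word state assembled by hash4.
fn initial_state(cv: &[u32; 8], iv: &[u32; 4], counter: u64, block_len: u32, flags: u32) -> [u32; 16] {
    [
        cv[0], cv[1], cv[2], cv[3], cv[4], cv[5], cv[6], cv[7],
        iv[0], iv[1], iv[2], iv[3],
        counter as u32,         // counter_low
        (counter >> 32) as u32, // counter_high
        block_len,
        flags,
    ]
}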
+ let mut v = [ + h_vecs[0], + h_vecs[1], + h_vecs[2], + h_vecs[3], + h_vecs[4], + h_vecs[5], + h_vecs[6], + h_vecs[7], + set1(IV[0]), + set1(IV[1]), + set1(IV[2]), + set1(IV[3]), + counter_low_vec, + counter_high_vec, + block_len_vec, + block_flags_vec, + ]; + round(&mut v, &msg_vecs, 0); + round(&mut v, &msg_vecs, 1); + round(&mut v, &msg_vecs, 2); + round(&mut v, &msg_vecs, 3); + round(&mut v, &msg_vecs, 4); + round(&mut v, &msg_vecs, 5); + round(&mut v, &msg_vecs, 6); + h_vecs[0] = xor(v[0], v[8]); + h_vecs[1] = xor(v[1], v[9]); + h_vecs[2] = xor(v[2], v[10]); + h_vecs[3] = xor(v[3], v[11]); + h_vecs[4] = xor(v[4], v[12]); + h_vecs[5] = xor(v[5], v[13]); + h_vecs[6] = xor(v[6], v[14]); + h_vecs[7] = xor(v[7], v[15]); + + block_flags = flags; + } + + for square in array_chunks_mut(&mut h_vecs) { + transpose_vecs(square); + } + + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. + storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); + storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE)); + storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE)); + storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE)); + storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE)); + storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE)); + storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE)); + storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); +} + +#[target_feature(enable = "sse2")] +unsafe fn hash1( + input: &[u8; N], + key: &CVWords, + counter: u64, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut CVBytes, +) { + debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks"); + let mut cv = *key; + let mut block_flags = flags | flags_start; + let mut slice = &input[..]; + while slice.len() >= BLOCK_LEN { + if slice.len() == BLOCK_LEN { + block_flags |= flags_end; + } + compress_in_place( + &mut cv, + slice[0..BLOCK_LEN].try_into().unwrap(), + BLOCK_LEN as u8, + counter, + block_flags, + ); + block_flags = flags; + slice = &slice[BLOCK_LEN..]; + } + *out = core::mem::transmute(cv); // x86 is little-endian +} + +#[target_feature(enable = "sse2")] +pub unsafe fn hash_many( + mut inputs: &[&[u8; N]], + key: &CVWords, + mut counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + mut output: &mut [u8], +) { + debug_assert!(output.len() >= inputs.len() * OUT_LEN, "out too short"); + while inputs.len() >= DEGREE && output.len() >= DEGREE * OUT_LEN { + // Safe because the layout of arrays is guaranteed, and because the + // `blocks` count is determined statically from the argument type. 
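The cast on the next line reinterprets the first DEGREE `&[u8; N]` references as raw `*const u8` pointers, relying on references to sized types having plain pointer layout. A safe, slightly more verbose way to build the same array, as a sketch with DEGREE fixed at 4:

// Sketch: constructing the per-input pointer array without a reinterpreting cast.
fn input_ptrs<const N: usize>(inputs: &[&[u8; N]]) -> [*const u8; 4] {
    assert!(inputs.len() >= 4, "need at least DEGREE inputs");
    [
        inputs[0].as_ptr(),
        inputs[1].as_ptr(),
        inputs[2].as_ptr(),
        inputs[3].as_ptr(),
    ]
}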
+ let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); + let blocks = N / BLOCK_LEN; + + let (out, rem) = split_array_mut(output); + hash4( + input_ptrs, + blocks, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ); + if increment_counter.yes() { + counter += DEGREE as u64; + } + inputs = &inputs[DEGREE..]; + output = rem; + } + for (&input, output) in inputs.iter().zip(array_chunks_mut(output)) { + hash1(input, key, counter, flags, flags_start, flags_end, output); + if increment_counter.yes() { + counter += 1; + } + } +} + +#[cfg(test)] +mod test { + use crate::blake3::{ + platform, + test::{test_compress_fn, test_hash_many_fn}, + }; + + use super::*; + + #[test] + fn test_transpose() { + if !platform::sse2_detected() { + return; + } + + #[target_feature(enable = "sse2")] + unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) { + transpose_vecs(vecs); + } + + let mut matrix = [[0 as u32; DEGREE]; DEGREE]; + for i in 0..DEGREE { + for j in 0..DEGREE { + matrix[i][j] = (i * DEGREE + j) as u32; + } + } + + unsafe { + let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix); + transpose_wrapper(&mut vecs); + matrix = core::mem::transmute(vecs); + } + + for i in 0..DEGREE { + for j in 0..DEGREE { + // Reversed indexes from above. + assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); + } + } + } + + #[test] + fn test_compress() { + if !platform::sse2_detected() { + return; + } + test_compress_fn(compress_in_place, compress_xof); + } + + #[test] + fn test_hash_many() { + if !platform::sse2_detected() { + return; + } + test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/narcissus-core/src/blake3/rust_sse41.rs b/narcissus-core/src/blake3/rust_sse41.rs new file mode 100644 index 0000000..6da3aee --- /dev/null +++ b/narcissus-core/src/blake3/rust_sse41.rs @@ -0,0 +1,762 @@ +#[cfg(target_arch = "x86")] +use core::arch::x86::*; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::*; + +use crate::slice::{array_chunks_mut, split_array_mut}; + +use super::{ + counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, + OUT_LEN, +}; + +pub const DEGREE: usize = 4; + +#[inline(always)] +unsafe fn loadu(src: *const u8) -> __m128i { + // This is an unaligned load, so the pointer cast is allowed. + _mm_loadu_si128(src as *const __m128i) +} + +#[inline(always)] +unsafe fn storeu(src: __m128i, dest: *mut u8) { + // This is an unaligned store, so the pointer cast is allowed. + _mm_storeu_si128(dest as *mut __m128i, src) +} + +#[inline(always)] +unsafe fn add(a: __m128i, b: __m128i) -> __m128i { + _mm_add_epi32(a, b) +} + +#[inline(always)] +unsafe fn xor(a: __m128i, b: __m128i) -> __m128i { + _mm_xor_si128(a, b) +} + +#[inline(always)] +unsafe fn set1(x: u32) -> __m128i { + _mm_set1_epi32(x as i32) +} + +#[inline(always)] +unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i { + _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32) +} + +// These rotations are the "simple/shifts version". For the +// "complicated/shuffles version", see +// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. +// For a discussion of the tradeoffs, see +// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug +// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better +// on recent x86 chips. 
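Each `rotN` below is a 32-bit rotate right by N bits, written as a shift pair because SSE2/SSE4.1 provide no 32-bit vector rotate. The scalar equivalence, for reference (illustrative sketch):

// Sketch: the shift pair in rot16/rot12/rot8/rot7 is a plain rotate-right.
fn rot_right(x: u32, n: u32) -> u32 {
    (x >> n) | (x << (32 - n)) // valid for the fixed n in {7, 8, 12, 16} used here
}

#[test]
fn rot_right_matches_builtin() {
    for &n in &[7u32, 8, 12, 16] {
        assert_eq!(rot_right(0x0123_4567, n), 0x0123_4567u32.rotate_right(n));
    }
}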
+ +#[inline(always)] +unsafe fn rot16(a: __m128i) -> __m128i { + _mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16)) +} + +#[inline(always)] +unsafe fn rot12(a: __m128i) -> __m128i { + _mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12)) +} + +#[inline(always)] +unsafe fn rot8(a: __m128i) -> __m128i { + _mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8)) +} + +#[inline(always)] +unsafe fn rot7(a: __m128i) -> __m128i { + _mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7)) +} + +#[inline(always)] +unsafe fn g1( + row0: &mut __m128i, + row1: &mut __m128i, + row2: &mut __m128i, + row3: &mut __m128i, + m: __m128i, +) { + *row0 = add(add(*row0, m), *row1); + *row3 = xor(*row3, *row0); + *row3 = rot16(*row3); + *row2 = add(*row2, *row3); + *row1 = xor(*row1, *row2); + *row1 = rot12(*row1); +} + +#[inline(always)] +unsafe fn g2( + row0: &mut __m128i, + row1: &mut __m128i, + row2: &mut __m128i, + row3: &mut __m128i, + m: __m128i, +) { + *row0 = add(add(*row0, m), *row1); + *row3 = xor(*row3, *row0); + *row3 = rot8(*row3); + *row2 = add(*row2, *row3); + *row1 = xor(*row1, *row2); + *row1 = rot7(*row1); +} + +// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479. +macro_rules! _MM_SHUFFLE { + ($z:expr, $y:expr, $x:expr, $w:expr) => { + ($z << 6) | ($y << 4) | ($x << 2) | $w + }; +} + +macro_rules! shuffle2 { + ($a:expr, $b:expr, $c:expr) => { + _mm_castps_si128(_mm_shuffle_ps( + _mm_castsi128_ps($a), + _mm_castsi128_ps($b), + $c, + )) + }; +} + +// Note the optimization here of leaving row1 as the unrotated row, rather than +// row0. All the message loads below are adjusted to compensate for this. See +// discussion at https://github.com/sneves/blake2-avx2/pull/4 +#[inline(always)] +unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1)); +} + +#[inline(always)] +unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { + *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1)); + *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); + *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3)); +} + +#[inline(always)] +unsafe fn compress_pre( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [__m128i; 4] { + let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8); + let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8); + let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]); + let row3 = &mut set4( + counter_low(counter), + counter_high(counter), + block_len as u32, + flags as u32, + ); + + let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE)); + let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE)); + let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE)); + let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE)); + + let mut t0; + let mut t1; + let mut t2; + let mut t3; + let mut tt; + + // Round 1. The first round permutes the message words from the original + // input order, into the groups that get mixed in parallel. 
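The `g1` and `g2` helpers above are the two halves of the BLAKE3 quarter-round G; per lane they mix one (a, b, c, d) quadruple with two message words. A scalar sketch of one full G application (illustration only):

// Sketch: one G application on scalar words, split the same way as g1/g2 above.
fn g(a: &mut u32, b: &mut u32, c: &mut u32, d: &mut u32, mx: u32, my: u32) {
    // g1 half: mix in mx, rotate by 16 and 12.
    *a = (*a).wrapping_add(mx).wrapping_add(*b);
    *d = (*d ^ *a).rotate_right(16);
    *c = (*c).wrapping_add(*d);
    *b = (*b ^ *c).rotate_right(12);
    // g2 half: mix in my, rotate by 8 and 7.
    *a = (*a).wrapping_add(my).wrapping_add(*b);
    *d = (*d ^ *a).rotate_right(8);
    *c = (*c).wrapping_add(*d);
    *b = (*b ^ *c).rotate_right(7);
}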
+ t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0 + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1 + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8 + t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14 + g1(row0, row1, row2, row3, t2); + t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9 + t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15 + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 2. This round and all following rounds apply a fixed permutation + // to the message words from the round before. + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 3 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 4 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 5 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, 
_MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 6 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + m0 = t0; + m1 = t1; + m2 = t2; + m3 = t3; + + // Round 7 + t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); + t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); + g1(row0, row1, row2, row3, t0); + t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); + tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); + t1 = _mm_blend_epi16(tt, t1, 0xCC); + g2(row0, row1, row2, row3, t1); + diagonalize(row0, row2, row3); + t2 = _mm_unpacklo_epi64(m3, m1); + tt = _mm_blend_epi16(t2, m2, 0xC0); + t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); + g1(row0, row1, row2, row3, t2); + t3 = _mm_unpackhi_epi32(m1, m3); + tt = _mm_unpacklo_epi32(m2, t3); + t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); + g2(row0, row1, row2, row3, t3); + undiagonalize(row0, row2, row3); + + [*row0, *row1, *row2, *row3] +} + +#[target_feature(enable = "sse4.1")] +pub unsafe fn compress_in_place( + cv: &mut CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) { + let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags); + storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8); + storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8); +} + +#[target_feature(enable = "sse4.1")] +pub unsafe fn compress_xof( + cv: &CVWords, + block: &[u8; BLOCK_LEN], + block_len: u8, + counter: u64, + flags: u8, +) -> [u8; 64] { + let [mut row0, mut row1, mut row2, mut row3] = + compress_pre(cv, block, block_len, counter, flags); + row0 = xor(row0, row2); + row1 = xor(row1, row3); + row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8)); + row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8)); + core::mem::transmute([row0, row1, row2, row3]) +} + +#[inline(always)] +unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) { + v[0] = add(v[0], m[MSG_SCHEDULE[r][0]]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][2]]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][4]]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][6]]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[15] = rot16(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot12(v[4]); + v[5] = 
rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][1]]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][3]]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][5]]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][7]]); + v[0] = add(v[0], v[4]); + v[1] = add(v[1], v[5]); + v[2] = add(v[2], v[6]); + v[3] = add(v[3], v[7]); + v[12] = xor(v[12], v[0]); + v[13] = xor(v[13], v[1]); + v[14] = xor(v[14], v[2]); + v[15] = xor(v[15], v[3]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[15] = rot8(v[15]); + v[8] = add(v[8], v[12]); + v[9] = add(v[9], v[13]); + v[10] = add(v[10], v[14]); + v[11] = add(v[11], v[15]); + v[4] = xor(v[4], v[8]); + v[5] = xor(v[5], v[9]); + v[6] = xor(v[6], v[10]); + v[7] = xor(v[7], v[11]); + v[4] = rot7(v[4]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + + v[0] = add(v[0], m[MSG_SCHEDULE[r][8]]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][10]]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][12]]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][14]]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot16(v[15]); + v[12] = rot16(v[12]); + v[13] = rot16(v[13]); + v[14] = rot16(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot12(v[5]); + v[6] = rot12(v[6]); + v[7] = rot12(v[7]); + v[4] = rot12(v[4]); + v[0] = add(v[0], m[MSG_SCHEDULE[r][9]]); + v[1] = add(v[1], m[MSG_SCHEDULE[r][11]]); + v[2] = add(v[2], m[MSG_SCHEDULE[r][13]]); + v[3] = add(v[3], m[MSG_SCHEDULE[r][15]]); + v[0] = add(v[0], v[5]); + v[1] = add(v[1], v[6]); + v[2] = add(v[2], v[7]); + v[3] = add(v[3], v[4]); + v[15] = xor(v[15], v[0]); + v[12] = xor(v[12], v[1]); + v[13] = xor(v[13], v[2]); + v[14] = xor(v[14], v[3]); + v[15] = rot8(v[15]); + v[12] = rot8(v[12]); + v[13] = rot8(v[13]); + v[14] = rot8(v[14]); + v[10] = add(v[10], v[15]); + v[11] = add(v[11], v[12]); + v[8] = add(v[8], v[13]); + v[9] = add(v[9], v[14]); + v[5] = xor(v[5], v[10]); + v[6] = xor(v[6], v[11]); + v[7] = xor(v[7], v[8]); + v[4] = xor(v[4], v[9]); + v[5] = rot7(v[5]); + v[6] = rot7(v[6]); + v[7] = rot7(v[7]); + v[4] = rot7(v[4]); +} + +#[inline(always)] +unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) { + // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is + // 22/33. Note that this doesn't split the vector into two lanes, as the + // AVX2 counterparts do. + let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); + let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); + let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); + let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); + + // Interleave 64-bit lanes. 
+ let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); + let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); + let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); + let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); + + vecs[0] = abcd_0; + vecs[1] = abcd_1; + vecs[2] = abcd_2; + vecs[3] = abcd_3; +} + +#[inline(always)] +unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] { + let mut vecs = [ + loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)), + loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)), + loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)), + ]; + for prefetch in inputs.iter().take(DEGREE) { + _mm_prefetch(prefetch.add(block_offset + 256) as *const i8, _MM_HINT_T0); + } + for square in array_chunks_mut(&mut vecs) { + transpose_vecs(square); + } + vecs +} + +#[inline(always)] +unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) { + let mask = if increment_counter.yes() { !0 } else { 0 }; + ( + set4( + counter_low(counter + (mask & 0)), + counter_low(counter + (mask & 1)), + counter_low(counter + (mask & 2)), + counter_low(counter + (mask & 3)), + ), + set4( + counter_high(counter + (mask & 0)), + counter_high(counter + (mask & 1)), + counter_high(counter + (mask & 2)), + counter_high(counter + (mask & 3)), + ), + ) +} + +#[target_feature(enable = "sse4.1")] +pub unsafe fn hash4( + inputs: &[*const u8; DEGREE], + blocks: usize, + key: &CVWords, + counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut [u8; DEGREE * OUT_LEN], +) { + let mut h_vecs = [ + set1(key[0]), + set1(key[1]), + set1(key[2]), + set1(key[3]), + set1(key[4]), + set1(key[5]), + set1(key[6]), + set1(key[7]), + ]; + let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); + let mut block_flags = flags | flags_start; + + for block in 0..blocks { + if block + 1 == blocks { + block_flags |= flags_end; + } + let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only + let block_flags_vec = set1(block_flags as u32); + let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); + + // The transposed compression function. Note that inlining this + // manually here improves compile times by a lot, compared to factoring + // it out into its own function and making it #[inline(always)]. Just + // guessing, it might have something to do with loop unrolling. 
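`load_counters` above prepares per-lane block counters: with IncrementCounter::Yes, lane i gets counter + i, otherwise every lane sees the same counter, and the all-ones/all-zeros mask folds that choice into plain arithmetic. A scalar sketch of the same trick (illustration only):

// Sketch: the mask trick used by load_counters, on plain u64 values.
fn lane_counters(counter: u64, increment: bool) -> [u64; 4] {
    let mask: u64 = if increment { !0 } else { 0 }; // !0 keeps the lane offset, 0 discards it
    [
        counter + (mask & 0),
        counter + (mask & 1),
        counter + (mask & 2),
        counter + (mask & 3),
    ]
}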
+ let mut v = [ + h_vecs[0], + h_vecs[1], + h_vecs[2], + h_vecs[3], + h_vecs[4], + h_vecs[5], + h_vecs[6], + h_vecs[7], + set1(IV[0]), + set1(IV[1]), + set1(IV[2]), + set1(IV[3]), + counter_low_vec, + counter_high_vec, + block_len_vec, + block_flags_vec, + ]; + round(&mut v, &msg_vecs, 0); + round(&mut v, &msg_vecs, 1); + round(&mut v, &msg_vecs, 2); + round(&mut v, &msg_vecs, 3); + round(&mut v, &msg_vecs, 4); + round(&mut v, &msg_vecs, 5); + round(&mut v, &msg_vecs, 6); + h_vecs[0] = xor(v[0], v[8]); + h_vecs[1] = xor(v[1], v[9]); + h_vecs[2] = xor(v[2], v[10]); + h_vecs[3] = xor(v[3], v[11]); + h_vecs[4] = xor(v[4], v[12]); + h_vecs[5] = xor(v[5], v[13]); + h_vecs[6] = xor(v[6], v[14]); + h_vecs[7] = xor(v[7], v[15]); + + block_flags = flags; + } + + for square in array_chunks_mut(&mut h_vecs) { + transpose_vecs(square); + } + + // The first four vecs now contain the first half of each output, and the + // second four vecs contain the second half of each output. + storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); + storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE)); + storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE)); + storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE)); + storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE)); + storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE)); + storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE)); + storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); +} + +#[target_feature(enable = "sse4.1")] +unsafe fn hash1( + input: &[u8; N], + key: &CVWords, + counter: u64, + flags: u8, + flags_start: u8, + flags_end: u8, + out: &mut CVBytes, +) { + debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks"); + let mut cv = *key; + let mut block_flags = flags | flags_start; + let mut slice = &input[..]; + while slice.len() >= BLOCK_LEN { + if slice.len() == BLOCK_LEN { + block_flags |= flags_end; + } + compress_in_place( + &mut cv, + slice[0..BLOCK_LEN].try_into().unwrap(), + BLOCK_LEN as u8, + counter, + block_flags, + ); + block_flags = flags; + slice = &slice[BLOCK_LEN..]; + } + *out = core::mem::transmute(cv); // x86 is little-endian +} + +#[target_feature(enable = "sse4.1")] +pub unsafe fn hash_many( + mut inputs: &[&[u8; N]], + key: &CVWords, + mut counter: u64, + increment_counter: IncrementCounter, + flags: u8, + flags_start: u8, + flags_end: u8, + mut output: &mut [u8], +) { + debug_assert!(output.len() >= inputs.len() * OUT_LEN, "out too short"); + while inputs.len() >= DEGREE && output.len() >= DEGREE * OUT_LEN { + // Safe because the layout of arrays is guaranteed, and because the + // `blocks` count is determined statically from the argument type. 
+ let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); + let blocks = N / BLOCK_LEN; + + let (out, rem) = split_array_mut(output); + hash4( + input_ptrs, + blocks, + key, + counter, + increment_counter, + flags, + flags_start, + flags_end, + out, + ); + if increment_counter.yes() { + counter += DEGREE as u64; + } + inputs = &inputs[DEGREE..]; + output = rem; + } + for (&input, output) in inputs.iter().zip(array_chunks_mut(output)) { + hash1(input, key, counter, flags, flags_start, flags_end, output); + if increment_counter.yes() { + counter += 1; + } + } +} + +#[cfg(test)] +mod test { + use super::super::platform; + use super::super::test::*; + use super::*; + + #[test] + fn test_transpose() { + if !platform::sse41_detected() { + return; + } + + #[target_feature(enable = "sse4.1")] + unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) { + transpose_vecs(vecs); + } + + let mut matrix = [[0 as u32; DEGREE]; DEGREE]; + for i in 0..DEGREE { + for j in 0..DEGREE { + matrix[i][j] = (i * DEGREE + j) as u32; + } + } + + unsafe { + let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix); + transpose_wrapper(&mut vecs); + matrix = core::mem::transmute(vecs); + } + + for i in 0..DEGREE { + for j in 0..DEGREE { + // Reversed indexes from above. + assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); + } + } + } + + #[test] + fn test_compress() { + if !platform::sse41_detected() { + return; + } + test_compress_fn(compress_in_place, compress_xof); + } + + #[test] + fn test_hash_many() { + if !platform::sse41_detected() { + return; + } + test_hash_many_fn(hash_many, hash_many); + } +} diff --git a/narcissus-core/src/blake3/test.rs b/narcissus-core/src/blake3/test.rs new file mode 100644 index 0000000..04597ef --- /dev/null +++ b/narcissus-core/src/blake3/test.rs @@ -0,0 +1,507 @@ +use crate::blake3::reference_impl; +use crate::rand::Pcg64; +use crate::slice::array_chunks; +use crate::FixedVec; + +use super::portable; +use super::{CVBytes, CVWords, IncrementCounter, BLOCK_LEN, CHUNK_LEN, OUT_LEN}; +use core::usize; + +// Interesting input lengths to run tests on. +pub const TEST_CASES: &[usize] = &[ + 0, + 1, + 2, + 3, + 4, + 5, + 6, + 7, + 8, + BLOCK_LEN - 1, + BLOCK_LEN, + BLOCK_LEN + 1, + 2 * BLOCK_LEN - 1, + 2 * BLOCK_LEN, + 2 * BLOCK_LEN + 1, + CHUNK_LEN - 1, + CHUNK_LEN, + CHUNK_LEN + 1, + 2 * CHUNK_LEN, + 2 * CHUNK_LEN + 1, + 3 * CHUNK_LEN, + 3 * CHUNK_LEN + 1, + 4 * CHUNK_LEN, + 4 * CHUNK_LEN + 1, + 5 * CHUNK_LEN, + 5 * CHUNK_LEN + 1, + 6 * CHUNK_LEN, + 6 * CHUNK_LEN + 1, + 7 * CHUNK_LEN, + 7 * CHUNK_LEN + 1, + 8 * CHUNK_LEN, + 8 * CHUNK_LEN + 1, + 16 * CHUNK_LEN, // AVX512's bandwidth + 31 * CHUNK_LEN, // 16 + 8 + 4 + 2 + 1 + 100 * CHUNK_LEN, // subtrees larger than MAX_SIMD_DEGREE chunks +]; + +pub const TEST_CASES_MAX: usize = 100 * CHUNK_LEN; + +// There's a test to make sure these two are equal below. +pub const TEST_KEY: CVBytes = *b"whats the Elvish word for friend"; +pub const TEST_KEY_WORDS: CVWords = [ + 1952540791, 1752440947, 1816469605, 1752394102, 1919907616, 1868963940, 1919295602, 1684956521, +]; + +// Paint the input with a repeating byte pattern. We use a cycle length of 251, +// because that's the largets prime number less than 256. This makes it +// unlikely to swapping any two adjacent input blocks or chunks will give the +// same answer. 
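Since 64-byte blocks and 1024-byte chunks are not multiples of 251, blocks at nearby offsets start at different phases of the pattern, so accidentally swapping them changes the input. A small self-contained check of that property (block size restated here only for illustration):

// Sketch: adjacent 64-byte blocks of the painted buffer always differ, because
// the block length is not a multiple of the 251-byte cycle.
fn paint(buf: &mut [u8]) {
    for (i, b) in buf.iter_mut().enumerate() {
        *b = (i % 251) as u8;
    }
}

#[test]
fn adjacent_painted_blocks_differ() {
    const BLOCK_LEN: usize = 64; // same value as the BLOCK_LEN constant in this patch
    let mut buf = [0u8; 8 * BLOCK_LEN];
    paint(&mut buf);
    let blocks: Vec<&[u8]> = buf.chunks_exact(BLOCK_LEN).collect();
    for pair in blocks.windows(2) {
        assert_ne!(pair[0], pair[1]);
    }
}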
+pub fn paint_test_input(buf: &mut [u8]) {
+    for (i, b) in buf.iter_mut().enumerate() {
+        *b = (i % 251) as u8;
+    }
+}
+
+type CompressInPlaceFn =
+    unsafe fn(cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8);
+
+type CompressXofFn = unsafe fn(
+    cv: &CVWords,
+    block: &[u8; BLOCK_LEN],
+    block_len: u8,
+    counter: u64,
+    flags: u8,
+) -> [u8; 64];
+
+// A shared helper function for platform-specific tests.
+pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn: CompressXofFn) {
+    let initial_state = TEST_KEY_WORDS;
+    let block_len: u8 = 61;
+    let mut block = [0; BLOCK_LEN];
+    paint_test_input(&mut block[..block_len as usize]);
+    // Use a counter with set bits in both 32-bit words.
+    let counter = (5u64 << 32) + 6;
+    let flags = super::CHUNK_END | super::ROOT | super::KEYED_HASH;
+
+    let portable_out =
+        portable::compress_xof(&initial_state, &block, block_len, counter as u64, flags);
+
+    let mut test_state = initial_state;
+    unsafe { compress_in_place_fn(&mut test_state, &block, block_len, counter as u64, flags) };
+    let test_state_bytes = super::platform::le_bytes_from_words_32(&test_state);
+    let test_xof =
+        unsafe { compress_xof_fn(&initial_state, &block, block_len, counter as u64, flags) };
+
+    assert_eq!(&portable_out[..32], &test_state_bytes[..]);
+    assert_eq!(&portable_out[..], &test_xof[..]);
+}
+
+type HashManyFn<A> = unsafe fn(
+    inputs: &[&A],
+    key: &CVWords,
+    counter: u64,
+    increment_counter: IncrementCounter,
+    flags: u8,
+    flags_start: u8,
+    flags_end: u8,
+    out: &mut [u8],
+);
+
+// A shared helper function for platform-specific tests.
+pub fn test_hash_many_fn(
+    hash_many_chunks_fn: HashManyFn<[u8; CHUNK_LEN]>,
+    hash_many_parents_fn: HashManyFn<[u8; 2 * OUT_LEN]>,
+) {
+    // Test a few different initial counter values.
+    // - 0: The base case.
+    // - u32::MAX: The low word of the counter overflows for all inputs except the first.
+    // - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR
+    //   when you're supposed to ANDNOT...
+    let initial_counters = [0, u32::MAX as u64, i32::MAX as u64];
+    for counter in initial_counters {
+        // 31 (16 + 8 + 4 + 2 + 1) inputs
+        const NUM_INPUTS: usize = 31;
+        let mut input_buf = [0; CHUNK_LEN * NUM_INPUTS];
+        super::test::paint_test_input(&mut input_buf);
+
+        // First hash chunks.
+        let mut chunks = FixedVec::<&[u8; CHUNK_LEN], NUM_INPUTS>::new();
+        for chunk in array_chunks(&input_buf).take(NUM_INPUTS) {
+            chunks.push(chunk);
+        }
+        let mut portable_chunks_out = [0; NUM_INPUTS * OUT_LEN];
+        portable::hash_many(
+            &chunks,
+            &TEST_KEY_WORDS,
+            counter,
+            IncrementCounter::Yes,
+            super::KEYED_HASH,
+            super::CHUNK_START,
+            super::CHUNK_END,
+            &mut portable_chunks_out,
+        );
+
+        let mut test_chunks_out = [0; NUM_INPUTS * OUT_LEN];
+        unsafe {
+            hash_many_chunks_fn(
+                &chunks[..],
+                &TEST_KEY_WORDS,
+                counter,
+                IncrementCounter::Yes,
+                super::KEYED_HASH,
+                super::CHUNK_START,
+                super::CHUNK_END,
+                &mut test_chunks_out,
+            );
+        }
+        for n in 0..NUM_INPUTS {
+            assert_eq!(
+                &portable_chunks_out[n * OUT_LEN..][..OUT_LEN],
+                &test_chunks_out[n * OUT_LEN..][..OUT_LEN]
+            );
+        }
+
+        // Then hash parents.
+        let mut parents = FixedVec::<&[u8; 2 * OUT_LEN], NUM_INPUTS>::new();
+        for parent in array_chunks(&input_buf).take(NUM_INPUTS) {
+            parents.push(parent);
+        }
+        let mut portable_parents_out = [0; NUM_INPUTS * OUT_LEN];
+        portable::hash_many(
+            &parents,
+            &TEST_KEY_WORDS,
+            counter,
+            IncrementCounter::No,
+            super::KEYED_HASH | super::PARENT,
+            0,
+            0,
+            &mut portable_parents_out,
+        );
+
+        let mut test_parents_out = [0; NUM_INPUTS * OUT_LEN];
+        unsafe {
+            hash_many_parents_fn(
+                &parents[..],
+                &TEST_KEY_WORDS,
+                counter,
+                IncrementCounter::No,
+                super::KEYED_HASH | super::PARENT,
+                0,
+                0,
+                &mut test_parents_out,
+            );
+        }
+        for n in 0..NUM_INPUTS {
+            assert_eq!(
+                &portable_parents_out[n * OUT_LEN..][..OUT_LEN],
+                &test_parents_out[n * OUT_LEN..][..OUT_LEN]
+            );
+        }
+    }
+}
+
+#[test]
+fn test_key_bytes_equal_key_words() {
+    assert_eq!(
+        TEST_KEY_WORDS,
+        super::platform::words_from_le_bytes_32(&TEST_KEY),
+    );
+}
+
+#[test]
+fn test_reference_impl_size() {
+    // Because the Rust compiler optimizes struct layout, it's possible that
+    // some future version of the compiler will produce a different size. If
+    // that happens, we can either disable this test, or test for multiple
+    // expected values. For now, the purpose of this test is to make sure we
+    // notice if that happens.
+    assert_eq!(1880, core::mem::size_of::<reference_impl::Hasher>());
+}
+
+#[test]
+fn test_counter_words() {
+    let counter: u64 = (1 << 32) + 2;
+    assert_eq!(super::counter_low(counter), 2);
+    assert_eq!(super::counter_high(counter), 1);
+}
+
+#[test]
+fn test_largest_power_of_two_leq() {
+    let input_output = &[
+        // The zero case is nonsensical, but it does work.
+        (0, 1),
+        (1, 1),
+        (2, 2),
+        (3, 2),
+        (4, 4),
+        (5, 4),
+        (6, 4),
+        (7, 4),
+        (8, 8),
+        // the largest possible usize
+        (usize::MAX, (usize::MAX >> 1) + 1),
+    ];
+    for &(input, output) in input_output {
+        assert_eq!(
+            output,
+            super::largest_power_of_two_leq(input),
+            "wrong output for n={}",
+            input
+        );
+    }
+}
+
+#[test]
+fn test_left_len() {
+    let input_output = &[
+        (CHUNK_LEN + 1, CHUNK_LEN),
+        (2 * CHUNK_LEN - 1, CHUNK_LEN),
+        (2 * CHUNK_LEN, CHUNK_LEN),
+        (2 * CHUNK_LEN + 1, 2 * CHUNK_LEN),
+        (4 * CHUNK_LEN - 1, 2 * CHUNK_LEN),
+        (4 * CHUNK_LEN, 2 * CHUNK_LEN),
+        (4 * CHUNK_LEN + 1, 4 * CHUNK_LEN),
+    ];
+    for &(input, output) in input_output {
+        assert_eq!(super::left_len(input), output);
+    }
+}
+
+#[test]
+fn test_compare_reference_impl() {
+    const OUT: usize = 303; // more than 64, not a multiple of 4
+    let mut input_buf = [0; TEST_CASES_MAX];
+    paint_test_input(&mut input_buf);
+    for &case in TEST_CASES {
+        let input = &input_buf[..case];
+
+        // regular
+        {
+            let mut reference_hasher = reference_impl::Hasher::new();
+            reference_hasher.update(input);
+            let mut expected_out = [0; OUT];
+            reference_hasher.finalize(&mut expected_out);
+
+            // all at once
+            let test_out = super::hash(input);
+            let out: &[u8; 32] = &expected_out[0..32].try_into().unwrap();
+            assert_eq!(test_out, *out);
+            // incremental
+            let mut hasher = super::Hasher::new();
+            hasher.update(input);
+            assert_eq!(hasher.finalize(), *&expected_out[0..32]);
+            assert_eq!(hasher.finalize(), test_out);
+            // xof
+            let mut extended = [0; OUT];
+            hasher.finalize_xof().fill(&mut extended);
+            assert_eq!(extended, expected_out);
+        }
+
+        // keyed
+        {
+            let mut reference_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY);
+            reference_hasher.update(input);
+            let mut expected_out = [0; OUT];
+            reference_hasher.finalize(&mut expected_out);
+
+            // all at once
+            let test_out = super::keyed_hash(&TEST_KEY, input);
+
assert_eq!(test_out, expected_out[0..32]); + // incremental + let mut hasher = super::Hasher::new_keyed(&TEST_KEY); + hasher.update(input); + assert_eq!(hasher.finalize(), expected_out[0..32]); + assert_eq!(hasher.finalize(), test_out); + // xof + let mut extended = [0; OUT]; + hasher.finalize_xof().fill(&mut extended); + assert_eq!(extended, expected_out); + } + + // derive_key + { + let context = "BLAKE3 2019-12-27 16:13:59 example context (not the test vector one)"; + let mut reference_hasher = reference_impl::Hasher::new_derive_key(context); + reference_hasher.update(input); + let mut expected_out = [0; OUT]; + reference_hasher.finalize(&mut expected_out); + + // all at once + let test_out = super::derive_key(context, input); + assert_eq!(test_out, expected_out[..32]); + // incremental + let mut hasher = super::Hasher::new_derive_key(context); + hasher.update(input); + assert_eq!(hasher.finalize(), expected_out[0..32]); + assert_eq!(hasher.finalize(), test_out[0..32]); + // xof + let mut extended = [0; OUT]; + hasher.finalize_xof().fill(&mut extended); + assert_eq!(extended, expected_out); + } + } +} + +fn reference_hash(input: &[u8]) -> super::Hash { + let mut hasher = reference_impl::Hasher::new(); + hasher.update(input); + let mut bytes = [0; 32]; + hasher.finalize(&mut bytes); + bytes.into() +} + +#[test] +fn test_compare_update_multiple() { + // Don't use all the long test cases here, since that's unnecessarily slow + // in debug mode. + let mut short_test_cases = TEST_CASES; + while *short_test_cases.last().unwrap() > 4 * CHUNK_LEN { + short_test_cases = &short_test_cases[..short_test_cases.len() - 1]; + } + assert_eq!(*short_test_cases.last().unwrap(), 4 * CHUNK_LEN); + + let mut input_buf = [0; 2 * TEST_CASES_MAX]; + paint_test_input(&mut input_buf); + + for &first_update in short_test_cases { + #[cfg(feature = "std")] + dbg!(first_update); + let first_input = &input_buf[..first_update]; + let mut test_hasher = super::Hasher::new(); + test_hasher.update(first_input); + + for &second_update in short_test_cases { + #[cfg(feature = "std")] + dbg!(second_update); + let second_input = &input_buf[first_update..][..second_update]; + let total_input = &input_buf[..first_update + second_update]; + + // Clone the hasher with first_update bytes already written, so + // that the next iteration can reuse it. + let mut test_hasher = test_hasher.clone(); + test_hasher.update(second_input); + let expected = reference_hash(total_input); + assert_eq!(expected, test_hasher.finalize()); + } + } +} + +#[test] +fn test_fuzz_hasher() { + const INPUT_MAX: usize = 4 * CHUNK_LEN; + let mut input_buf = [0; 3 * INPUT_MAX]; + paint_test_input(&mut input_buf); + + // Don't do too many iterations in debug mode, to keep the tests under a + // second or so. CI should run tests in release mode also. Provide an + // environment variable for specifying a larger number of fuzz iterations. + let num_tests = if cfg!(debug_assertions) { 100 } else { 10_000 }; + + // Use a fixed RNG seed for reproducibility. + let mut rng = Pcg64::new(); + for _num_test in 0..num_tests { + #[cfg(feature = "std")] + dbg!(_num_test); + let mut hasher = super::Hasher::new(); + let mut total_input = 0; + // For each test, write 3 inputs of random length. 
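The invariant this fuzz loop exercises is that splitting the input across several update calls never changes the final hash. As a stand-alone sketch of the same property, written against the upstream blake3 crate (assumed here to expose the same public API as this vendored module):

// Sketch: split updates must agree with one-shot hashing (upstream `blake3` crate).
fn split_updates_match_one_shot(a: &[u8], b: &[u8]) {
    let mut hasher = blake3::Hasher::new();
    hasher.update(a);
    hasher.update(b);
    let mut whole = Vec::with_capacity(a.len() + b.len());
    whole.extend_from_slice(a);
    whole.extend_from_slice(b);
    assert_eq!(hasher.finalize(), blake3::hash(&whole));
}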
+ for _ in 0..3 { + let input_len = rng.next_bound_u64((INPUT_MAX + 1) as u64) as usize; + let input = &input_buf[total_input..][..input_len]; + hasher.update(input); + total_input += input_len; + } + let expected = reference_hash(&input_buf[..total_input]); + assert_eq!(expected, hasher.finalize()); + } +} + +#[test] +fn test_xof_seek() { + let mut out = [0; 533]; + let mut hasher = super::Hasher::new(); + hasher.update(b"foo"); + hasher.finalize_xof().fill(&mut out); + assert_eq!(hasher.finalize().as_bytes(), &out[0..32]); + + let mut reader = hasher.finalize_xof(); + reader.set_position(303); + let mut out2 = [0; 102]; + reader.fill(&mut out2); + assert_eq!(&out[303..][..102], &out2[..]); +} + +#[test] +fn test_msg_schdule_permutation() { + let permutation = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8]; + + let mut generated = [[0; 16]; 7]; + generated[0] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; + + for round in 1..7 { + for i in 0..16 { + generated[round][i] = generated[round - 1][permutation[i]]; + } + } + + assert_eq!(generated, super::MSG_SCHEDULE); +} + +#[test] +fn test_reset() { + let mut hasher = super::Hasher::new(); + hasher.update(&[42; 3 * CHUNK_LEN + 7]); + hasher.reset(); + hasher.update(&[42; CHUNK_LEN + 3]); + assert_eq!(hasher.finalize(), super::hash(&[42; CHUNK_LEN + 3])); + + let key = &[99; super::KEY_LEN]; + let mut keyed_hasher = super::Hasher::new_keyed(key); + keyed_hasher.update(&[42; 3 * CHUNK_LEN + 7]); + keyed_hasher.reset(); + keyed_hasher.update(&[42; CHUNK_LEN + 3]); + assert_eq!( + keyed_hasher.finalize(), + super::keyed_hash(key, &[42; CHUNK_LEN + 3]), + ); + + let context = "BLAKE3 2020-02-12 10:20:58 reset test"; + let mut kdf = super::Hasher::new_derive_key(context); + kdf.update(&[42; 3 * CHUNK_LEN + 7]); + kdf.reset(); + kdf.update(&[42; CHUNK_LEN + 3]); + let expected = super::derive_key(context, &[42; CHUNK_LEN + 3]); + assert_eq!(kdf.finalize(), expected); +} + +// This test is a mimized failure case for the Windows SSE2 bug described in +// https://github.com/BLAKE3-team/BLAKE3/issues/206. +// +// Before that issue was fixed, this test would fail on Windows in the following configuration: +// +// cargo test --features=no_avx512,no_avx2,no_sse41 --release +// +// Bugs like this one (stomping on a caller's register) are very sensitive to the details of +// surrounding code, so it's not especially likely that this test will catch another bug (or even +// the same bug) in the future. Still, there's no harm in keeping it. +#[test] +fn test_issue_206_windows_sse2() { + // This stupid loop has to be here to trigger the bug. I don't know why. + for _ in &[0] { + // The length 65 (two blocks) is significant. It doesn't repro with 64 (one block). It also + // doesn't repro with an all-zero input. + let input = &[0xff; 65]; + let expected_hash = [ + 183, 235, 50, 217, 156, 24, 190, 219, 2, 216, 176, 255, 224, 53, 28, 95, 57, 148, 179, + 245, 162, 90, 37, 121, 0, 142, 219, 62, 234, 204, 225, 161, + ]; + + // This throwaway call has to be here to trigger the bug. + super::Hasher::new().update(input); + + // This assert fails when the bug is triggered. + assert_eq!(super::Hasher::new().update(input).finalize(), expected_hash); + } +} diff --git a/narcissus-core/src/lib.rs b/narcissus-core/src/lib.rs index 2b4380c..a117e75 100644 --- a/narcissus-core/src/lib.rs +++ b/narcissus-core/src/lib.rs @@ -1,5 +1,6 @@ mod arena; mod bitset; +pub mod blake3; mod fixed_vec; mod libc; pub mod manual_arc; -- 2.49.0
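With `pub mod blake3` exported above, downstream crates in the workspace can hash through the vendored implementation. A minimal usage sketch, assuming the `hash` function and `Hasher` type exercised by the tests are public items of the module:

// Sketch: one-shot and incremental hashing through the newly exported module
// (item visibility assumed; adjust paths if the module's re-exports differ).
use narcissus_core::blake3;

fn digest_example() -> bool {
    let one_shot = blake3::hash(b"hello narcissus");

    let mut hasher = blake3::Hasher::new();
    hasher.update(b"hello ");
    hasher.update(b"narcissus");
    hasher.finalize() == one_shot
}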