From 20538616f22844ead97a78c51cbaf24134c75e59 Mon Sep 17 00:00:00 2001 From: Joshua Simmons Date: Sun, 19 Feb 2023 15:17:21 +0100 Subject: [PATCH] Remove blake3, F to pay respekts --- narcissus-core/src/blake3/LICENSE | 330 ----- narcissus-core/src/blake3/constant_time_eq.rs | 153 --- narcissus-core/src/blake3/guts.rs | 103 -- narcissus-core/src/blake3/join.rs | 62 - narcissus-core/src/blake3/mod.rs | 1222 ----------------- narcissus-core/src/blake3/platform.rs | 296 ---- narcissus-core/src/blake3/portable.rs | 195 --- narcissus-core/src/blake3/reference_impl.rs | 384 ------ narcissus-core/src/blake3/rust_avx2.rs | 477 ------- narcissus-core/src/blake3/rust_sse2.rs | 774 ----------- narcissus-core/src/blake3/rust_sse41.rs | 762 ---------- narcissus-core/src/blake3/test.rs | 507 ------- narcissus-core/src/lib.rs | 1 - 13 files changed, 5266 deletions(-) delete mode 100644 narcissus-core/src/blake3/LICENSE delete mode 100644 narcissus-core/src/blake3/constant_time_eq.rs delete mode 100644 narcissus-core/src/blake3/guts.rs delete mode 100644 narcissus-core/src/blake3/join.rs delete mode 100644 narcissus-core/src/blake3/mod.rs delete mode 100644 narcissus-core/src/blake3/platform.rs delete mode 100644 narcissus-core/src/blake3/portable.rs delete mode 100644 narcissus-core/src/blake3/reference_impl.rs delete mode 100644 narcissus-core/src/blake3/rust_avx2.rs delete mode 100644 narcissus-core/src/blake3/rust_sse2.rs delete mode 100644 narcissus-core/src/blake3/rust_sse41.rs delete mode 100644 narcissus-core/src/blake3/test.rs diff --git a/narcissus-core/src/blake3/LICENSE b/narcissus-core/src/blake3/LICENSE deleted file mode 100644 index 4d38c4b..0000000 --- a/narcissus-core/src/blake3/LICENSE +++ /dev/null @@ -1,330 +0,0 @@ -This work is released into the public domain with CC0 1.0. Alternatively, it is -licensed under the Apache License 2.0. - -------------------------------------------------------------------------------- - -Creative Commons Legal Code - -CC0 1.0 Universal - - CREATIVE COMMONS CORPORATION IS NOT A LAW FIRM AND DOES NOT PROVIDE - LEGAL SERVICES. DISTRIBUTION OF THIS DOCUMENT DOES NOT CREATE AN - ATTORNEY-CLIENT RELATIONSHIP. CREATIVE COMMONS PROVIDES THIS - INFORMATION ON AN "AS-IS" BASIS. CREATIVE COMMONS MAKES NO WARRANTIES - REGARDING THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS - PROVIDED HEREUNDER, AND DISCLAIMS LIABILITY FOR DAMAGES RESULTING FROM - THE USE OF THIS DOCUMENT OR THE INFORMATION OR WORKS PROVIDED - HEREUNDER. - -Statement of Purpose - -The laws of most jurisdictions throughout the world automatically confer -exclusive Copyright and Related Rights (defined below) upon the creator -and subsequent owner(s) (each and all, an "owner") of an original work of -authorship and/or a database (each, a "Work"). - -Certain owners wish to permanently relinquish those rights to a Work for -the purpose of contributing to a commons of creative, cultural and -scientific works ("Commons") that the public can reliably and without fear -of later claims of infringement build upon, modify, incorporate in other -works, reuse and redistribute as freely as possible in any form whatsoever -and for any purposes, including without limitation commercial purposes. -These owners may contribute to the Commons to promote the ideal of a free -culture and the further production of creative, cultural and scientific -works, or to gain reputation or greater distribution for their Work in -part through the use and efforts of others. 
- -For these and/or other purposes and motivations, and without any -expectation of additional consideration or compensation, the person -associating CC0 with a Work (the "Affirmer"), to the extent that he or she -is an owner of Copyright and Related Rights in the Work, voluntarily -elects to apply CC0 to the Work and publicly distribute the Work under its -terms, with knowledge of his or her Copyright and Related Rights in the -Work and the meaning and intended legal effect of CC0 on those rights. - -1. Copyright and Related Rights. A Work made available under CC0 may be -protected by copyright and related or neighboring rights ("Copyright and -Related Rights"). Copyright and Related Rights include, but are not -limited to, the following: - - i. the right to reproduce, adapt, distribute, perform, display, - communicate, and translate a Work; - ii. moral rights retained by the original author(s) and/or performer(s); -iii. publicity and privacy rights pertaining to a person's image or - likeness depicted in a Work; - iv. rights protecting against unfair competition in regards to a Work, - subject to the limitations in paragraph 4(a), below; - v. rights protecting the extraction, dissemination, use and reuse of data - in a Work; - vi. database rights (such as those arising under Directive 96/9/EC of the - European Parliament and of the Council of 11 March 1996 on the legal - protection of databases, and under any national implementation - thereof, including any amended or successor version of such - directive); and -vii. other similar, equivalent or corresponding rights throughout the - world based on applicable law or treaty, and any national - implementations thereof. - -2. Waiver. To the greatest extent permitted by, but not in contravention -of, applicable law, Affirmer hereby overtly, fully, permanently, -irrevocably and unconditionally waives, abandons, and surrenders all of -Affirmer's Copyright and Related Rights and associated claims and causes -of action, whether now known or unknown (including existing as well as -future claims and causes of action), in the Work (i) in all territories -worldwide, (ii) for the maximum duration provided by applicable law or -treaty (including future time extensions), (iii) in any current or future -medium and for any number of copies, and (iv) for any purpose whatsoever, -including without limitation commercial, advertising or promotional -purposes (the "Waiver"). Affirmer makes the Waiver for the benefit of each -member of the public at large and to the detriment of Affirmer's heirs and -successors, fully intending that such Waiver shall not be subject to -revocation, rescission, cancellation, termination, or any other legal or -equitable action to disrupt the quiet enjoyment of the Work by the public -as contemplated by Affirmer's express Statement of Purpose. - -3. Public License Fallback. Should any part of the Waiver for any reason -be judged legally invalid or ineffective under applicable law, then the -Waiver shall be preserved to the maximum extent permitted taking into -account Affirmer's express Statement of Purpose. 
In addition, to the -extent the Waiver is so judged Affirmer hereby grants to each affected -person a royalty-free, non transferable, non sublicensable, non exclusive, -irrevocable and unconditional license to exercise Affirmer's Copyright and -Related Rights in the Work (i) in all territories worldwide, (ii) for the -maximum duration provided by applicable law or treaty (including future -time extensions), (iii) in any current or future medium and for any number -of copies, and (iv) for any purpose whatsoever, including without -limitation commercial, advertising or promotional purposes (the -"License"). The License shall be deemed effective as of the date CC0 was -applied by Affirmer to the Work. Should any part of the License for any -reason be judged legally invalid or ineffective under applicable law, such -partial invalidity or ineffectiveness shall not invalidate the remainder -of the License, and in such case Affirmer hereby affirms that he or she -will not (i) exercise any of his or her remaining Copyright and Related -Rights in the Work or (ii) assert any associated claims and causes of -action with respect to the Work, in either case contrary to Affirmer's -express Statement of Purpose. - -4. Limitations and Disclaimers. - - a. No trademark or patent rights held by Affirmer are waived, abandoned, - surrendered, licensed or otherwise affected by this document. - b. Affirmer offers the Work as-is and makes no representations or - warranties of any kind concerning the Work, express, implied, - statutory or otherwise, including without limitation warranties of - title, merchantability, fitness for a particular purpose, non - infringement, or the absence of latent or other defects, accuracy, or - the present or absence of errors, whether or not discoverable, all to - the greatest extent permissible under applicable law. - c. Affirmer disclaims responsibility for clearing rights of other persons - that may apply to the Work or any use thereof, including without - limitation any person's Copyright and Related Rights in the Work. - Further, Affirmer disclaims responsibility for obtaining any necessary - consents, permissions or other rights required for any use of the - Work. - d. Affirmer understands and acknowledges that Creative Commons is not a - party to this document and has no duty or obligation with respect to - this CC0 or use of the Work. - -------------------------------------------------------------------------------- - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. 
- - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. 
If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. 
Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "[]" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright 2019 Jack O'Connor and Samuel Neves - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
\ No newline at end of file diff --git a/narcissus-core/src/blake3/constant_time_eq.rs b/narcissus-core/src/blake3/constant_time_eq.rs deleted file mode 100644 index bb2f308..0000000 --- a/narcissus-core/src/blake3/constant_time_eq.rs +++ /dev/null @@ -1,153 +0,0 @@ -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[inline] -fn optimizer_hide(mut value: u8) -> u8 { - // SAFETY: the input value is passed unchanged to the output, the inline assembly does nothing. - unsafe { - core::arch::asm!("/* {0} */", inout(reg_byte) value, options(pure, nomem, nostack, preserves_flags)); - value - } -} - -#[cfg(any( - target_arch = "arm", - target_arch = "aarch64", - target_arch = "riscv32", - target_arch = "riscv64" -))] -#[allow(asm_sub_register)] -#[inline] -fn optimizer_hide(mut value: u8) -> u8 { - // SAFETY: the input value is passed unchanged to the output, the inline assembly does nothing. - unsafe { - core::arch::asm!("/* {0} */", inout(reg) value, options(pure, nomem, nostack, preserves_flags)); - value - } -} - -#[cfg(not(any( - target_arch = "x86", - target_arch = "x86_64", - target_arch = "arm", - target_arch = "aarch64", - target_arch = "riscv32", - target_arch = "riscv64" -)))] -#[inline(never)] // This function is non-inline to prevent the optimizer from looking inside it. -fn optimizer_hide(value: u8) -> u8 { - // SAFETY: the result of casting a reference to a pointer is valid; the type is Copy. - unsafe { core::ptr::read_volatile(&value) } -} - -#[inline] -fn constant_time_ne(a: &[u8], b: &[u8]) -> u8 { - assert!(a.len() == b.len()); - - // These useless slices make the optimizer elide the bounds checks. - // See the comment in clone_from_slice() added on Rust commit 6a7bc47. - let len = a.len(); - let a = &a[..len]; - let b = &b[..len]; - - let mut tmp = 0; - for i in 0..len { - tmp |= a[i] ^ b[i]; - } - - // The compare with 0 must happen outside this function. - optimizer_hide(tmp) -} - -/// Compares two equal-sized byte strings in constant time. -/// -/// # Examples -/// -/// ``` -/// use narcissus_core::blake3::constant_time_eq::constant_time_eq; -/// -/// assert!(constant_time_eq(b"foo", b"foo")); -/// assert!(!constant_time_eq(b"foo", b"bar")); -/// assert!(!constant_time_eq(b"bar", b"baz")); -/// # assert!(constant_time_eq(b"", b"")); -/// -/// // Not equal-sized, so won't take constant time. -/// assert!(!constant_time_eq(b"foo", b"")); -/// assert!(!constant_time_eq(b"foo", b"quux")); -/// ``` -pub fn constant_time_eq(a: &[u8], b: &[u8]) -> bool { - a.len() == b.len() && constant_time_ne(a, b) == 0 -} - -// Fixed-size array variant. - -#[inline] -fn constant_time_ne_n(a: &[u8; N], b: &[u8; N]) -> u8 { - let mut tmp = 0; - for i in 0..N { - tmp |= a[i] ^ b[i]; - } - - // The compare with 0 must happen outside this function. - optimizer_hide(tmp) -} - -/// Compares two fixed-size byte strings in constant time. -/// -/// # Examples -/// -/// ``` -/// use narcissus_core::blake3::constant_time_eq::constant_time_eq_n; -/// -/// assert!(constant_time_eq_n(&[3; 20], &[3; 20])); -/// assert!(!constant_time_eq_n(&[3; 20], &[7; 20])); -/// ``` -#[inline] -pub fn constant_time_eq_n(a: &[u8; N], b: &[u8; N]) -> bool { - constant_time_ne_n(a, b) == 0 -} - -// Fixed-size variants for the most common sizes. - -/// Compares two 128-bit byte strings in constant time. 
-/// -/// # Examples -/// -/// ``` -/// use narcissus_core::blake3::constant_time_eq::constant_time_eq_16; -/// -/// assert!(constant_time_eq_16(&[3; 16], &[3; 16])); -/// assert!(!constant_time_eq_16(&[3; 16], &[7; 16])); -/// ``` -#[inline] -pub fn constant_time_eq_16(a: &[u8; 16], b: &[u8; 16]) -> bool { - constant_time_eq_n(a, b) -} - -/// Compares two 256-bit byte strings in constant time. -/// -/// # Examples -/// -/// ``` -/// use narcissus_core::blake3::constant_time_eq::constant_time_eq_32; -/// -/// assert!(constant_time_eq_32(&[3; 32], &[3; 32])); -/// assert!(!constant_time_eq_32(&[3; 32], &[7; 32])); -/// ``` -#[inline] -pub fn constant_time_eq_32(a: &[u8; 32], b: &[u8; 32]) -> bool { - constant_time_eq_n(a, b) -} - -/// Compares two 512-bit byte strings in constant time. -/// -/// # Examples -/// -/// ``` -/// use narcissus_core::blake3::constant_time_eq::constant_time_eq_64; -/// -/// assert!(constant_time_eq_64(&[3; 64], &[3; 64])); -/// assert!(!constant_time_eq_64(&[3; 64], &[7; 64])); -/// ``` -#[inline] -pub fn constant_time_eq_64(a: &[u8; 64], b: &[u8; 64]) -> bool { - constant_time_eq_n(a, b) -} diff --git a/narcissus-core/src/blake3/guts.rs b/narcissus-core/src/blake3/guts.rs deleted file mode 100644 index 939287a..0000000 --- a/narcissus-core/src/blake3/guts.rs +++ /dev/null @@ -1,103 +0,0 @@ -//! This undocumented and unstable module is for use cases like the `bao` crate, -//! which need to traverse the BLAKE3 Merkle tree and work with chunk and parent -//! chaining values directly. There might be breaking changes to this module -//! between patch versions. -//! -//! We could stabilize something like this module in the future. If you have a -//! use case for it, please let us know by filing a GitHub issue. - -pub const BLOCK_LEN: usize = 64; -pub const CHUNK_LEN: usize = 1024; - -#[derive(Clone, Debug)] -pub struct ChunkState(super::ChunkState); - -impl ChunkState { - // Currently this type only supports the regular hash mode. If an - // incremental user needs keyed_hash or derive_key, we can add that. - pub fn new(chunk_counter: u64) -> Self { - Self(super::ChunkState::new( - super::IV, - chunk_counter, - 0, - super::platform::Platform::detect(), - )) - } - - #[inline] - pub fn len(&self) -> usize { - self.0.len() - } - - #[inline] - pub fn update(&mut self, input: &[u8]) -> &mut Self { - self.0.update(input); - self - } - - pub fn finalize(&self, is_root: bool) -> super::Hash { - let output = self.0.output(); - if is_root { - output.root_hash() - } else { - output.chaining_value().into() - } - } -} - -// As above, this currently assumes the regular hash mode. If an incremental -// user needs keyed_hash or derive_key, we can add that. 
-pub fn parent_cv( - left_child: &super::Hash, - right_child: &super::Hash, - is_root: bool, -) -> super::Hash { - let output = super::parent_node_output( - left_child.as_bytes(), - right_child.as_bytes(), - super::IV, - 0, - super::platform::Platform::detect(), - ); - if is_root { - output.root_hash() - } else { - output.chaining_value().into() - } -} - -#[cfg(test)] -mod test { - use crate::blake3::{hash, Hasher}; - - use super::*; - - #[test] - fn test_chunk() { - assert_eq!( - hash(b"foo"), - ChunkState::new(0).update(b"foo").finalize(true) - ); - } - - #[test] - fn test_parents() { - let mut hasher = Hasher::new(); - let mut buf = [0; super::CHUNK_LEN]; - - buf[0] = 'a' as u8; - hasher.update(&buf); - let chunk0_cv = ChunkState::new(0).update(&buf).finalize(false); - - buf[0] = 'b' as u8; - hasher.update(&buf); - let chunk1_cv = ChunkState::new(1).update(&buf).finalize(false); - - hasher.update(b"c"); - let chunk2_cv = ChunkState::new(2).update(b"c").finalize(false); - - let parent = parent_cv(&chunk0_cv, &chunk1_cv, false); - let root = parent_cv(&parent, &chunk2_cv, true); - assert_eq!(hasher.finalize(), root); - } -} diff --git a/narcissus-core/src/blake3/join.rs b/narcissus-core/src/blake3/join.rs deleted file mode 100644 index aa17159..0000000 --- a/narcissus-core/src/blake3/join.rs +++ /dev/null @@ -1,62 +0,0 @@ -//! The multi-threading abstractions used by `Hasher::update_with_join`. -//! -//! Different implementations of the `Join` trait determine whether -//! `Hasher::update_with_join` performs multi-threading on sufficiently large -//! inputs. The `SerialJoin` implementation is single-threaded, and the -//! `RayonJoin` implementation (gated by the `rayon` feature) is multi-threaded. -//! Interfaces other than `Hasher::update_with_join`, like [`hash`](crate::hash) -//! and [`Hasher::update`](crate::Hasher::update), always use `SerialJoin` -//! internally. -//! -//! The `Join` trait is an almost exact copy of the [`rayon::join`] API, and -//! `RayonJoin` is the only non-trivial implementation. Previously this trait -//! was public, but currently it's been re-privatized, as it's both 1) of no -//! value to most callers and 2) a pretty big implementation detail to commit -//! to. -//! -//! [`rayon::join`]: https://docs.rs/rayon/1.3.0/rayon/fn.join.html - -/// The trait that abstracts over single-threaded and multi-threaded recursion. -/// -/// See the [`join` module docs](index.html) for more details. -pub trait Join { - fn join(oper_a: A, oper_b: B) -> (RA, RB) - where - A: FnOnce() -> RA + Send, - B: FnOnce() -> RB + Send, - RA: Send, - RB: Send; -} - -/// The trivial, serial implementation of `Join`. The left and right sides are -/// executed one after the other, on the calling thread. The standalone hashing -/// functions and the `Hasher::update` method use this implementation -/// internally. -/// -/// See the [`join` module docs](index.html) for more details. 
-pub enum SerialJoin {} - -impl Join for SerialJoin { - #[inline] - fn join(oper_a: A, oper_b: B) -> (RA, RB) - where - A: FnOnce() -> RA + Send, - B: FnOnce() -> RB + Send, - RA: Send, - RB: Send, - { - (oper_a(), oper_b()) - } -} - -#[cfg(test)] -mod test { - use super::*; - - #[test] - fn test_serial_join() { - let oper_a = || 1 + 1; - let oper_b = || 2 + 2; - assert_eq!((2, 4), SerialJoin::join(oper_a, oper_b)); - } -} diff --git a/narcissus-core/src/blake3/mod.rs b/narcissus-core/src/blake3/mod.rs deleted file mode 100644 index 35a85fb..0000000 --- a/narcissus-core/src/blake3/mod.rs +++ /dev/null @@ -1,1222 +0,0 @@ -//! The official Rust implementation of the [BLAKE3] cryptographic hash -//! function. -//! -//! # Examples -//! -//! ``` -//! # fn main() -> Result<(), Box> { -//! // Hash an input all at once. -//! let hash1 = narcissus_core::blake3::hash(b"foobarbaz"); -//! -//! // Hash an input incrementally. -//! let mut hasher = narcissus_core::blake3::Hasher::new(); -//! hasher.update(b"foo"); -//! hasher.update(b"bar"); -//! hasher.update(b"baz"); -//! let hash2 = hasher.finalize(); -//! assert_eq!(hash1, hash2); -//! # Ok(()) -//! # } -//! ``` -//! -//! [BLAKE3]: https://blake3.io -//! [docs.rs]: https://docs.rs/ -//! [`Write`]: https://doc.rust-lang.org/std/io/trait.Write.html -//! [`Seek`]: https://doc.rust-lang.org/std/io/trait.Seek.html - -#[cfg(test)] -pub(crate) mod test; - -#[cfg(test)] -pub(crate) mod reference_impl; - -// The guts module is for incremental use cases like the `bao` crate that need -// to explicitly compute chunk and parent chaining values. It is semi-stable -// and likely to keep working, but largely undocumented and not intended for -// widespread use. -#[doc(hidden)] -pub mod guts; - -/// Undocumented and unstable, for benchmarks only. -#[doc(hidden)] -pub mod platform; - -mod portable; - -// Platform-specific implementations of the compression function. These -// BLAKE3-specific cfg flags are set in build.rs. -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[path = "rust_avx2.rs"] -mod avx2; -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[path = "rust_sse2.rs"] -mod sse2; -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[path = "rust_sse41.rs"] -mod sse41; - -mod join; - -use core::cmp; -use core::fmt; -use platform::{Platform, MAX_SIMD_DEGREE, MAX_SIMD_DEGREE_OR_2}; - -/// The number of bytes in a [`Hash`](struct.Hash.html), 32. -pub const OUT_LEN: usize = 32; - -/// The number of bytes in a key, 32. -pub const KEY_LEN: usize = 32; - -const MAX_DEPTH: usize = 54; // 2^54 * CHUNK_LEN = 2^64 -use guts::{BLOCK_LEN, CHUNK_LEN}; - -pub mod constant_time_eq; - -use crate::slice::array_chunks; -use crate::slice::split_array_ref; -use crate::FixedVec; - -// While iterating the compression function within a chunk, the CV is -// represented as words, to avoid doing two extra endianness conversions for -// each compression in the portable implementation. But the hash_many interface -// needs to hash both input bytes and parent nodes, so its better for its -// output CVs to be represented as bytes. 
-type CVWords = [u32; 8]; -type CVBytes = [u8; 32]; // little-endian - -const IV: &CVWords = &[ - 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, -]; - -const MSG_SCHEDULE: [[usize; 16]; 7] = [ - [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15], - [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8], - [3, 4, 10, 12, 13, 2, 7, 14, 6, 5, 9, 0, 11, 15, 8, 1], - [10, 7, 12, 9, 14, 3, 13, 15, 4, 0, 11, 2, 5, 8, 1, 6], - [12, 13, 9, 11, 15, 10, 14, 8, 7, 2, 5, 3, 0, 1, 6, 4], - [9, 14, 11, 5, 8, 12, 15, 1, 13, 3, 0, 10, 2, 6, 4, 7], - [11, 15, 5, 0, 1, 9, 8, 6, 14, 10, 2, 12, 3, 4, 7, 13], -]; - -// These are the internal flags that we use to domain separate root/non-root, -// chunk/parent, and chunk beginning/middle/end. These get set at the high end -// of the block flags word in the compression function, so their values start -// high and go down. -const CHUNK_START: u8 = 1 << 0; -const CHUNK_END: u8 = 1 << 1; -const PARENT: u8 = 1 << 2; -const ROOT: u8 = 1 << 3; -const KEYED_HASH: u8 = 1 << 4; -const DERIVE_KEY_CONTEXT: u8 = 1 << 5; -const DERIVE_KEY_MATERIAL: u8 = 1 << 6; - -#[inline] -fn counter_low(counter: u64) -> u32 { - counter as u32 -} - -#[inline] -fn counter_high(counter: u64) -> u32 { - (counter >> 32) as u32 -} - -/// An output of the default size, 32 bytes, which provides constant-time -/// equality checking. -/// -/// `Hash` implements [`From`] and [`Into`] for `[u8; 32]`, and it provides an -/// explicit [`as_bytes`] method returning `&[u8; 32]`. However, byte arrays -/// and slices don't provide constant-time equality checking, which is often a -/// security requirement in software that handles private data. `Hash` doesn't -/// implement [`Deref`] or [`AsRef`], to avoid situations where a type -/// conversion happens implicitly and the constant-time property is -/// accidentally lost. -/// -/// [`From`]: https://doc.rust-lang.org/std/convert/trait.From.html -/// [`Into`]: https://doc.rust-lang.org/std/convert/trait.Into.html -/// [`as_bytes`]: #method.as_bytes -/// [`Deref`]: https://doc.rust-lang.org/stable/std/ops/trait.Deref.html -/// [`AsRef`]: https://doc.rust-lang.org/std/convert/trait.AsRef.html -#[derive(Clone, Copy)] -pub struct Hash([u8; OUT_LEN]); - -impl Hash { - /// The raw bytes of the `Hash`. Note that byte arrays don't provide - /// constant-time equality checking, so if you need to compare hashes, - /// prefer the `Hash` type. - #[inline] - pub fn as_bytes(&self) -> &[u8; OUT_LEN] { - &self.0 - } -} - -impl From<[u8; OUT_LEN]> for Hash { - #[inline] - fn from(bytes: [u8; OUT_LEN]) -> Self { - Self(bytes) - } -} - -impl From for [u8; OUT_LEN] { - #[inline] - fn from(hash: Hash) -> Self { - hash.0 - } -} - -/// This implementation is constant-time. -impl PartialEq for Hash { - #[inline] - fn eq(&self, other: &Hash) -> bool { - constant_time_eq::constant_time_eq_32(&self.0, &other.0) - } -} - -impl std::hash::Hash for Hash { - fn hash(&self, state: &mut H) { - self.0.hash(state); - } -} - -/// This implementation is constant-time. -impl PartialEq<[u8; OUT_LEN]> for Hash { - #[inline] - fn eq(&self, other: &[u8; OUT_LEN]) -> bool { - constant_time_eq::constant_time_eq_32(&self.0, other) - } -} - -/// This implementation is constant-time if the target is 32 bytes long. 
-impl PartialEq<[u8]> for Hash { - #[inline] - fn eq(&self, other: &[u8]) -> bool { - constant_time_eq::constant_time_eq(&self.0, other) - } -} - -impl Eq for Hash {} - -impl fmt::Debug for Hash { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_tuple("Hash").field(&self.0).finish() - } -} - -// Each chunk or parent node can produce either a 32-byte chaining value or, by -// setting the ROOT flag, any number of final output bytes. The Output struct -// captures the state just prior to choosing between those two possibilities. -#[derive(Clone)] -struct Output { - input_chaining_value: CVWords, - block: [u8; 64], - block_len: u8, - counter: u64, - flags: u8, - platform: Platform, -} - -impl Output { - fn chaining_value(&self) -> CVBytes { - let mut cv = self.input_chaining_value; - self.platform.compress_in_place( - &mut cv, - &self.block, - self.block_len, - self.counter, - self.flags, - ); - platform::le_bytes_from_words_32(&cv) - } - - fn root_hash(&self) -> Hash { - debug_assert_eq!(self.counter, 0); - let mut cv = self.input_chaining_value; - self.platform - .compress_in_place(&mut cv, &self.block, self.block_len, 0, self.flags | ROOT); - Hash(platform::le_bytes_from_words_32(&cv)) - } - - fn root_output_block(&self) -> [u8; 2 * OUT_LEN] { - self.platform.compress_xof( - &self.input_chaining_value, - &self.block, - self.block_len, - self.counter, - self.flags | ROOT, - ) - } -} - -#[derive(Clone)] -struct ChunkState { - cv: CVWords, - chunk_counter: u64, - buf: [u8; BLOCK_LEN], - buf_len: u8, - blocks_compressed: u8, - flags: u8, - platform: Platform, -} - -impl ChunkState { - fn new(key: &CVWords, chunk_counter: u64, flags: u8, platform: Platform) -> Self { - Self { - cv: *key, - chunk_counter, - buf: [0; BLOCK_LEN], - buf_len: 0, - blocks_compressed: 0, - flags, - platform, - } - } - - fn len(&self) -> usize { - BLOCK_LEN * self.blocks_compressed as usize + self.buf_len as usize - } - - fn fill_buf(&mut self, input: &mut &[u8]) { - let want = BLOCK_LEN - self.buf_len as usize; - let take = cmp::min(want, input.len()); - self.buf[self.buf_len as usize..][..take].copy_from_slice(&input[..take]); - self.buf_len += take as u8; - *input = &input[take..]; - } - - fn start_flag(&self) -> u8 { - if self.blocks_compressed == 0 { - CHUNK_START - } else { - 0 - } - } - - // Try to avoid buffering as much as possible, by compressing directly from - // the input slice when full blocks are available. 
- fn update(&mut self, mut input: &[u8]) -> &mut Self { - if self.buf_len > 0 { - self.fill_buf(&mut input); - if !input.is_empty() { - debug_assert_eq!(self.buf_len as usize, BLOCK_LEN); - let block_flags = self.flags | self.start_flag(); // borrowck - self.platform.compress_in_place( - &mut self.cv, - &self.buf, - BLOCK_LEN as u8, - self.chunk_counter, - block_flags, - ); - self.buf_len = 0; - self.buf = [0; BLOCK_LEN]; - self.blocks_compressed += 1; - } - } - - while input.len() > BLOCK_LEN { - debug_assert_eq!(self.buf_len, 0); - let block_flags = self.flags | self.start_flag(); // borrowck - let (block, rem) = split_array_ref(input); - self.platform.compress_in_place( - &mut self.cv, - block, - BLOCK_LEN as u8, - self.chunk_counter, - block_flags, - ); - self.blocks_compressed += 1; - input = rem; - } - - self.fill_buf(&mut input); - debug_assert!(input.is_empty()); - debug_assert!(self.len() <= CHUNK_LEN); - self - } - - fn output(&self) -> Output { - let block_flags = self.flags | self.start_flag() | CHUNK_END; - Output { - input_chaining_value: self.cv, - block: self.buf, - block_len: self.buf_len, - counter: self.chunk_counter, - flags: block_flags, - platform: self.platform, - } - } -} - -// Don't derive(Debug), because the state may be secret. -impl fmt::Debug for ChunkState { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_struct("ChunkState") - .field("len", &self.len()) - .field("chunk_counter", &self.chunk_counter) - .field("flags", &self.flags) - .field("platform", &self.platform) - .finish() - } -} - -// IMPLEMENTATION NOTE -// =================== -// The recursive function compress_subtree_wide(), implemented below, is the -// basis of high-performance BLAKE3. We use it both for all-at-once hashing, -// and for the incremental input with Hasher (though we have to be careful with -// subtree boundaries in the incremental case). compress_subtree_wide() applies -// several optimizations at the same time: -// - Multithreading with Rayon. -// - Parallel chunk hashing with SIMD. -// - Parallel parent hashing with SIMD. Note that while SIMD chunk hashing -// maxes out at MAX_SIMD_DEGREE*CHUNK_LEN, parallel parent hashing continues -// to benefit from larger inputs, because more levels of the tree benefit can -// use full-width SIMD vectors for parent hashing. Without parallel parent -// hashing, we lose about 10% of overall throughput on AVX2 and AVX-512. - -/// Undocumented and unstable, for benchmarks only. -#[doc(hidden)] -#[derive(Clone, Copy)] -pub enum IncrementCounter { - Yes, - No, -} - -impl IncrementCounter { - #[inline] - fn yes(&self) -> bool { - match self { - IncrementCounter::Yes => true, - IncrementCounter::No => false, - } - } -} - -// The largest power of two less than or equal to `n`, used for left_len() -// immediately below, and also directly in Hasher::update(). -fn largest_power_of_two_leq(n: usize) -> usize { - ((n / 2) + 1).next_power_of_two() -} - -// Given some input larger than one chunk, return the number of bytes that -// should go in the left subtree. This is the largest power-of-2 number of -// chunks that leaves at least 1 byte for the right subtree. -fn left_len(content_len: usize) -> usize { - debug_assert!(content_len > CHUNK_LEN); - // Subtract 1 to reserve at least one byte for the right side. - let full_chunks = (content_len - 1) / CHUNK_LEN; - largest_power_of_two_leq(full_chunks) * CHUNK_LEN -} - -// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE chunks at the same time -// on a single thread. 
Write out the chunk chaining values and return the -// number of chunks hashed. These chunks are never the root and never empty; -// those cases use a different codepath. -fn compress_chunks_parallel( - input: &[u8], - key: &CVWords, - chunk_counter: u64, - flags: u8, - platform: Platform, - out: &mut [u8], -) -> usize { - debug_assert!(!input.is_empty(), "empty chunks below the root"); - debug_assert!(input.len() <= MAX_SIMD_DEGREE * CHUNK_LEN); - - let mut chunks_exact = array_chunks(input); - let mut chunks_array = FixedVec::<&[u8; CHUNK_LEN], MAX_SIMD_DEGREE>::new(); - for chunk in &mut chunks_exact { - chunks_array.push(chunk); - } - platform.hash_many( - &chunks_array, - key, - chunk_counter, - IncrementCounter::Yes, - flags, - CHUNK_START, - CHUNK_END, - out, - ); - - // Hash the remaining partial chunk, if there is one. Note that the empty - // chunk (meaning the empty message) is a different codepath. - let chunks_so_far = chunks_array.len(); - if !chunks_exact.remainder().is_empty() { - let counter = chunk_counter + chunks_so_far as u64; - let mut chunk_state = ChunkState::new(key, counter, flags, platform); - chunk_state.update(chunks_exact.remainder()); - out[chunks_so_far * OUT_LEN..chunks_so_far * OUT_LEN + OUT_LEN] - .copy_from_slice(&chunk_state.output().chaining_value()); - chunks_so_far + 1 - } else { - chunks_so_far - } -} - -// Use SIMD parallelism to hash up to MAX_SIMD_DEGREE parents at the same time -// on a single thread. Write out the parent chaining values and return the -// number of parents hashed. (If there's an odd input chaining value left over, -// return it as an additional output.) These parents are never the root and -// never empty; those cases use a different codepath. -fn compress_parents_parallel( - child_chaining_values: &[u8], - key: &CVWords, - flags: u8, - platform: Platform, - out: &mut [u8], -) -> usize { - debug_assert_eq!(child_chaining_values.len() % OUT_LEN, 0, "wacky hash bytes"); - let num_children = child_chaining_values.len() / OUT_LEN; - debug_assert!(num_children >= 2, "not enough children"); - debug_assert!(num_children <= 2 * MAX_SIMD_DEGREE_OR_2, "too many"); - - let mut parents_exact = array_chunks(child_chaining_values); - // Use MAX_SIMD_DEGREE_OR_2 rather than MAX_SIMD_DEGREE here, because of - // the requirements of compress_subtree_wide(). - let mut parents_array = FixedVec::<&[u8; BLOCK_LEN], MAX_SIMD_DEGREE_OR_2>::new(); - for parent in &mut parents_exact { - parents_array.push(parent); - } - platform.hash_many( - &parents_array, - key, - 0, // Parents always use counter 0. - IncrementCounter::No, - flags | PARENT, - 0, // Parents have no start flags. - 0, // Parents have no end flags. - out, - ); - - // If there's an odd child left over, it becomes an output. - let parents_so_far = parents_array.len(); - if !parents_exact.remainder().is_empty() { - out[parents_so_far * OUT_LEN..][..OUT_LEN].copy_from_slice(parents_exact.remainder()); - parents_so_far + 1 - } else { - parents_so_far - } -} - -// The wide helper function returns (writes out) an array of chaining values -// and returns the length of that array. The number of chaining values returned -// is the dynamically detected SIMD degree, at most MAX_SIMD_DEGREE. Or fewer, -// if the input is shorter than that many chunks. The reason for maintaining a -// wide array of chaining values going back up the tree, is to allow the -// implementation to hash as many parents in parallel as possible. 
-// -// As a special case when the SIMD degree is 1, this function will still return -// at least 2 outputs. This guarantees that this function doesn't perform the -// root compression. (If it did, it would use the wrong flags, and also we -// wouldn't be able to implement exendable output.) Note that this function is -// not used when the whole input is only 1 chunk long; that's a different -// codepath. -// -// Why not just have the caller split the input on the first update(), instead -// of implementing this special rule? Because we don't want to limit SIMD or -// multithreading parallelism for that update(). -fn compress_subtree_wide( - input: &[u8], - key: &CVWords, - chunk_counter: u64, - flags: u8, - platform: Platform, - out: &mut [u8], -) -> usize { - // Note that the single chunk case does *not* bump the SIMD degree up to 2 - // when it is 1. This allows Rayon the option of multithreading even the - // 2-chunk case, which can help performance on smaller platforms. - if input.len() <= platform.simd_degree() * CHUNK_LEN { - return compress_chunks_parallel(input, key, chunk_counter, flags, platform, out); - } - - // With more than simd_degree chunks, we need to recurse. Start by dividing - // the input into left and right subtrees. (Note that this is only optimal - // as long as the SIMD degree is a power of 2. If we ever get a SIMD degree - // of 3 or something, we'll need a more complicated strategy.) - debug_assert_eq!(platform.simd_degree().count_ones(), 1, "power of 2"); - let (left, right) = input.split_at(left_len(input.len())); - let right_chunk_counter = chunk_counter + (left.len() / CHUNK_LEN) as u64; - - // Make space for the child outputs. Here we use MAX_SIMD_DEGREE_OR_2 to - // account for the special case of returning 2 outputs when the SIMD degree - // is 1. - let mut cv_array = [0; 2 * MAX_SIMD_DEGREE_OR_2 * OUT_LEN]; - let degree = if left.len() == CHUNK_LEN { - // The "simd_degree=1 and we're at the leaf nodes" case. - debug_assert_eq!(platform.simd_degree(), 1); - 1 - } else { - cmp::max(platform.simd_degree(), 2) - }; - let (left_out, right_out) = cv_array.split_at_mut(degree * OUT_LEN); - - let (left_n, right_n) = J::join( - || compress_subtree_wide::(left, key, chunk_counter, flags, platform, left_out), - || compress_subtree_wide::(right, key, right_chunk_counter, flags, platform, right_out), - ); - - // The special case again. If simd_degree=1, then we'll have left_n=1 and - // right_n=1. Rather than compressing them into a single output, return - // them directly, to make sure we always have at least two outputs. - debug_assert_eq!(left_n, degree); - debug_assert!(right_n >= 1 && right_n <= left_n); - if left_n == 1 { - out[..2 * OUT_LEN].copy_from_slice(&cv_array[..2 * OUT_LEN]); - return 2; - } - - // Otherwise, do one layer of parent node compression. - let num_children = left_n + right_n; - compress_parents_parallel( - &cv_array[..num_children * OUT_LEN], - key, - flags, - platform, - out, - ) -} - -// Hash a subtree with compress_subtree_wide(), and then condense the resulting -// list of chaining values down to a single parent node. Don't compress that -// last parent node, however. Instead, return its message bytes (the -// concatenated chaining values of its children). This is necessary when the -// first call to update() supplies a complete subtree, because the topmost -// parent node of that subtree could end up being the root. It's also necessary -// for extended output in the general case. 
-// -// As with compress_subtree_wide(), this function is not used on inputs of 1 -// chunk or less. That's a different codepath. -fn compress_subtree_to_parent_node( - input: &[u8], - key: &CVWords, - chunk_counter: u64, - flags: u8, - platform: Platform, -) -> [u8; BLOCK_LEN] { - debug_assert!(input.len() > CHUNK_LEN); - let mut cv_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN]; - let mut num_cvs = - compress_subtree_wide::(input, key, chunk_counter, flags, platform, &mut cv_array); - debug_assert!(num_cvs >= 2); - - // If MAX_SIMD_DEGREE is greater than 2 and there's enough input, - // compress_subtree_wide() returns more than 2 chaining values. Condense - // them into 2 by forming parent nodes repeatedly. - let mut out_array = [0; MAX_SIMD_DEGREE_OR_2 * OUT_LEN / 2]; - while num_cvs > 2 { - let cv_slice = &cv_array[..num_cvs * OUT_LEN]; - num_cvs = compress_parents_parallel(cv_slice, key, flags, platform, &mut out_array); - cv_array[..num_cvs * OUT_LEN].copy_from_slice(&out_array[..num_cvs * OUT_LEN]); - } - cv_array[0..2 * OUT_LEN].try_into().unwrap() -} - -// Hash a complete input all at once. Unlike compress_subtree_wide() and -// compress_subtree_to_parent_node(), this function handles the 1 chunk case. -fn hash_all_at_once(input: &[u8], key: &CVWords, flags: u8) -> Output { - let platform = Platform::detect(); - - // If the whole subtree is one chunk, hash it directly with a ChunkState. - if input.len() <= CHUNK_LEN { - return ChunkState::new(key, 0, flags, platform) - .update(input) - .output(); - } - - // Otherwise construct an Output object from the parent node returned by - // compress_subtree_to_parent_node(). - Output { - input_chaining_value: *key, - block: compress_subtree_to_parent_node::(input, key, 0, flags, platform), - block_len: BLOCK_LEN as u8, - counter: 0, - flags: flags | PARENT, - platform, - } -} - -/// The default hash function. -/// -/// For an incremental version that accepts multiple writes, see -/// [`Hasher::update`]. -/// -/// For output sizes other than 32 bytes, see [`Hasher::finalize_xof`] and -/// [`OutputReader`]. -/// -/// This function is always single-threaded. -pub fn hash(input: &[u8]) -> Hash { - hash_all_at_once::(input, IV, 0).root_hash() -} - -/// The keyed hash function. -/// -/// This is suitable for use as a message authentication code, for example to -/// replace an HMAC instance. In that use case, the constant-time equality -/// checking provided by [`Hash`](struct.Hash.html) is almost always a security -/// requirement, and callers need to be careful not to compare MACs as raw -/// bytes. -/// -/// For output sizes other than 32 bytes, see [`Hasher::new_keyed`], -/// [`Hasher::finalize_xof`], and [`OutputReader`]. -/// -/// This function is always single-threaded. For multithreading support, see -/// [`Hasher::new_keyed`] -pub fn keyed_hash(key: &[u8; KEY_LEN], input: &[u8]) -> Hash { - let key_words = platform::words_from_le_bytes_32(key); - hash_all_at_once::(input, &key_words, KEYED_HASH).root_hash() -} - -/// The key derivation function. -/// -/// Given cryptographic key material of any length and a context string of any -/// length, this function outputs a 32-byte derived subkey. **The context string -/// should be hardcoded, globally unique, and application-specific.** A good -/// default format for such strings is `"[application] [commit timestamp] -/// [purpose]"`, e.g., `"example.com 2019-12-25 16:18:03 session tokens v1"`. 
-/// -/// Key derivation is important when you want to use the same key in multiple -/// algorithms or use cases. Using the same key with different cryptographic -/// algorithms is generally forbidden, and deriving a separate subkey for each -/// use case protects you from bad interactions. Derived keys also mitigate the -/// damage from one part of your application accidentally leaking its key. -/// -/// As a rare exception to that general rule, however, it is possible to use -/// `derive_key` itself with key material that you are already using with -/// another algorithm. You might need to do this if you're adding features to -/// an existing application, which does not yet use key derivation internally. -/// However, you still must not share key material with algorithms that forbid -/// key reuse entirely, like a one-time pad. For more on this, see sections 6.2 -/// and 7.8 of the [BLAKE3 paper](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf). -/// -/// Note that BLAKE3 is not a password hash, and **`derive_key` should never be -/// used with passwords.** Instead, use a dedicated password hash like -/// [Argon2]. Password hashes are entirely different from generic hash -/// functions, with opposite design requirements. -/// -/// For output sizes other than 32 bytes, see [`Hasher::new_derive_key`], -/// [`Hasher::finalize_xof`], and [`OutputReader`]. -/// -/// This function is always single-threaded. For multithreading support, see -/// [`Hasher::new_derive_key`] -/// -/// [Argon2]: https://en.wikipedia.org/wiki/Argon2 -pub fn derive_key(context: &str, key_material: &[u8]) -> [u8; OUT_LEN] { - let context_key = - hash_all_at_once::(context.as_bytes(), IV, DERIVE_KEY_CONTEXT) - .root_hash(); - let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes()); - hash_all_at_once::(key_material, &context_key_words, DERIVE_KEY_MATERIAL) - .root_hash() - .0 -} - -fn parent_node_output( - left_child: &CVBytes, - right_child: &CVBytes, - key: &CVWords, - flags: u8, - platform: Platform, -) -> Output { - let mut block = [0; BLOCK_LEN]; - block[..32].copy_from_slice(left_child); - block[32..].copy_from_slice(right_child); - Output { - input_chaining_value: *key, - block, - block_len: BLOCK_LEN as u8, - counter: 0, - flags: flags | PARENT, - platform, - } -} - -/// An incremental hash state that can accept any number of writes. -/// -/// When the `traits-preview` Cargo feature is enabled, this type implements -/// several commonly used traits from the -/// [`digest`](https://crates.io/crates/digest) crate. However, those -/// traits aren't stable, and they're expected to change in incompatible ways -/// before that crate reaches 1.0. For that reason, this crate makes no SemVer -/// guarantees for this feature, and callers who use it should expect breaking -/// changes between patch versions. -/// -/// **Performance note:** The [`update`](#method.update) method can't take full -/// advantage of SIMD optimizations if its input buffer is too small or oddly -/// sized. Using a 16 KiB buffer, or any multiple of that, enables all currently -/// supported SIMD instruction sets. -/// -/// # Examples -/// -/// ``` -/// # fn main() -> Result<(), Box> { -/// // Hash an input incrementally. 
-/// let mut hasher = narcissus_core::blake3::Hasher::new(); -/// hasher.update(b"foo"); -/// hasher.update(b"bar"); -/// hasher.update(b"baz"); -/// assert_eq!(hasher.finalize(), narcissus_core::blake3::hash(b"foobarbaz")); -/// -/// # Ok(()) -/// # } -/// ``` -#[derive(Clone)] -pub struct Hasher { - key: CVWords, - chunk_state: ChunkState, - // The stack size is MAX_DEPTH + 1 because we do lazy merging. For example, - // with 7 chunks, we have 3 entries in the stack. Adding an 8th chunk - // requires a 4th entry, rather than merging everything down to 1, because - // we don't know whether more input is coming. This is different from how - // the reference implementation does things. - cv_stack: FixedVec, -} - -impl Hasher { - fn new_internal(key: &CVWords, flags: u8) -> Self { - Self { - key: *key, - chunk_state: ChunkState::new(key, 0, flags, Platform::detect()), - cv_stack: FixedVec::new(), - } - } - - /// Construct a new `Hasher` for the regular hash function. - pub fn new() -> Self { - Self::new_internal(IV, 0) - } - - /// Construct a new `Hasher` for the keyed hash function. See - /// [`keyed_hash`]. - /// - /// [`keyed_hash`]: fn.keyed_hash.html - pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self { - let key_words = platform::words_from_le_bytes_32(key); - Self::new_internal(&key_words, KEYED_HASH) - } - - /// Construct a new `Hasher` for the key derivation function. See - /// [`derive_key`]. The context string should be hardcoded, globally - /// unique, and application-specific. - /// - /// [`derive_key`]: fn.derive_key.html - pub fn new_derive_key(context: &str) -> Self { - let context_key = - hash_all_at_once::(context.as_bytes(), IV, DERIVE_KEY_CONTEXT) - .root_hash(); - let context_key_words = platform::words_from_le_bytes_32(context_key.as_bytes()); - Self::new_internal(&context_key_words, DERIVE_KEY_MATERIAL) - } - - /// Reset the `Hasher` to its initial state. - /// - /// This is functionally the same as overwriting the `Hasher` with a new - /// one, using the same key or context string if any. - pub fn reset(&mut self) -> &mut Self { - self.chunk_state = ChunkState::new( - &self.key, - 0, - self.chunk_state.flags, - self.chunk_state.platform, - ); - self.cv_stack.clear(); - self - } - - // As described in push_cv() below, we do "lazy merging", delaying merges - // until right before the next CV is about to be added. This is different - // from the reference implementation. Another difference is that we aren't - // always merging 1 chunk at a time. Instead, each CV might represent any - // power-of-two number of chunks, as long as the smaller-above-larger stack - // order is maintained. Instead of the "count the trailing 0-bits" - // algorithm described in the spec, we use a "count the total number of - // 1-bits" variant that doesn't require us to retain the subtree size of - // the CV on top of the stack. The principle is the same: each CV that - // should remain in the stack is represented by a 1-bit in the total number - // of chunks (or bytes) so far. 
- fn merge_cv_stack(&mut self, total_len: u64) { - let post_merge_stack_len = total_len.count_ones() as usize; - while self.cv_stack.len() > post_merge_stack_len { - let right_child = self.cv_stack.pop().unwrap(); - let left_child = self.cv_stack.pop().unwrap(); - let parent_output = parent_node_output( - &left_child, - &right_child, - &self.key, - self.chunk_state.flags, - self.chunk_state.platform, - ); - self.cv_stack.push(parent_output.chaining_value()); - } - } - - // In reference_impl.rs, we merge the new CV with existing CVs from the - // stack before pushing it. We can do that because we know more input is - // coming, so we know none of the merges are root. - // - // This setting is different. We want to feed as much input as possible to - // compress_subtree_wide(), without setting aside anything for the - // chunk_state. If the user gives us 64 KiB, we want to parallelize over - // all 64 KiB at once as a single subtree, if at all possible. - // - // This leads to two problems: - // 1) This 64 KiB input might be the only call that ever gets made to - // update. In this case, the root node of the 64 KiB subtree would be - // the root node of the whole tree, and it would need to be ROOT - // finalized. We can't compress it until we know. - // 2) This 64 KiB input might complete a larger tree, whose root node is - // similarly going to be the the root of the whole tree. For example, - // maybe we have 196 KiB (that is, 128 + 64) hashed so far. We can't - // compress the node at the root of the 256 KiB subtree until we know - // how to finalize it. - // - // The second problem is solved with "lazy merging". That is, when we're - // about to add a CV to the stack, we don't merge it with anything first, - // as the reference impl does. Instead we do merges using the *previous* CV - // that was added, which is sitting on top of the stack, and we put the new - // CV (unmerged) on top of the stack afterwards. This guarantees that we - // never merge the root node until finalize(). - // - // Solving the first problem requires an additional tool, - // compress_subtree_to_parent_node(). That function always returns the top - // *two* chaining values of the subtree it's compressing. We then do lazy - // merging with each of them separately, so that the second CV will always - // remain unmerged. (That also helps us support extendable output when - // we're hashing an input all-at-once.) - fn push_cv(&mut self, new_cv: &CVBytes, chunk_counter: u64) { - self.merge_cv_stack(chunk_counter); - self.cv_stack.push(*new_cv); - } - - /// Add input bytes to the hash state. You can call this any number of - /// times. - /// - /// This method is always single-threaded. - /// - /// Note that the degree of SIMD parallelism that `update` can use is - /// limited by the size of this input buffer. The 8 KiB buffer currently - /// used by [`std::io::copy`] is enough to leverage AVX2, for example, but - /// not enough to leverage AVX-512. A 16 KiB buffer is large enough to - /// leverage all currently supported SIMD instruction sets. - /// - /// [`std::io::copy`]: https://doc.rust-lang.org/std/io/fn.copy.html - pub fn update(&mut self, input: &[u8]) -> &mut Self { - self.update_with_join::(input) - } - - fn update_with_join(&mut self, mut input: &[u8]) -> &mut Self { - // If we have some partial chunk bytes in the internal chunk_state, we - // need to finish that chunk first. 
- if self.chunk_state.len() > 0 { - let want = CHUNK_LEN - self.chunk_state.len(); - let take = cmp::min(want, input.len()); - self.chunk_state.update(&input[..take]); - input = &input[take..]; - if !input.is_empty() { - // We've filled the current chunk, and there's more input - // coming, so we know it's not the root and we can finalize it. - // Then we'll proceed to hashing whole chunks below. - debug_assert_eq!(self.chunk_state.len(), CHUNK_LEN); - let chunk_cv = self.chunk_state.output().chaining_value(); - self.push_cv(&chunk_cv, self.chunk_state.chunk_counter); - self.chunk_state = ChunkState::new( - &self.key, - self.chunk_state.chunk_counter + 1, - self.chunk_state.flags, - self.chunk_state.platform, - ); - } else { - return self; - } - } - - // Now the chunk_state is clear, and we have more input. If there's - // more than a single chunk (so, definitely not the root chunk), hash - // the largest whole subtree we can, with the full benefits of SIMD and - // multithreading parallelism. Two restrictions: - // - The subtree has to be a power-of-2 number of chunks. Only subtrees - // along the right edge can be incomplete, and we don't know where - // the right edge is going to be until we get to finalize(). - // - The subtree must evenly divide the total number of chunks up until - // this point (if total is not 0). If the current incomplete subtree - // is only waiting for 1 more chunk, we can't hash a subtree of 4 - // chunks. We have to complete the current subtree first. - // Because we might need to break up the input to form powers of 2, or - // to evenly divide what we already have, this part runs in a loop. - while input.len() > CHUNK_LEN { - debug_assert_eq!(self.chunk_state.len(), 0, "no partial chunk data"); - debug_assert_eq!(CHUNK_LEN.count_ones(), 1, "power of 2 chunk len"); - let mut subtree_len = largest_power_of_two_leq(input.len()); - let count_so_far = self.chunk_state.chunk_counter * CHUNK_LEN as u64; - // Shrink the subtree_len until it evenly divides the count so far. - // We know that subtree_len itself is a power of 2, so we can use a - // bitmasking trick instead of an actual remainder operation. (Note - // that if the caller consistently passes power-of-2 inputs of the - // same size, as is hopefully typical, this loop condition will - // always fail, and subtree_len will always be the full length of - // the input.) - // - // An aside: We don't have to shrink subtree_len quite this much. - // For example, if count_so_far is 1, we could pass 2 chunks to - // compress_subtree_to_parent_node. Since we'll get 2 CVs back, - // we'll still get the right answer in the end, and we might get to - // use 2-way SIMD parallelism. The problem with this optimization, - // is that it gets us stuck always hashing 2 chunks. The total - // number of chunks will remain odd, and we'll never graduate to - // higher degrees of parallelism. See - // https://github.com/BLAKE3-team/BLAKE3/issues/69. - while (subtree_len - 1) as u64 & count_so_far != 0 { - subtree_len /= 2; - } - // The shrunken subtree_len might now be 1 chunk long. If so, hash - // that one chunk by itself. Otherwise, compress the subtree into a - // pair of CVs. 
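            // Worked example of the shrink loop above, with illustrative
            // numbers that are not from the original source: suppose 192
            // chunks (128 + 64, i.e. 196608 bytes at CHUNK_LEN = 1024) have
            // been hashed so far and update() receives 512 KiB more.
            // largest_power_of_two_leq picks subtree_len = 512 KiB, but
            // (524288 - 1) & 196608 == 196608 != 0, so the loop halves it.
            // 256 KiB and 128 KiB fail the same check; 64 KiB (64 chunks) is
            // the first size with (65536 - 1) & 196608 == 0. Those 64 chunks
            // are the right sibling of the pending 64-chunk subtree, and on
            // the next pass the remaining 448 KiB yields a full 256-chunk
            // subtree.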
- let subtree_chunks = (subtree_len / CHUNK_LEN) as u64; - if subtree_len <= CHUNK_LEN { - debug_assert_eq!(subtree_len, CHUNK_LEN); - self.push_cv( - &ChunkState::new( - &self.key, - self.chunk_state.chunk_counter, - self.chunk_state.flags, - self.chunk_state.platform, - ) - .update(&input[..subtree_len]) - .output() - .chaining_value(), - self.chunk_state.chunk_counter, - ); - } else { - // This is the high-performance happy path, though getting here - // depends on the caller giving us a long enough input. - let cv_pair = compress_subtree_to_parent_node::( - &input[..subtree_len], - &self.key, - self.chunk_state.chunk_counter, - self.chunk_state.flags, - self.chunk_state.platform, - ); - let left_cv = cv_pair[0..32].try_into().unwrap(); - let right_cv = cv_pair[32..64].try_into().unwrap(); - // Push the two CVs we received into the CV stack in order. Because - // the stack merges lazily, this guarantees we aren't merging the - // root. - self.push_cv(left_cv, self.chunk_state.chunk_counter); - self.push_cv( - right_cv, - self.chunk_state.chunk_counter + (subtree_chunks / 2), - ); - } - self.chunk_state.chunk_counter += subtree_chunks; - input = &input[subtree_len..]; - } - - // What remains is 1 chunk or less. Add it to the chunk state. - debug_assert!(input.len() <= CHUNK_LEN); - if !input.is_empty() { - self.chunk_state.update(input); - // Having added some input to the chunk_state, we know what's in - // the CV stack won't become the root node, and we can do an extra - // merge. This simplifies finalize(). - self.merge_cv_stack(self.chunk_state.chunk_counter); - } - - self - } - - fn final_output(&self) -> Output { - // If the current chunk is the only chunk, that makes it the root node - // also. Convert it directly into an Output. Otherwise, we need to - // merge subtrees below. - if self.cv_stack.is_empty() { - debug_assert_eq!(self.chunk_state.chunk_counter, 0); - return self.chunk_state.output(); - } - - // If there are any bytes in the ChunkState, finalize that chunk and - // merge its CV with everything in the CV stack. In that case, the work - // we did at the end of update() above guarantees that the stack - // doesn't contain any unmerged subtrees that need to be merged first. - // (This is important, because if there were two chunk hashes sitting - // on top of the stack, they would need to merge with each other, and - // merging a new chunk hash into them would be incorrect.) - // - // If there are no bytes in the ChunkState, we'll merge what's already - // in the stack. In this case it's fine if there are unmerged chunks on - // top, because we'll merge them with each other. Note that the case of - // the empty chunk is taken care of above. 
- let mut output: Output; - let mut num_cvs_remaining = self.cv_stack.len(); - if self.chunk_state.len() > 0 { - debug_assert_eq!( - self.cv_stack.len(), - self.chunk_state.chunk_counter.count_ones() as usize, - "cv stack does not need a merge" - ); - output = self.chunk_state.output(); - } else { - debug_assert!(self.cv_stack.len() >= 2); - output = parent_node_output( - &self.cv_stack[num_cvs_remaining - 2], - &self.cv_stack[num_cvs_remaining - 1], - &self.key, - self.chunk_state.flags, - self.chunk_state.platform, - ); - num_cvs_remaining -= 2; - } - while num_cvs_remaining > 0 { - output = parent_node_output( - &self.cv_stack[num_cvs_remaining - 1], - &output.chaining_value(), - &self.key, - self.chunk_state.flags, - self.chunk_state.platform, - ); - num_cvs_remaining -= 1; - } - output - } - - /// Finalize the hash state and return the [`Hash`](struct.Hash.html) of - /// the input. - /// - /// This method is idempotent. Calling it twice will give the same result. - /// You can also add more input and finalize again. - pub fn finalize(&self) -> Hash { - self.final_output().root_hash() - } - - /// Finalize the hash state and return an [`OutputReader`], which can - /// supply any number of output bytes. - /// - /// This method is idempotent. Calling it twice will give the same result. - /// You can also add more input and finalize again. - /// - /// [`OutputReader`]: struct.OutputReader.html - pub fn finalize_xof(&self) -> OutputReader { - OutputReader::new(self.final_output()) - } - - /// Return the total number of bytes hashed so far. - pub fn count(&self) -> u64 { - self.chunk_state.chunk_counter * CHUNK_LEN as u64 + self.chunk_state.len() as u64 - } -} - -// Don't derive(Debug), because the state may be secret. -impl fmt::Debug for Hasher { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_struct("Hasher") - .field("flags", &self.chunk_state.flags) - .field("platform", &self.chunk_state.platform) - .finish() - } -} - -impl Default for Hasher { - #[inline] - fn default() -> Self { - Self::new() - } -} - -/// An incremental reader for extended output, returned by -/// [`Hasher::finalize_xof`](struct.Hasher.html#method.finalize_xof). -/// -/// Shorter BLAKE3 outputs are prefixes of longer ones, and explicitly requesting a short output is -/// equivalent to truncating the default-length output. Note that this is a difference between -/// BLAKE2 and BLAKE3. -/// -/// # Security notes -/// -/// Outputs shorter than the default length of 32 bytes (256 bits) provide less security. An N-bit -/// BLAKE3 output is intended to provide N bits of first and second preimage resistance and N/2 -/// bits of collision resistance, for any N up to 256. Longer outputs don't provide any additional -/// security. -/// -/// Avoid relying on the secrecy of the output offset, that is, the number of output bytes read or -/// the arguments to [`seek`](struct.OutputReader.html#method.seek) or -/// [`set_position`](struct.OutputReader.html#method.set_position). [_Block-Cipher-Based Tree -/// Hashing_ by Aldo Gunsing](https://eprint.iacr.org/2022/283) shows that an attacker who knows -/// both the message and the key (if any) can easily determine the offset of an extended output. -/// For comparison, AES-CTR has a similar property: if you know the key, you can decrypt a block -/// from an unknown position in the output stream to recover its block index. 
Callers with strong
-/// secret keys aren't affected in practice, but secret offsets are a [design
-/// smell](https://en.wikipedia.org/wiki/Design_smell) in any case.
-#[derive(Clone)]
-pub struct OutputReader {
-    inner: Output,
-    position_within_block: u8,
-}
-
-impl OutputReader {
-    fn new(inner: Output) -> Self {
-        Self {
-            inner,
-            position_within_block: 0,
-        }
-    }
-
-    /// Fill a buffer with output bytes and advance the position of the
-    /// `OutputReader`. This is equivalent to [`Read::read`], except that it
-    /// doesn't return a `Result`. Both methods always fill the entire buffer.
-    ///
-    /// Note that `OutputReader` doesn't buffer output bytes internally, so
-    /// calling `fill` repeatedly with a short-length or odd-length slice will
-    /// end up performing the same compression multiple times. If you're
-    /// reading output in a loop, prefer a slice length that's a multiple of
-    /// 64.
-    ///
-    /// The maximum output size of BLAKE3 is 2^64 - 1 bytes. If you try
-    /// to extract more than that, for example by seeking near the end and
-    /// reading further, the behavior is unspecified.
-    ///
-    /// [`Read::read`]: #method.read
-    pub fn fill(&mut self, mut buf: &mut [u8]) {
-        while !buf.is_empty() {
-            let block: [u8; BLOCK_LEN] = self.inner.root_output_block();
-            let output_bytes = &block[self.position_within_block as usize..];
-            let take = cmp::min(buf.len(), output_bytes.len());
-            buf[..take].copy_from_slice(&output_bytes[..take]);
-            buf = &mut buf[take..];
-            self.position_within_block += take as u8;
-            if self.position_within_block == BLOCK_LEN as u8 {
-                self.inner.counter += 1;
-                self.position_within_block = 0;
-            }
-        }
-    }
-
-    /// Return the current read position in the output stream. This is
-    /// equivalent to [`Seek::stream_position`], except that it doesn't return
-    /// a `Result`. The position of a new `OutputReader` starts at 0, and each
-    /// call to [`fill`] or [`Read::read`] moves the position forward by the
-    /// number of bytes read.
-    ///
-    /// [`Seek::stream_position`]: #method.stream_position
-    /// [`fill`]: #method.fill
-    /// [`Read::read`]: #method.read
-    pub fn position(&self) -> u64 {
-        self.inner.counter * BLOCK_LEN as u64 + self.position_within_block as u64
-    }
-
-    /// Seek to a new read position in the output stream. This is equivalent to
-    /// calling [`Seek::seek`] with [`SeekFrom::Start`], except that it doesn't
-    /// return a `Result`.
-    ///
-    /// [`Seek::seek`]: #method.seek
-    /// [`SeekFrom::Start`]: https://doc.rust-lang.org/std/io/enum.SeekFrom.html
-    pub fn set_position(&mut self, position: u64) {
-        self.position_within_block = (position % BLOCK_LEN as u64) as u8;
-        self.inner.counter = position / BLOCK_LEN as u64;
-    }
-}
-
-// Don't derive(Debug), because the state may be secret.
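// A usage sketch for the reader above, not part of the original file. It
// exercises the documented prefix property and the seek/position bookkeeping
// (position = counter * BLOCK_LEN + position_within_block); the
// narcissus_core::blake3 paths follow the Hasher doc example earlier in this
// module.
#[test]
fn sketch_xof_prefix_and_seek() {
    let mut hasher = narcissus_core::blake3::Hasher::new();
    hasher.update(b"some input");

    // Read 128 bytes of extended output.
    let mut long_out = [0u8; 128];
    hasher.finalize_xof().fill(&mut long_out);

    // Shorter outputs are prefixes of longer ones.
    let mut short_out = [0u8; 32];
    hasher.finalize_xof().fill(&mut short_out);
    assert_eq!(&short_out[..], &long_out[..32]);

    // Seeking to byte 64 and reading again reproduces the same bytes.
    let mut reader = hasher.finalize_xof();
    reader.set_position(64);
    assert_eq!(reader.position(), 64);
    let mut tail = [0u8; 64];
    reader.fill(&mut tail);
    assert_eq!(&tail[..], &long_out[64..]);
}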
-impl fmt::Debug for OutputReader { - fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - f.debug_struct("OutputReader") - .field("position", &self.position()) - .finish() - } -} diff --git a/narcissus-core/src/blake3/platform.rs b/narcissus-core/src/blake3/platform.rs deleted file mode 100644 index d71086d..0000000 --- a/narcissus-core/src/blake3/platform.rs +++ /dev/null @@ -1,296 +0,0 @@ -use crate::slice::{array_chunks, array_chunks_mut}; - -use super::{portable, CVWords, IncrementCounter, BLOCK_LEN}; - -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -pub const MAX_SIMD_DEGREE: usize = 8; - -#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] -pub const MAX_SIMD_DEGREE: usize = 1; - -// There are some places where we want a static size that's equal to the -// MAX_SIMD_DEGREE, but also at least 2. Constant contexts aren't currently -// allowed to use cmp::max, so we have to hardcode this additional constant -// value. Get rid of this once cmp::max is a const fn. -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -pub const MAX_SIMD_DEGREE_OR_2: usize = MAX_SIMD_DEGREE; - -#[cfg(not(any(target_arch = "x86", target_arch = "x86_64")))] -pub const MAX_SIMD_DEGREE_OR_2: usize = 2; - -#[derive(Clone, Copy, Debug)] -pub enum Platform { - Portable, - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - SSE2, - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - SSE41, - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - AVX2, -} - -impl Platform { - #[allow(unreachable_code)] - pub fn detect() -> Self { - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - { - if avx2_detected() { - return Platform::AVX2; - } - if sse41_detected() { - return Platform::SSE41; - } - if sse2_detected() { - return Platform::SSE2; - } - } - - Platform::Portable - } - - pub fn simd_degree(&self) -> usize { - let degree = match self { - Platform::Portable => 1, - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::SSE2 => 4, - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::SSE41 => 4, - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::AVX2 => 8, - }; - debug_assert!(degree <= MAX_SIMD_DEGREE); - degree - } - - pub fn compress_in_place( - &self, - cv: &mut CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, - ) { - match self { - Platform::Portable => portable::compress_in_place(cv, block, block_len, counter, flags), - // Safe because detect() checked for platform support. - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::SSE2 => unsafe { - super::sse2::compress_in_place(cv, block, block_len, counter, flags) - }, - // Safe because detect() checked for platform support. - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::SSE41 | Platform::AVX2 => unsafe { - super::sse41::compress_in_place(cv, block, block_len, counter, flags) - }, - } - } - - pub fn compress_xof( - &self, - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, - ) -> [u8; 64] { - match self { - Platform::Portable => portable::compress_xof(cv, block, block_len, counter, flags), - // Safe because detect() checked for platform support. - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::SSE2 => unsafe { - super::sse2::compress_xof(cv, block, block_len, counter, flags) - }, - // Safe because detect() checked for platform support. 
- #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::SSE41 | Platform::AVX2 => unsafe { - super::sse41::compress_xof(cv, block, block_len, counter, flags) - }, - } - } - - // IMPLEMENTATION NOTE - // =================== - // hash_many() applies two optimizations. The critically important - // optimization is the high-performance parallel SIMD hashing mode, - // described in detail in the spec. This more than doubles throughput per - // thread. Another optimization is keeping the state vectors transposed - // from block to block within a chunk. When state vectors are transposed - // after every block, there's a small but measurable performance loss. - // Compressing chunks with a dedicated loop avoids this. - - pub fn hash_many( - &self, - inputs: &[&[u8; N]], - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8], - ) { - match self { - Platform::Portable => portable::hash_many( - inputs, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - out, - ), - // Safe because detect() checked for platform support. - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::SSE2 => unsafe { - super::sse2::hash_many( - inputs, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - out, - ) - }, - // Safe because detect() checked for platform support. - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::SSE41 => unsafe { - super::sse41::hash_many( - inputs, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - out, - ) - }, - // Safe because detect() checked for platform support. - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - Platform::AVX2 => unsafe { - super::avx2::hash_many( - inputs, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - out, - ) - }, - } - } - - // Explicit platform constructors, for benchmarks. - - pub fn portable() -> Self { - Self::Portable - } - - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - pub fn sse2() -> Option { - if sse2_detected() { - Some(Self::SSE2) - } else { - None - } - } - - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - pub fn sse41() -> Option { - if sse41_detected() { - Some(Self::SSE41) - } else { - None - } - } - - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - pub fn avx2() -> Option { - if avx2_detected() { - Some(Self::AVX2) - } else { - None - } - } -} - -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[inline(always)] -pub fn avx2_detected() -> bool { - // Static check, e.g. for building with target-cpu=native. - #[cfg(target_feature = "avx2")] - { - return true; - } - - #[cfg(not(target_feature = "avx2"))] - return is_x86_feature_detected!("avx2"); -} - -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[inline(always)] -pub fn sse41_detected() -> bool { - // Static check, e.g. for building with target-cpu=native. - #[cfg(target_feature = "sse4.1")] - { - return true; - } - - #[cfg(not(target_feature = "sse4.1"))] - return is_x86_feature_detected!("sse4.1"); -} - -#[cfg(any(target_arch = "x86", target_arch = "x86_64"))] -#[inline(always)] -#[allow(unreachable_code)] -pub fn sse2_detected() -> bool { - // Static check, e.g. for building with target-cpu=native. 
- #[cfg(target_feature = "sse2")] - { - return true; - } - - #[cfg(not(target_feature = "sse2"))] - return is_x86_feature_detected!("sse2"); -} - -#[inline(always)] -pub fn words_from_le_bytes_32(bytes: &[u8; 32]) -> [u32; 8] { - let mut out = [0; 8]; - for (chunk, word) in array_chunks(bytes).zip(out.iter_mut()) { - *word = u32::from_le_bytes(*chunk) - } - out -} - -#[inline(always)] -pub fn words_from_le_bytes_64(bytes: &[u8; 64]) -> [u32; 16] { - let mut out = [0; 16]; - for (chunk, word) in array_chunks(bytes).zip(out.iter_mut()) { - *word = u32::from_le_bytes(*chunk) - } - out -} - -#[inline(always)] -pub fn le_bytes_from_words_32(words: &[u32; 8]) -> [u8; 32] { - let mut out = [0; 32]; - for (word, chunk) in words.iter().zip(array_chunks_mut(&mut out)) { - *chunk = word.to_le_bytes(); - } - out -} - -#[inline(always)] -pub fn le_bytes_from_words_64(words: &[u32; 16]) -> [u8; 64] { - let mut out = [0; 64]; - for (word, chunk) in words.iter().zip(array_chunks_mut(&mut out)) { - *chunk = word.to_le_bytes(); - } - out -} diff --git a/narcissus-core/src/blake3/portable.rs b/narcissus-core/src/blake3/portable.rs deleted file mode 100644 index 4a762c0..0000000 --- a/narcissus-core/src/blake3/portable.rs +++ /dev/null @@ -1,195 +0,0 @@ -use crate::slice::array_chunks_mut; - -use super::{ - counter_high, counter_low, platform, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, - MSG_SCHEDULE, OUT_LEN, -}; - -#[inline(always)] -fn g(state: &mut [u32; 16], a: usize, b: usize, c: usize, d: usize, x: u32, y: u32) { - state[a] = state[a].wrapping_add(state[b]).wrapping_add(x); - state[d] = (state[d] ^ state[a]).rotate_right(16); - state[c] = state[c].wrapping_add(state[d]); - state[b] = (state[b] ^ state[c]).rotate_right(12); - state[a] = state[a].wrapping_add(state[b]).wrapping_add(y); - state[d] = (state[d] ^ state[a]).rotate_right(8); - state[c] = state[c].wrapping_add(state[d]); - state[b] = (state[b] ^ state[c]).rotate_right(7); -} - -#[inline(always)] -fn round(state: &mut [u32; 16], msg: &[u32; 16], round: usize) { - // Select the message schedule based on the round. - let schedule = MSG_SCHEDULE[round]; - - // Mix the columns. - g(state, 0, 4, 8, 12, msg[schedule[0]], msg[schedule[1]]); - g(state, 1, 5, 9, 13, msg[schedule[2]], msg[schedule[3]]); - g(state, 2, 6, 10, 14, msg[schedule[4]], msg[schedule[5]]); - g(state, 3, 7, 11, 15, msg[schedule[6]], msg[schedule[7]]); - - // Mix the diagonals. 
- g(state, 0, 5, 10, 15, msg[schedule[8]], msg[schedule[9]]); - g(state, 1, 6, 11, 12, msg[schedule[10]], msg[schedule[11]]); - g(state, 2, 7, 8, 13, msg[schedule[12]], msg[schedule[13]]); - g(state, 3, 4, 9, 14, msg[schedule[14]], msg[schedule[15]]); -} - -#[inline(always)] -fn compress_pre( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [u32; 16] { - let block_words = platform::words_from_le_bytes_64(block); - - let mut state = [ - cv[0], - cv[1], - cv[2], - cv[3], - cv[4], - cv[5], - cv[6], - cv[7], - IV[0], - IV[1], - IV[2], - IV[3], - counter_low(counter), - counter_high(counter), - block_len as u32, - flags as u32, - ]; - - round(&mut state, &block_words, 0); - round(&mut state, &block_words, 1); - round(&mut state, &block_words, 2); - round(&mut state, &block_words, 3); - round(&mut state, &block_words, 4); - round(&mut state, &block_words, 5); - round(&mut state, &block_words, 6); - - state -} - -pub fn compress_in_place( - cv: &mut CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) { - let state = compress_pre(cv, block, block_len, counter, flags); - - cv[0] = state[0] ^ state[8]; - cv[1] = state[1] ^ state[9]; - cv[2] = state[2] ^ state[10]; - cv[3] = state[3] ^ state[11]; - cv[4] = state[4] ^ state[12]; - cv[5] = state[5] ^ state[13]; - cv[6] = state[6] ^ state[14]; - cv[7] = state[7] ^ state[15]; -} - -pub fn compress_xof( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [u8; 64] { - let mut state = compress_pre(cv, block, block_len, counter, flags); - state[0] ^= state[8]; - state[1] ^= state[9]; - state[2] ^= state[10]; - state[3] ^= state[11]; - state[4] ^= state[12]; - state[5] ^= state[13]; - state[6] ^= state[14]; - state[7] ^= state[15]; - state[8] ^= cv[0]; - state[9] ^= cv[1]; - state[10] ^= cv[2]; - state[11] ^= cv[3]; - state[12] ^= cv[4]; - state[13] ^= cv[5]; - state[14] ^= cv[6]; - state[15] ^= cv[7]; - platform::le_bytes_from_words_64(&state) -} - -pub fn hash1( - input: &[u8; N], - key: &CVWords, - counter: u64, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut CVBytes, -) { - debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks"); - let mut cv = *key; - let mut block_flags = flags | flags_start; - let mut slice = &input[..]; - while slice.len() >= BLOCK_LEN { - if slice.len() == BLOCK_LEN { - block_flags |= flags_end; - } - //let (block, rem) = split_array_ref(slice); - - compress_in_place( - &mut cv, - &slice[0..BLOCK_LEN].try_into().unwrap(), - BLOCK_LEN as u8, - counter, - block_flags, - ); - block_flags = flags; - //slice = rem; - slice = &slice[BLOCK_LEN..]; - } - *out = platform::le_bytes_from_words_32(&cv); -} - -pub fn hash_many( - inputs: &[&[u8; N]], - key: &CVWords, - mut counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8], -) { - debug_assert!(out.len() >= inputs.len() * OUT_LEN, "out too short"); - for (&input, output) in inputs.iter().zip(array_chunks_mut(out)) { - hash1(input, key, counter, flags, flags_start, flags_end, output); - if increment_counter.yes() { - counter += 1; - } - } -} - -#[cfg(test)] -pub mod test { - use super::super::test; - use super::*; - - // This is basically testing the portable implementation against itself, - // but it also checks that compress_in_place and compress_xof are - // consistent. And there are tests against the reference implementation and - // against hardcoded test vectors elsewhere. 
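    // A small sketch of the consistency property mentioned above, not part of
    // the original file: compress_in_place leaves the same 32 bytes in the CV
    // as the first half of compress_xof's 64-byte output. The key below is the
    // standard IV constants and the block/counter/flag values are arbitrary.
    #[test]
    fn sketch_in_place_matches_xof_prefix() {
        let key = [
            0x6A09E667u32, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A,
            0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19,
        ];
        let block = [42u8; 64];

        let mut cv = key;
        compress_in_place(&mut cv, &block, 64, 0, 0);
        let xof = compress_xof(&key, &block, 64, 0, 0);

        // Serialize the chaining value little-endian, as le_bytes_from_words_32
        // does, and compare against the XOF prefix.
        let mut cv_bytes = [0u8; 32];
        for (i, word) in cv.iter().enumerate() {
            cv_bytes[4 * i..4 * i + 4].copy_from_slice(&word.to_le_bytes());
        }
        assert_eq!(&cv_bytes[..], &xof[..32]);
    }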
- #[test] - fn test_compress() { - test::test_compress_fn(compress_in_place, compress_xof); - } - - // Ditto. - #[test] - fn test_hash_many() { - test::test_hash_many_fn(hash_many, hash_many); - } -} diff --git a/narcissus-core/src/blake3/reference_impl.rs b/narcissus-core/src/blake3/reference_impl.rs deleted file mode 100644 index 83ac795..0000000 --- a/narcissus-core/src/blake3/reference_impl.rs +++ /dev/null @@ -1,384 +0,0 @@ -//! This is the reference implementation of BLAKE3. It is used for testing and -//! as a readable example of the algorithms involved. Section 5.1 of [the BLAKE3 -//! spec](https://github.com/BLAKE3-team/BLAKE3-specs/blob/master/blake3.pdf) -//! discusses this implementation. You can render docs for this implementation -//! by running `cargo doc --open` in this directory. -//! -//! # Example -//! -//! ``` -//! let mut hasher = reference_impl::Hasher::new(); -//! hasher.update(b"abc"); -//! hasher.update(b"def"); -//! let mut hash = [0; 32]; -//! hasher.finalize(&mut hash); -//! let mut extended_hash = [0; 500]; -//! hasher.finalize(&mut extended_hash); -//! assert_eq!(hash, extended_hash[..32]); -//! ``` - -use core::cmp::min; -use core::convert::TryInto; - -const OUT_LEN: usize = 32; -const KEY_LEN: usize = 32; -const BLOCK_LEN: usize = 64; -const CHUNK_LEN: usize = 1024; - -const CHUNK_START: u32 = 1 << 0; -const CHUNK_END: u32 = 1 << 1; -const PARENT: u32 = 1 << 2; -const ROOT: u32 = 1 << 3; -const KEYED_HASH: u32 = 1 << 4; -const DERIVE_KEY_CONTEXT: u32 = 1 << 5; -const DERIVE_KEY_MATERIAL: u32 = 1 << 6; - -const IV: [u32; 8] = [ - 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A, 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19, -]; - -const MSG_PERMUTATION: [usize; 16] = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8]; - -// The mixing function, G, which mixes either a column or a diagonal. -fn g(state: &mut [u32; 16], a: usize, b: usize, c: usize, d: usize, mx: u32, my: u32) { - state[a] = state[a].wrapping_add(state[b]).wrapping_add(mx); - state[d] = (state[d] ^ state[a]).rotate_right(16); - state[c] = state[c].wrapping_add(state[d]); - state[b] = (state[b] ^ state[c]).rotate_right(12); - state[a] = state[a].wrapping_add(state[b]).wrapping_add(my); - state[d] = (state[d] ^ state[a]).rotate_right(8); - state[c] = state[c].wrapping_add(state[d]); - state[b] = (state[b] ^ state[c]).rotate_right(7); -} - -fn round(state: &mut [u32; 16], m: &[u32; 16]) { - // Mix the columns. - g(state, 0, 4, 8, 12, m[0], m[1]); - g(state, 1, 5, 9, 13, m[2], m[3]); - g(state, 2, 6, 10, 14, m[4], m[5]); - g(state, 3, 7, 11, 15, m[6], m[7]); - // Mix the diagonals. 
- g(state, 0, 5, 10, 15, m[8], m[9]); - g(state, 1, 6, 11, 12, m[10], m[11]); - g(state, 2, 7, 8, 13, m[12], m[13]); - g(state, 3, 4, 9, 14, m[14], m[15]); -} - -fn permute(m: &mut [u32; 16]) { - let mut permuted = [0; 16]; - for i in 0..16 { - permuted[i] = m[MSG_PERMUTATION[i]]; - } - *m = permuted; -} - -fn compress( - chaining_value: &[u32; 8], - block_words: &[u32; 16], - counter: u64, - block_len: u32, - flags: u32, -) -> [u32; 16] { - let mut state = [ - chaining_value[0], - chaining_value[1], - chaining_value[2], - chaining_value[3], - chaining_value[4], - chaining_value[5], - chaining_value[6], - chaining_value[7], - IV[0], - IV[1], - IV[2], - IV[3], - counter as u32, - (counter >> 32) as u32, - block_len, - flags, - ]; - let mut block = *block_words; - - round(&mut state, &block); // round 1 - permute(&mut block); - round(&mut state, &block); // round 2 - permute(&mut block); - round(&mut state, &block); // round 3 - permute(&mut block); - round(&mut state, &block); // round 4 - permute(&mut block); - round(&mut state, &block); // round 5 - permute(&mut block); - round(&mut state, &block); // round 6 - permute(&mut block); - round(&mut state, &block); // round 7 - - for i in 0..8 { - state[i] ^= state[i + 8]; - state[i + 8] ^= chaining_value[i]; - } - state -} - -fn first_8_words(compression_output: [u32; 16]) -> [u32; 8] { - compression_output[0..8].try_into().unwrap() -} - -fn words_from_little_endian_bytes(bytes: &[u8], words: &mut [u32]) { - debug_assert_eq!(bytes.len(), 4 * words.len()); - for (four_bytes, word) in bytes.chunks_exact(4).zip(words) { - *word = u32::from_le_bytes(four_bytes.try_into().unwrap()); - } -} - -// Each chunk or parent node can produce either an 8-word chaining value or, by -// setting the ROOT flag, any number of final output bytes. The Output struct -// captures the state just prior to choosing between those two possibilities. -struct Output { - input_chaining_value: [u32; 8], - block_words: [u32; 16], - counter: u64, - block_len: u32, - flags: u32, -} - -impl Output { - fn chaining_value(&self) -> [u32; 8] { - first_8_words(compress( - &self.input_chaining_value, - &self.block_words, - self.counter, - self.block_len, - self.flags, - )) - } - - fn root_output_bytes(&self, out_slice: &mut [u8]) { - let mut output_block_counter = 0; - for out_block in out_slice.chunks_mut(2 * OUT_LEN) { - let words = compress( - &self.input_chaining_value, - &self.block_words, - output_block_counter, - self.block_len, - self.flags | ROOT, - ); - // The output length might not be a multiple of 4. - for (word, out_word) in words.iter().zip(out_block.chunks_mut(4)) { - out_word.copy_from_slice(&word.to_le_bytes()[..out_word.len()]); - } - output_block_counter += 1; - } - } -} - -struct ChunkState { - chaining_value: [u32; 8], - chunk_counter: u64, - block: [u8; BLOCK_LEN], - block_len: u8, - blocks_compressed: u8, - flags: u32, -} - -impl ChunkState { - fn new(key_words: [u32; 8], chunk_counter: u64, flags: u32) -> Self { - Self { - chaining_value: key_words, - chunk_counter, - block: [0; BLOCK_LEN], - block_len: 0, - blocks_compressed: 0, - flags, - } - } - - fn len(&self) -> usize { - BLOCK_LEN * self.blocks_compressed as usize + self.block_len as usize - } - - fn start_flag(&self) -> u32 { - if self.blocks_compressed == 0 { - CHUNK_START - } else { - 0 - } - } - - fn update(&mut self, mut input: &[u8]) { - while !input.is_empty() { - // If the block buffer is full, compress it and clear it. More - // input is coming, so this compression is not CHUNK_END. 
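            // Summary of flag assignment within one chunk (an editorial note,
            // not from the original file): start_flag() adds CHUNK_START to
            // whichever compression happens first; the compressions performed
            // here in update() never carry CHUNK_END, because more input for
            // this chunk is still coming; the final buffered block is
            // compressed later by output() with CHUNK_END set. A chunk small
            // enough to fit in a single block gets CHUNK_START and CHUNK_END
            // on the same compression.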
- if self.block_len as usize == BLOCK_LEN { - let mut block_words = [0; 16]; - words_from_little_endian_bytes(&self.block, &mut block_words); - self.chaining_value = first_8_words(compress( - &self.chaining_value, - &block_words, - self.chunk_counter, - BLOCK_LEN as u32, - self.flags | self.start_flag(), - )); - self.blocks_compressed += 1; - self.block = [0; BLOCK_LEN]; - self.block_len = 0; - } - - // Copy input bytes into the block buffer. - let want = BLOCK_LEN - self.block_len as usize; - let take = min(want, input.len()); - self.block[self.block_len as usize..][..take].copy_from_slice(&input[..take]); - self.block_len += take as u8; - input = &input[take..]; - } - } - - fn output(&self) -> Output { - let mut block_words = [0; 16]; - words_from_little_endian_bytes(&self.block, &mut block_words); - Output { - input_chaining_value: self.chaining_value, - block_words, - counter: self.chunk_counter, - block_len: self.block_len as u32, - flags: self.flags | self.start_flag() | CHUNK_END, - } - } -} - -fn parent_output( - left_child_cv: [u32; 8], - right_child_cv: [u32; 8], - key_words: [u32; 8], - flags: u32, -) -> Output { - let mut block_words = [0; 16]; - block_words[..8].copy_from_slice(&left_child_cv); - block_words[8..].copy_from_slice(&right_child_cv); - Output { - input_chaining_value: key_words, - block_words, - counter: 0, // Always 0 for parent nodes. - block_len: BLOCK_LEN as u32, // Always BLOCK_LEN (64) for parent nodes. - flags: PARENT | flags, - } -} - -fn parent_cv( - left_child_cv: [u32; 8], - right_child_cv: [u32; 8], - key_words: [u32; 8], - flags: u32, -) -> [u32; 8] { - parent_output(left_child_cv, right_child_cv, key_words, flags).chaining_value() -} - -/// An incremental hasher that can accept any number of writes. -pub struct Hasher { - chunk_state: ChunkState, - key_words: [u32; 8], - cv_stack: [[u32; 8]; 54], // Space for 54 subtree chaining values: - cv_stack_len: u8, // 2^54 * CHUNK_LEN = 2^64 - flags: u32, -} - -impl Hasher { - fn new_internal(key_words: [u32; 8], flags: u32) -> Self { - Self { - chunk_state: ChunkState::new(key_words, 0, flags), - key_words, - cv_stack: [[0; 8]; 54], - cv_stack_len: 0, - flags, - } - } - - /// Construct a new `Hasher` for the regular hash function. - pub fn new() -> Self { - Self::new_internal(IV, 0) - } - - /// Construct a new `Hasher` for the keyed hash function. - pub fn new_keyed(key: &[u8; KEY_LEN]) -> Self { - let mut key_words = [0; 8]; - words_from_little_endian_bytes(key, &mut key_words); - Self::new_internal(key_words, KEYED_HASH) - } - - /// Construct a new `Hasher` for the key derivation function. The context - /// string should be hardcoded, globally unique, and application-specific. - pub fn new_derive_key(context: &str) -> Self { - let mut context_hasher = Self::new_internal(IV, DERIVE_KEY_CONTEXT); - context_hasher.update(context.as_bytes()); - let mut context_key = [0; KEY_LEN]; - context_hasher.finalize(&mut context_key); - let mut context_key_words = [0; 8]; - words_from_little_endian_bytes(&context_key, &mut context_key_words); - Self::new_internal(context_key_words, DERIVE_KEY_MATERIAL) - } - - fn push_stack(&mut self, cv: [u32; 8]) { - self.cv_stack[self.cv_stack_len as usize] = cv; - self.cv_stack_len += 1; - } - - fn pop_stack(&mut self) -> [u32; 8] { - self.cv_stack_len -= 1; - self.cv_stack[self.cv_stack_len as usize] - } - - // Section 5.1.2 of the BLAKE3 spec explains this algorithm in more detail. 
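// A tiny standalone model of the merge rule that add_chunk_chaining_value()
// below implements. This is an illustrative sketch, not part of the original
// file: u32 placeholders stand in for chaining values and XOR stands in for
// parent_cv(); only the stack arithmetic is faithful.
fn sketch_add_chunk(stack: &mut Vec<u32>, mut new_cv: u32, mut total_chunks: u64) {
    // The number of completed subtrees equals the number of trailing 0-bits
    // in the new total number of chunks, and each one costs one merge.
    while total_chunks & 1 == 0 {
        let left = stack.pop().unwrap();
        new_cv ^= left; // stand-in for parent_cv(left, new_cv, ..)
        total_chunks >>= 1;
    }
    stack.push(new_cv);
}

#[test]
fn sketch_merges_match_trailing_zeros() {
    let mut stack = Vec::new();
    for chunk in 1..=8u64 {
        let before = stack.len();
        sketch_add_chunk(&mut stack, chunk as u32, chunk);
        // Each merge pops one entry, then the result is pushed back.
        assert_eq!(stack.len(), before - chunk.trailing_zeros() as usize + 1);
    }
    // Eight chunks form one complete subtree, so one chaining value remains.
    assert_eq!(stack.len(), 1);
}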
- fn add_chunk_chaining_value(&mut self, mut new_cv: [u32; 8], mut total_chunks: u64) { - // This chunk might complete some subtrees. For each completed subtree, - // its left child will be the current top entry in the CV stack, and - // its right child will be the current value of `new_cv`. Pop each left - // child off the stack, merge it with `new_cv`, and overwrite `new_cv` - // with the result. After all these merges, push the final value of - // `new_cv` onto the stack. The number of completed subtrees is given - // by the number of trailing 0-bits in the new total number of chunks. - while total_chunks & 1 == 0 { - new_cv = parent_cv(self.pop_stack(), new_cv, self.key_words, self.flags); - total_chunks >>= 1; - } - self.push_stack(new_cv); - } - - /// Add input to the hash state. This can be called any number of times. - pub fn update(&mut self, mut input: &[u8]) { - while !input.is_empty() { - // If the current chunk is complete, finalize it and reset the - // chunk state. More input is coming, so this chunk is not ROOT. - if self.chunk_state.len() == CHUNK_LEN { - let chunk_cv = self.chunk_state.output().chaining_value(); - let total_chunks = self.chunk_state.chunk_counter + 1; - self.add_chunk_chaining_value(chunk_cv, total_chunks); - self.chunk_state = ChunkState::new(self.key_words, total_chunks, self.flags); - } - - // Compress input bytes into the current chunk state. - let want = CHUNK_LEN - self.chunk_state.len(); - let take = min(want, input.len()); - self.chunk_state.update(&input[..take]); - input = &input[take..]; - } - } - - /// Finalize the hash and write any number of output bytes. - pub fn finalize(&self, out_slice: &mut [u8]) { - // Starting with the Output from the current chunk, compute all the - // parent chaining values along the right edge of the tree, until we - // have the root Output. - let mut output = self.chunk_state.output(); - let mut parent_nodes_remaining = self.cv_stack_len as usize; - while parent_nodes_remaining > 0 { - parent_nodes_remaining -= 1; - output = parent_output( - self.cv_stack[parent_nodes_remaining], - output.chaining_value(), - self.key_words, - self.flags, - ); - } - output.root_output_bytes(out_slice); - } -} diff --git a/narcissus-core/src/blake3/rust_avx2.rs b/narcissus-core/src/blake3/rust_avx2.rs deleted file mode 100644 index 3826b37..0000000 --- a/narcissus-core/src/blake3/rust_avx2.rs +++ /dev/null @@ -1,477 +0,0 @@ -#[cfg(target_arch = "x86")] -use core::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use core::arch::x86_64::*; - -use crate::slice::{array_chunks_mut, split_array_mut}; - -use super::{ - counter_high, counter_low, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, OUT_LEN, -}; - -pub const DEGREE: usize = 8; - -#[inline(always)] -unsafe fn loadu(src: *const u8) -> __m256i { - // This is an unaligned load, so the pointer cast is allowed. - _mm256_loadu_si256(src as *const __m256i) -} - -#[inline(always)] -unsafe fn storeu(src: __m256i, dest: *mut u8) { - // This is an unaligned store, so the pointer cast is allowed. 
- _mm256_storeu_si256(dest as *mut __m256i, src) -} - -#[inline(always)] -unsafe fn add(a: __m256i, b: __m256i) -> __m256i { - _mm256_add_epi32(a, b) -} - -#[inline(always)] -unsafe fn xor(a: __m256i, b: __m256i) -> __m256i { - _mm256_xor_si256(a, b) -} - -#[inline(always)] -unsafe fn set1(x: u32) -> __m256i { - _mm256_set1_epi32(x as i32) -} - -#[inline(always)] -unsafe fn set8(a: u32, b: u32, c: u32, d: u32, e: u32, f: u32, g: u32, h: u32) -> __m256i { - _mm256_setr_epi32( - a as i32, b as i32, c as i32, d as i32, e as i32, f as i32, g as i32, h as i32, - ) -} - -// These rotations are the "simple/shifts version". For the -// "complicated/shuffles version", see -// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. -// For a discussion of the tradeoffs, see -// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug -// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better -// on recent x86 chips. - -#[inline(always)] -unsafe fn rot16(x: __m256i) -> __m256i { - _mm256_or_si256(_mm256_srli_epi32(x, 16), _mm256_slli_epi32(x, 32 - 16)) -} - -#[inline(always)] -unsafe fn rot12(x: __m256i) -> __m256i { - _mm256_or_si256(_mm256_srli_epi32(x, 12), _mm256_slli_epi32(x, 32 - 12)) -} - -#[inline(always)] -unsafe fn rot8(x: __m256i) -> __m256i { - _mm256_or_si256(_mm256_srli_epi32(x, 8), _mm256_slli_epi32(x, 32 - 8)) -} - -#[inline(always)] -unsafe fn rot7(x: __m256i) -> __m256i { - _mm256_or_si256(_mm256_srli_epi32(x, 7), _mm256_slli_epi32(x, 32 - 7)) -} - -#[inline(always)] -unsafe fn round(v: &mut [__m256i; 16], m: &[__m256i; 16], r: usize) { - v[0] = add(v[0], m[MSG_SCHEDULE[r][0]]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][2]]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][4]]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][6]]); - v[0] = add(v[0], v[4]); - v[1] = add(v[1], v[5]); - v[2] = add(v[2], v[6]); - v[3] = add(v[3], v[7]); - v[12] = xor(v[12], v[0]); - v[13] = xor(v[13], v[1]); - v[14] = xor(v[14], v[2]); - v[15] = xor(v[15], v[3]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[15] = rot16(v[15]); - v[8] = add(v[8], v[12]); - v[9] = add(v[9], v[13]); - v[10] = add(v[10], v[14]); - v[11] = add(v[11], v[15]); - v[4] = xor(v[4], v[8]); - v[5] = xor(v[5], v[9]); - v[6] = xor(v[6], v[10]); - v[7] = xor(v[7], v[11]); - v[4] = rot12(v[4]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[0] = add(v[0], m[MSG_SCHEDULE[r][1]]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][3]]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][5]]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][7]]); - v[0] = add(v[0], v[4]); - v[1] = add(v[1], v[5]); - v[2] = add(v[2], v[6]); - v[3] = add(v[3], v[7]); - v[12] = xor(v[12], v[0]); - v[13] = xor(v[13], v[1]); - v[14] = xor(v[14], v[2]); - v[15] = xor(v[15], v[3]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[15] = rot8(v[15]); - v[8] = add(v[8], v[12]); - v[9] = add(v[9], v[13]); - v[10] = add(v[10], v[14]); - v[11] = add(v[11], v[15]); - v[4] = xor(v[4], v[8]); - v[5] = xor(v[5], v[9]); - v[6] = xor(v[6], v[10]); - v[7] = xor(v[7], v[11]); - v[4] = rot7(v[4]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - - v[0] = add(v[0], m[MSG_SCHEDULE[r][8]]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][10]]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][12]]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][14]]); - v[0] = add(v[0], v[5]); - v[1] = add(v[1], v[6]); - v[2] = add(v[2], v[7]); - v[3] = add(v[3], v[4]); - v[15] = xor(v[15], v[0]); - v[12] = xor(v[12], 
v[1]); - v[13] = xor(v[13], v[2]); - v[14] = xor(v[14], v[3]); - v[15] = rot16(v[15]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[10] = add(v[10], v[15]); - v[11] = add(v[11], v[12]); - v[8] = add(v[8], v[13]); - v[9] = add(v[9], v[14]); - v[5] = xor(v[5], v[10]); - v[6] = xor(v[6], v[11]); - v[7] = xor(v[7], v[8]); - v[4] = xor(v[4], v[9]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[4] = rot12(v[4]); - v[0] = add(v[0], m[MSG_SCHEDULE[r][9]]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][11]]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][13]]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][15]]); - v[0] = add(v[0], v[5]); - v[1] = add(v[1], v[6]); - v[2] = add(v[2], v[7]); - v[3] = add(v[3], v[4]); - v[15] = xor(v[15], v[0]); - v[12] = xor(v[12], v[1]); - v[13] = xor(v[13], v[2]); - v[14] = xor(v[14], v[3]); - v[15] = rot8(v[15]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[10] = add(v[10], v[15]); - v[11] = add(v[11], v[12]); - v[8] = add(v[8], v[13]); - v[9] = add(v[9], v[14]); - v[5] = xor(v[5], v[10]); - v[6] = xor(v[6], v[11]); - v[7] = xor(v[7], v[8]); - v[4] = xor(v[4], v[9]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - v[4] = rot7(v[4]); -} - -#[inline(always)] -unsafe fn interleave128(a: __m256i, b: __m256i) -> (__m256i, __m256i) { - ( - _mm256_permute2x128_si256(a, b, 0x20), - _mm256_permute2x128_si256(a, b, 0x31), - ) -} - -// There are several ways to do a transposition. We could do it naively, with 8 separate -// _mm256_set_epi32 instructions, referencing each of the 32 words explicitly. Or we could copy -// the vecs into contiguous storage and then use gather instructions. This third approach is to use -// a series of unpack instructions to interleave the vectors. In my benchmarks, interleaving is the -// fastest approach. To test this, run `cargo +nightly bench --bench libtest load_8` in the -// https://github.com/oconnor663/bao_experiments repo. -#[inline(always)] -unsafe fn transpose_vecs(vecs: &mut [__m256i; DEGREE]) { - // Interleave 32-bit lanes. The low unpack is lanes 00/11/44/55, and the high is 22/33/66/77. - let ab_0145 = _mm256_unpacklo_epi32(vecs[0], vecs[1]); - let ab_2367 = _mm256_unpackhi_epi32(vecs[0], vecs[1]); - let cd_0145 = _mm256_unpacklo_epi32(vecs[2], vecs[3]); - let cd_2367 = _mm256_unpackhi_epi32(vecs[2], vecs[3]); - let ef_0145 = _mm256_unpacklo_epi32(vecs[4], vecs[5]); - let ef_2367 = _mm256_unpackhi_epi32(vecs[4], vecs[5]); - let gh_0145 = _mm256_unpacklo_epi32(vecs[6], vecs[7]); - let gh_2367 = _mm256_unpackhi_epi32(vecs[6], vecs[7]); - - // Interleave 64-bit lates. The low unpack is lanes 00/22 and the high is 11/33. - let abcd_04 = _mm256_unpacklo_epi64(ab_0145, cd_0145); - let abcd_15 = _mm256_unpackhi_epi64(ab_0145, cd_0145); - let abcd_26 = _mm256_unpacklo_epi64(ab_2367, cd_2367); - let abcd_37 = _mm256_unpackhi_epi64(ab_2367, cd_2367); - let efgh_04 = _mm256_unpacklo_epi64(ef_0145, gh_0145); - let efgh_15 = _mm256_unpackhi_epi64(ef_0145, gh_0145); - let efgh_26 = _mm256_unpacklo_epi64(ef_2367, gh_2367); - let efgh_37 = _mm256_unpackhi_epi64(ef_2367, gh_2367); - - // Interleave 128-bit lanes. 
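    // Editorial note: once this final interleave completes, vecs[i] holds the
    // i-th 32-bit word from each of the eight original vectors (an 8x8
    // transpose). hash8() below relies on this layout so that a single SIMD
    // round() call mixes the same word position across eight independent
    // chunk states at once.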
- let (abcdefgh_0, abcdefgh_4) = interleave128(abcd_04, efgh_04); - let (abcdefgh_1, abcdefgh_5) = interleave128(abcd_15, efgh_15); - let (abcdefgh_2, abcdefgh_6) = interleave128(abcd_26, efgh_26); - let (abcdefgh_3, abcdefgh_7) = interleave128(abcd_37, efgh_37); - - vecs[0] = abcdefgh_0; - vecs[1] = abcdefgh_1; - vecs[2] = abcdefgh_2; - vecs[3] = abcdefgh_3; - vecs[4] = abcdefgh_4; - vecs[5] = abcdefgh_5; - vecs[6] = abcdefgh_6; - vecs[7] = abcdefgh_7; -} - -#[inline(always)] -unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m256i; 16] { - let mut vecs = [ - loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[4].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[5].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[6].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[7].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[4].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[5].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[6].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[7].add(block_offset + 1 * 4 * DEGREE)), - ]; - for prefetch in inputs.iter().take(DEGREE) { - _mm_prefetch(prefetch.add(block_offset + 256) as *const i8, _MM_HINT_T0); - } - for square in array_chunks_mut(&mut vecs) { - transpose_vecs(square); - } - vecs -} - -#[inline(always)] -unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m256i, __m256i) { - let mask = if increment_counter.yes() { !0 } else { 0 }; - ( - set8( - counter_low(counter + (mask & 0)), - counter_low(counter + (mask & 1)), - counter_low(counter + (mask & 2)), - counter_low(counter + (mask & 3)), - counter_low(counter + (mask & 4)), - counter_low(counter + (mask & 5)), - counter_low(counter + (mask & 6)), - counter_low(counter + (mask & 7)), - ), - set8( - counter_high(counter + (mask & 0)), - counter_high(counter + (mask & 1)), - counter_high(counter + (mask & 2)), - counter_high(counter + (mask & 3)), - counter_high(counter + (mask & 4)), - counter_high(counter + (mask & 5)), - counter_high(counter + (mask & 6)), - counter_high(counter + (mask & 7)), - ), - ) -} - -#[target_feature(enable = "avx2")] -pub unsafe fn hash8( - inputs: &[*const u8; DEGREE], - blocks: usize, - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8; DEGREE * OUT_LEN], -) { - let mut h_vecs = [ - set1(key[0]), - set1(key[1]), - set1(key[2]), - set1(key[3]), - set1(key[4]), - set1(key[5]), - set1(key[6]), - set1(key[7]), - ]; - let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); - let mut block_flags = flags | flags_start; - - for block in 0..blocks { - if block + 1 == blocks { - block_flags |= flags_end; - } - let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only - let block_flags_vec = set1(block_flags as u32); - let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); - - // The transposed compression function. Note that inlining this - // manually here improves compile times by a lot, compared to factoring - // it out into its own function and making it #[inline(always)]. 
Just - // guessing, it might have something to do with loop unrolling. - let mut v = [ - h_vecs[0], - h_vecs[1], - h_vecs[2], - h_vecs[3], - h_vecs[4], - h_vecs[5], - h_vecs[6], - h_vecs[7], - set1(IV[0]), - set1(IV[1]), - set1(IV[2]), - set1(IV[3]), - counter_low_vec, - counter_high_vec, - block_len_vec, - block_flags_vec, - ]; - round(&mut v, &msg_vecs, 0); - round(&mut v, &msg_vecs, 1); - round(&mut v, &msg_vecs, 2); - round(&mut v, &msg_vecs, 3); - round(&mut v, &msg_vecs, 4); - round(&mut v, &msg_vecs, 5); - round(&mut v, &msg_vecs, 6); - h_vecs[0] = xor(v[0], v[8]); - h_vecs[1] = xor(v[1], v[9]); - h_vecs[2] = xor(v[2], v[10]); - h_vecs[3] = xor(v[3], v[11]); - h_vecs[4] = xor(v[4], v[12]); - h_vecs[5] = xor(v[5], v[13]); - h_vecs[6] = xor(v[6], v[14]); - h_vecs[7] = xor(v[7], v[15]); - - block_flags = flags; - } - - transpose_vecs(&mut h_vecs); - storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); - storeu(h_vecs[1], out.as_mut_ptr().add(1 * 4 * DEGREE)); - storeu(h_vecs[2], out.as_mut_ptr().add(2 * 4 * DEGREE)); - storeu(h_vecs[3], out.as_mut_ptr().add(3 * 4 * DEGREE)); - storeu(h_vecs[4], out.as_mut_ptr().add(4 * 4 * DEGREE)); - storeu(h_vecs[5], out.as_mut_ptr().add(5 * 4 * DEGREE)); - storeu(h_vecs[6], out.as_mut_ptr().add(6 * 4 * DEGREE)); - storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); -} - -#[target_feature(enable = "avx2")] -pub unsafe fn hash_many( - mut inputs: &[&[u8; N]], - key: &CVWords, - mut counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - mut output: &mut [u8], -) { - debug_assert!(output.len() >= inputs.len() * OUT_LEN, "out too short"); - while inputs.len() >= DEGREE && output.len() >= DEGREE * OUT_LEN { - // Safe because the layout of arrays is guaranteed, and because the - // `blocks` count is determined statically from the argument type. - let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); - let blocks = N / BLOCK_LEN; - let (out, rem) = split_array_mut(output); - hash8( - input_ptrs, - blocks, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - out, - ); - if increment_counter.yes() { - counter += DEGREE as u64; - } - inputs = &inputs[DEGREE..]; - output = rem; - } - super::sse41::hash_many( - inputs, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - output, - ); -} - -#[cfg(test)] -mod test { - use super::*; - use crate::blake3::{avx2::transpose_vecs, platform, test::test_hash_many_fn}; - - #[test] - fn test_transpose() { - if !platform::avx2_detected() { - return; - } - - #[target_feature(enable = "avx2")] - unsafe fn transpose_wrapper(vecs: &mut [__m256i; DEGREE]) { - transpose_vecs(vecs); - } - - let mut matrix = [[0 as u32; DEGREE]; DEGREE]; - for i in 0..DEGREE { - for j in 0..DEGREE { - matrix[i][j] = (i * DEGREE + j) as u32; - } - } - - unsafe { - let mut vecs: [__m256i; DEGREE] = core::mem::transmute(matrix); - transpose_wrapper(&mut vecs); - matrix = core::mem::transmute(vecs); - } - - for i in 0..DEGREE { - for j in 0..DEGREE { - // Reversed indexes from above. 
- assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); - } - } - } - - #[test] - fn test_hash_many() { - if !platform::avx2_detected() { - return; - } - test_hash_many_fn(hash_many, hash_many); - } -} diff --git a/narcissus-core/src/blake3/rust_sse2.rs b/narcissus-core/src/blake3/rust_sse2.rs deleted file mode 100644 index a49fa1b..0000000 --- a/narcissus-core/src/blake3/rust_sse2.rs +++ /dev/null @@ -1,774 +0,0 @@ -#[cfg(target_arch = "x86")] -use core::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use core::arch::x86_64::*; - -use crate::slice::{array_chunks_mut, split_array_mut}; - -use super::{ - counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, - OUT_LEN, -}; - -pub const DEGREE: usize = 4; - -#[inline(always)] -unsafe fn loadu(src: *const u8) -> __m128i { - // This is an unaligned load, so the pointer cast is allowed. - _mm_loadu_si128(src as *const __m128i) -} - -#[inline(always)] -unsafe fn storeu(src: __m128i, dest: *mut u8) { - // This is an unaligned store, so the pointer cast is allowed. - _mm_storeu_si128(dest as *mut __m128i, src) -} - -#[inline(always)] -unsafe fn add(a: __m128i, b: __m128i) -> __m128i { - _mm_add_epi32(a, b) -} - -#[inline(always)] -unsafe fn xor(a: __m128i, b: __m128i) -> __m128i { - _mm_xor_si128(a, b) -} - -#[inline(always)] -unsafe fn set1(x: u32) -> __m128i { - _mm_set1_epi32(x as i32) -} - -#[inline(always)] -unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i { - _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32) -} - -// These rotations are the "simple/shifts version". For the -// "complicated/shuffles version", see -// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. -// For a discussion of the tradeoffs, see -// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug -// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better -// on recent x86 chips. - -#[inline(always)] -unsafe fn rot16(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16)) -} - -#[inline(always)] -unsafe fn rot12(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12)) -} - -#[inline(always)] -unsafe fn rot8(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8)) -} - -#[inline(always)] -unsafe fn rot7(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7)) -} - -#[inline(always)] -unsafe fn g1( - row0: &mut __m128i, - row1: &mut __m128i, - row2: &mut __m128i, - row3: &mut __m128i, - m: __m128i, -) { - *row0 = add(add(*row0, m), *row1); - *row3 = xor(*row3, *row0); - *row3 = rot16(*row3); - *row2 = add(*row2, *row3); - *row1 = xor(*row1, *row2); - *row1 = rot12(*row1); -} - -#[inline(always)] -unsafe fn g2( - row0: &mut __m128i, - row1: &mut __m128i, - row2: &mut __m128i, - row3: &mut __m128i, - m: __m128i, -) { - *row0 = add(add(*row0, m), *row1); - *row3 = xor(*row3, *row0); - *row3 = rot8(*row3); - *row2 = add(*row2, *row3); - *row1 = xor(*row1, *row2); - *row1 = rot7(*row1); -} - -// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479. -macro_rules! _MM_SHUFFLE { - ($z:expr, $y:expr, $x:expr, $w:expr) => { - ($z << 6) | ($y << 4) | ($x << 2) | $w - }; -} - -macro_rules! 
shuffle2 { - ($a:expr, $b:expr, $c:expr) => { - _mm_castps_si128(_mm_shuffle_ps( - _mm_castsi128_ps($a), - _mm_castsi128_ps($b), - $c, - )) - }; -} - -// Note the optimization here of leaving row1 as the unrotated row, rather than -// row0. All the message loads below are adjusted to compensate for this. See -// discussion at https://github.com/sneves/blake2-avx2/pull/4 -#[inline(always)] -unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1)); -} - -#[inline(always)] -unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3)); -} - -#[inline(always)] -unsafe fn blend_epi16(a: __m128i, b: __m128i, imm8: i32) -> __m128i { - let bits = _mm_set_epi16(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); - let mut mask = _mm_set1_epi16(imm8 as i16); - mask = _mm_and_si128(mask, bits); - mask = _mm_cmpeq_epi16(mask, bits); - _mm_or_si128(_mm_and_si128(mask, b), _mm_andnot_si128(mask, a)) -} - -#[inline(always)] -unsafe fn compress_pre( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [__m128i; 4] { - let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8); - let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8); - let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]); - let row3 = &mut set4( - counter_low(counter), - counter_high(counter), - block_len as u32, - flags as u32, - ); - - let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE)); - let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE)); - let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE)); - let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE)); - - let mut t0; - let mut t1; - let mut t2; - let mut t3; - let mut tt; - - // Round 1. The first round permutes the message words from the original - // input order, into the groups that get mixed in parallel. - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0 - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1 - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8 - t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14 - g1(row0, row1, row2, row3, t2); - t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9 - t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15 - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 2. This round and all following rounds apply a fixed permutation - // to the message words from the round before. 
- t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 3 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 4 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 5 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 6 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, 
row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 7 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - - [*row0, *row1, *row2, *row3] -} - -#[target_feature(enable = "sse2")] -pub unsafe fn compress_in_place( - cv: &mut CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) { - let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags); - storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8); - storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8); -} - -#[target_feature(enable = "sse2")] -pub unsafe fn compress_xof( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [u8; 64] { - let [mut row0, mut row1, mut row2, mut row3] = - compress_pre(cv, block, block_len, counter, flags); - row0 = xor(row0, row2); - row1 = xor(row1, row3); - row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8)); - row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8)); - core::mem::transmute([row0, row1, row2, row3]) -} - -#[inline(always)] -unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) { - v[0] = add(v[0], m[MSG_SCHEDULE[r][0]]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][2]]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][4]]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][6]]); - v[0] = add(v[0], v[4]); - v[1] = add(v[1], v[5]); - v[2] = add(v[2], v[6]); - v[3] = add(v[3], v[7]); - v[12] = xor(v[12], v[0]); - v[13] = xor(v[13], v[1]); - v[14] = xor(v[14], v[2]); - v[15] = xor(v[15], v[3]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[15] = rot16(v[15]); - v[8] = add(v[8], v[12]); - v[9] = add(v[9], v[13]); - v[10] = add(v[10], v[14]); - v[11] = add(v[11], v[15]); - v[4] = xor(v[4], v[8]); - v[5] = xor(v[5], v[9]); - v[6] = xor(v[6], v[10]); - v[7] = xor(v[7], v[11]); - v[4] = rot12(v[4]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[0] = add(v[0], m[MSG_SCHEDULE[r][1]]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][3]]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][5]]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][7]]); - v[0] = add(v[0], v[4]); - v[1] = add(v[1], v[5]); - v[2] = add(v[2], v[6]); - v[3] = add(v[3], v[7]); - v[12] = xor(v[12], v[0]); - v[13] = xor(v[13], v[1]); - v[14] = xor(v[14], v[2]); - v[15] = xor(v[15], v[3]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[15] = rot8(v[15]); - v[8] = add(v[8], v[12]); - v[9] = add(v[9], v[13]); - v[10] = add(v[10], v[14]); - v[11] = add(v[11], v[15]); - v[4] = xor(v[4], v[8]); - v[5] = xor(v[5], v[9]); - v[6] = xor(v[6], v[10]); - v[7] = xor(v[7], v[11]); - v[4] = rot7(v[4]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - 
- v[0] = add(v[0], m[MSG_SCHEDULE[r][8]]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][10]]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][12]]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][14]]); - v[0] = add(v[0], v[5]); - v[1] = add(v[1], v[6]); - v[2] = add(v[2], v[7]); - v[3] = add(v[3], v[4]); - v[15] = xor(v[15], v[0]); - v[12] = xor(v[12], v[1]); - v[13] = xor(v[13], v[2]); - v[14] = xor(v[14], v[3]); - v[15] = rot16(v[15]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[10] = add(v[10], v[15]); - v[11] = add(v[11], v[12]); - v[8] = add(v[8], v[13]); - v[9] = add(v[9], v[14]); - v[5] = xor(v[5], v[10]); - v[6] = xor(v[6], v[11]); - v[7] = xor(v[7], v[8]); - v[4] = xor(v[4], v[9]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[4] = rot12(v[4]); - v[0] = add(v[0], m[MSG_SCHEDULE[r][9]]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][11]]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][13]]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][15]]); - v[0] = add(v[0], v[5]); - v[1] = add(v[1], v[6]); - v[2] = add(v[2], v[7]); - v[3] = add(v[3], v[4]); - v[15] = xor(v[15], v[0]); - v[12] = xor(v[12], v[1]); - v[13] = xor(v[13], v[2]); - v[14] = xor(v[14], v[3]); - v[15] = rot8(v[15]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[10] = add(v[10], v[15]); - v[11] = add(v[11], v[12]); - v[8] = add(v[8], v[13]); - v[9] = add(v[9], v[14]); - v[5] = xor(v[5], v[10]); - v[6] = xor(v[6], v[11]); - v[7] = xor(v[7], v[8]); - v[4] = xor(v[4], v[9]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - v[4] = rot7(v[4]); -} - -#[inline(always)] -unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) { - // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is - // 22/33. Note that this doesn't split the vector into two lanes, as the - // AVX2 counterparts do. - let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); - let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); - let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); - let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); - - // Interleave 64-bit lanes. 
- let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); - let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); - let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); - let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); - - vecs[0] = abcd_0; - vecs[1] = abcd_1; - vecs[2] = abcd_2; - vecs[3] = abcd_3; -} - -#[inline(always)] -unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] { - let mut vecs = [ - loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)), - ]; - for prefetch in inputs.iter().take(DEGREE) { - _mm_prefetch(prefetch.add(block_offset + 256) as *const i8, _MM_HINT_T0); - } - for square in array_chunks_mut(&mut vecs) { - transpose_vecs(square); - } - vecs -} - -#[inline(always)] -unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) { - let mask = if increment_counter.yes() { !0 } else { 0 }; - ( - set4( - counter_low(counter + (mask & 0)), - counter_low(counter + (mask & 1)), - counter_low(counter + (mask & 2)), - counter_low(counter + (mask & 3)), - ), - set4( - counter_high(counter + (mask & 0)), - counter_high(counter + (mask & 1)), - counter_high(counter + (mask & 2)), - counter_high(counter + (mask & 3)), - ), - ) -} - -#[target_feature(enable = "sse2")] -pub unsafe fn hash4( - inputs: &[*const u8; DEGREE], - blocks: usize, - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8; DEGREE * OUT_LEN], -) { - let mut h_vecs = [ - set1(key[0]), - set1(key[1]), - set1(key[2]), - set1(key[3]), - set1(key[4]), - set1(key[5]), - set1(key[6]), - set1(key[7]), - ]; - let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); - let mut block_flags = flags | flags_start; - - for block in 0..blocks { - if block + 1 == blocks { - block_flags |= flags_end; - } - let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only - let block_flags_vec = set1(block_flags as u32); - let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); - - // The transposed compression function. Note that inlining this - // manually here improves compile times by a lot, compared to factoring - // it out into its own function and making it #[inline(always)]. Just - // guessing, it might have something to do with loop unrolling. 
- let mut v = [ - h_vecs[0], - h_vecs[1], - h_vecs[2], - h_vecs[3], - h_vecs[4], - h_vecs[5], - h_vecs[6], - h_vecs[7], - set1(IV[0]), - set1(IV[1]), - set1(IV[2]), - set1(IV[3]), - counter_low_vec, - counter_high_vec, - block_len_vec, - block_flags_vec, - ]; - round(&mut v, &msg_vecs, 0); - round(&mut v, &msg_vecs, 1); - round(&mut v, &msg_vecs, 2); - round(&mut v, &msg_vecs, 3); - round(&mut v, &msg_vecs, 4); - round(&mut v, &msg_vecs, 5); - round(&mut v, &msg_vecs, 6); - h_vecs[0] = xor(v[0], v[8]); - h_vecs[1] = xor(v[1], v[9]); - h_vecs[2] = xor(v[2], v[10]); - h_vecs[3] = xor(v[3], v[11]); - h_vecs[4] = xor(v[4], v[12]); - h_vecs[5] = xor(v[5], v[13]); - h_vecs[6] = xor(v[6], v[14]); - h_vecs[7] = xor(v[7], v[15]); - - block_flags = flags; - } - - for square in array_chunks_mut(&mut h_vecs) { - transpose_vecs(square); - } - - // The first four vecs now contain the first half of each output, and the - // second four vecs contain the second half of each output. - storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); - storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE)); - storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE)); - storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE)); - storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE)); - storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE)); - storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE)); - storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); -} - -#[target_feature(enable = "sse2")] -unsafe fn hash1( - input: &[u8; N], - key: &CVWords, - counter: u64, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut CVBytes, -) { - debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks"); - let mut cv = *key; - let mut block_flags = flags | flags_start; - let mut slice = &input[..]; - while slice.len() >= BLOCK_LEN { - if slice.len() == BLOCK_LEN { - block_flags |= flags_end; - } - compress_in_place( - &mut cv, - slice[0..BLOCK_LEN].try_into().unwrap(), - BLOCK_LEN as u8, - counter, - block_flags, - ); - block_flags = flags; - slice = &slice[BLOCK_LEN..]; - } - *out = core::mem::transmute(cv); // x86 is little-endian -} - -#[target_feature(enable = "sse2")] -pub unsafe fn hash_many( - mut inputs: &[&[u8; N]], - key: &CVWords, - mut counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - mut output: &mut [u8], -) { - debug_assert!(output.len() >= inputs.len() * OUT_LEN, "out too short"); - while inputs.len() >= DEGREE && output.len() >= DEGREE * OUT_LEN { - // Safe because the layout of arrays is guaranteed, and because the - // `blocks` count is determined statically from the argument type. 
- let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); - let blocks = N / BLOCK_LEN; - - let (out, rem) = split_array_mut(output); - hash4( - input_ptrs, - blocks, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - out, - ); - if increment_counter.yes() { - counter += DEGREE as u64; - } - inputs = &inputs[DEGREE..]; - output = rem; - } - for (&input, output) in inputs.iter().zip(array_chunks_mut(output)) { - hash1(input, key, counter, flags, flags_start, flags_end, output); - if increment_counter.yes() { - counter += 1; - } - } -} - -#[cfg(test)] -mod test { - use crate::blake3::{ - platform, - test::{test_compress_fn, test_hash_many_fn}, - }; - - use super::*; - - #[test] - fn test_transpose() { - if !platform::sse2_detected() { - return; - } - - #[target_feature(enable = "sse2")] - unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) { - transpose_vecs(vecs); - } - - let mut matrix = [[0 as u32; DEGREE]; DEGREE]; - for i in 0..DEGREE { - for j in 0..DEGREE { - matrix[i][j] = (i * DEGREE + j) as u32; - } - } - - unsafe { - let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix); - transpose_wrapper(&mut vecs); - matrix = core::mem::transmute(vecs); - } - - for i in 0..DEGREE { - for j in 0..DEGREE { - // Reversed indexes from above. - assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); - } - } - } - - #[test] - fn test_compress() { - if !platform::sse2_detected() { - return; - } - test_compress_fn(compress_in_place, compress_xof); - } - - #[test] - fn test_hash_many() { - if !platform::sse2_detected() { - return; - } - test_hash_many_fn(hash_many, hash_many); - } -} diff --git a/narcissus-core/src/blake3/rust_sse41.rs b/narcissus-core/src/blake3/rust_sse41.rs deleted file mode 100644 index 6da3aee..0000000 --- a/narcissus-core/src/blake3/rust_sse41.rs +++ /dev/null @@ -1,762 +0,0 @@ -#[cfg(target_arch = "x86")] -use core::arch::x86::*; -#[cfg(target_arch = "x86_64")] -use core::arch::x86_64::*; - -use crate::slice::{array_chunks_mut, split_array_mut}; - -use super::{ - counter_high, counter_low, CVBytes, CVWords, IncrementCounter, BLOCK_LEN, IV, MSG_SCHEDULE, - OUT_LEN, -}; - -pub const DEGREE: usize = 4; - -#[inline(always)] -unsafe fn loadu(src: *const u8) -> __m128i { - // This is an unaligned load, so the pointer cast is allowed. - _mm_loadu_si128(src as *const __m128i) -} - -#[inline(always)] -unsafe fn storeu(src: __m128i, dest: *mut u8) { - // This is an unaligned store, so the pointer cast is allowed. - _mm_storeu_si128(dest as *mut __m128i, src) -} - -#[inline(always)] -unsafe fn add(a: __m128i, b: __m128i) -> __m128i { - _mm_add_epi32(a, b) -} - -#[inline(always)] -unsafe fn xor(a: __m128i, b: __m128i) -> __m128i { - _mm_xor_si128(a, b) -} - -#[inline(always)] -unsafe fn set1(x: u32) -> __m128i { - _mm_set1_epi32(x as i32) -} - -#[inline(always)] -unsafe fn set4(a: u32, b: u32, c: u32, d: u32) -> __m128i { - _mm_setr_epi32(a as i32, b as i32, c as i32, d as i32) -} - -// These rotations are the "simple/shifts version". For the -// "complicated/shuffles version", see -// https://github.com/sneves/blake2-avx2/blob/b3723921f668df09ece52dcd225a36d4a4eea1d9/blake2s-common.h#L63-L66. -// For a discussion of the tradeoffs, see -// https://github.com/sneves/blake2-avx2/pull/5. Due to an LLVM bug -// (https://bugs.llvm.org/show_bug.cgi?id=44379), this version performs better -// on recent x86 chips. 
- -#[inline(always)] -unsafe fn rot16(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 16), _mm_slli_epi32(a, 32 - 16)) -} - -#[inline(always)] -unsafe fn rot12(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 12), _mm_slli_epi32(a, 32 - 12)) -} - -#[inline(always)] -unsafe fn rot8(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 8), _mm_slli_epi32(a, 32 - 8)) -} - -#[inline(always)] -unsafe fn rot7(a: __m128i) -> __m128i { - _mm_or_si128(_mm_srli_epi32(a, 7), _mm_slli_epi32(a, 32 - 7)) -} - -#[inline(always)] -unsafe fn g1( - row0: &mut __m128i, - row1: &mut __m128i, - row2: &mut __m128i, - row3: &mut __m128i, - m: __m128i, -) { - *row0 = add(add(*row0, m), *row1); - *row3 = xor(*row3, *row0); - *row3 = rot16(*row3); - *row2 = add(*row2, *row3); - *row1 = xor(*row1, *row2); - *row1 = rot12(*row1); -} - -#[inline(always)] -unsafe fn g2( - row0: &mut __m128i, - row1: &mut __m128i, - row2: &mut __m128i, - row3: &mut __m128i, - m: __m128i, -) { - *row0 = add(add(*row0, m), *row1); - *row3 = xor(*row3, *row0); - *row3 = rot8(*row3); - *row2 = add(*row2, *row3); - *row1 = xor(*row1, *row2); - *row1 = rot7(*row1); -} - -// Adapted from https://github.com/rust-lang-nursery/stdsimd/pull/479. -macro_rules! _MM_SHUFFLE { - ($z:expr, $y:expr, $x:expr, $w:expr) => { - ($z << 6) | ($y << 4) | ($x << 2) | $w - }; -} - -macro_rules! shuffle2 { - ($a:expr, $b:expr, $c:expr) => { - _mm_castps_si128(_mm_shuffle_ps( - _mm_castsi128_ps($a), - _mm_castsi128_ps($b), - $c, - )) - }; -} - -// Note the optimization here of leaving row1 as the unrotated row, rather than -// row0. All the message loads below are adjusted to compensate for this. See -// discussion at https://github.com/sneves/blake2-avx2/pull/4 -#[inline(always)] -unsafe fn diagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(2, 1, 0, 3)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(0, 3, 2, 1)); -} - -#[inline(always)] -unsafe fn undiagonalize(row0: &mut __m128i, row2: &mut __m128i, row3: &mut __m128i) { - *row0 = _mm_shuffle_epi32(*row0, _MM_SHUFFLE!(0, 3, 2, 1)); - *row3 = _mm_shuffle_epi32(*row3, _MM_SHUFFLE!(1, 0, 3, 2)); - *row2 = _mm_shuffle_epi32(*row2, _MM_SHUFFLE!(2, 1, 0, 3)); -} - -#[inline(always)] -unsafe fn compress_pre( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [__m128i; 4] { - let row0 = &mut loadu(cv.as_ptr().add(0) as *const u8); - let row1 = &mut loadu(cv.as_ptr().add(4) as *const u8); - let row2 = &mut set4(IV[0], IV[1], IV[2], IV[3]); - let row3 = &mut set4( - counter_low(counter), - counter_high(counter), - block_len as u32, - flags as u32, - ); - - let mut m0 = loadu(block.as_ptr().add(0 * 4 * DEGREE)); - let mut m1 = loadu(block.as_ptr().add(1 * 4 * DEGREE)); - let mut m2 = loadu(block.as_ptr().add(2 * 4 * DEGREE)); - let mut m3 = loadu(block.as_ptr().add(3 * 4 * DEGREE)); - - let mut t0; - let mut t1; - let mut t2; - let mut t3; - let mut tt; - - // Round 1. The first round permutes the message words from the original - // input order, into the groups that get mixed in parallel. 
- t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(2, 0, 2, 0)); // 6 4 2 0 - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 3, 1)); // 7 5 3 1 - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = shuffle2!(m2, m3, _MM_SHUFFLE!(2, 0, 2, 0)); // 14 12 10 8 - t2 = _mm_shuffle_epi32(t2, _MM_SHUFFLE!(2, 1, 0, 3)); // 12 10 8 14 - g1(row0, row1, row2, row3, t2); - t3 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 1, 3, 1)); // 15 13 11 9 - t3 = _mm_shuffle_epi32(t3, _MM_SHUFFLE!(2, 1, 0, 3)); // 13 11 9 15 - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 2. This round and all following rounds apply a fixed permutation - // to the message words from the round before. - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 3 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 4 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 5 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, 
_MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 6 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - m0 = t0; - m1 = t1; - m2 = t2; - m3 = t3; - - // Round 7 - t0 = shuffle2!(m0, m1, _MM_SHUFFLE!(3, 1, 1, 2)); - t0 = _mm_shuffle_epi32(t0, _MM_SHUFFLE!(0, 3, 2, 1)); - g1(row0, row1, row2, row3, t0); - t1 = shuffle2!(m2, m3, _MM_SHUFFLE!(3, 3, 2, 2)); - tt = _mm_shuffle_epi32(m0, _MM_SHUFFLE!(0, 0, 3, 3)); - t1 = _mm_blend_epi16(tt, t1, 0xCC); - g2(row0, row1, row2, row3, t1); - diagonalize(row0, row2, row3); - t2 = _mm_unpacklo_epi64(m3, m1); - tt = _mm_blend_epi16(t2, m2, 0xC0); - t2 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(1, 3, 2, 0)); - g1(row0, row1, row2, row3, t2); - t3 = _mm_unpackhi_epi32(m1, m3); - tt = _mm_unpacklo_epi32(m2, t3); - t3 = _mm_shuffle_epi32(tt, _MM_SHUFFLE!(0, 1, 3, 2)); - g2(row0, row1, row2, row3, t3); - undiagonalize(row0, row2, row3); - - [*row0, *row1, *row2, *row3] -} - -#[target_feature(enable = "sse4.1")] -pub unsafe fn compress_in_place( - cv: &mut CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) { - let [row0, row1, row2, row3] = compress_pre(cv, block, block_len, counter, flags); - storeu(xor(row0, row2), cv.as_mut_ptr().add(0) as *mut u8); - storeu(xor(row1, row3), cv.as_mut_ptr().add(4) as *mut u8); -} - -#[target_feature(enable = "sse4.1")] -pub unsafe fn compress_xof( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [u8; 64] { - let [mut row0, mut row1, mut row2, mut row3] = - compress_pre(cv, block, block_len, counter, flags); - row0 = xor(row0, row2); - row1 = xor(row1, row3); - row2 = xor(row2, loadu(cv.as_ptr().add(0) as *const u8)); - row3 = xor(row3, loadu(cv.as_ptr().add(4) as *const u8)); - core::mem::transmute([row0, row1, row2, row3]) -} - -#[inline(always)] -unsafe fn round(v: &mut [__m128i; 16], m: &[__m128i; 16], r: usize) { - v[0] = add(v[0], m[MSG_SCHEDULE[r][0]]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][2]]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][4]]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][6]]); - v[0] = add(v[0], v[4]); - v[1] = add(v[1], v[5]); - v[2] = add(v[2], v[6]); - v[3] = add(v[3], v[7]); - v[12] = xor(v[12], v[0]); - v[13] = xor(v[13], v[1]); - v[14] = xor(v[14], v[2]); - v[15] = xor(v[15], v[3]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[15] = rot16(v[15]); - v[8] = add(v[8], v[12]); - v[9] = add(v[9], v[13]); - v[10] = add(v[10], v[14]); - v[11] = add(v[11], v[15]); - v[4] = xor(v[4], v[8]); - v[5] = xor(v[5], v[9]); - v[6] = xor(v[6], v[10]); - v[7] = xor(v[7], v[11]); - v[4] = rot12(v[4]); - v[5] = 
rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[0] = add(v[0], m[MSG_SCHEDULE[r][1]]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][3]]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][5]]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][7]]); - v[0] = add(v[0], v[4]); - v[1] = add(v[1], v[5]); - v[2] = add(v[2], v[6]); - v[3] = add(v[3], v[7]); - v[12] = xor(v[12], v[0]); - v[13] = xor(v[13], v[1]); - v[14] = xor(v[14], v[2]); - v[15] = xor(v[15], v[3]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[15] = rot8(v[15]); - v[8] = add(v[8], v[12]); - v[9] = add(v[9], v[13]); - v[10] = add(v[10], v[14]); - v[11] = add(v[11], v[15]); - v[4] = xor(v[4], v[8]); - v[5] = xor(v[5], v[9]); - v[6] = xor(v[6], v[10]); - v[7] = xor(v[7], v[11]); - v[4] = rot7(v[4]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - - v[0] = add(v[0], m[MSG_SCHEDULE[r][8]]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][10]]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][12]]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][14]]); - v[0] = add(v[0], v[5]); - v[1] = add(v[1], v[6]); - v[2] = add(v[2], v[7]); - v[3] = add(v[3], v[4]); - v[15] = xor(v[15], v[0]); - v[12] = xor(v[12], v[1]); - v[13] = xor(v[13], v[2]); - v[14] = xor(v[14], v[3]); - v[15] = rot16(v[15]); - v[12] = rot16(v[12]); - v[13] = rot16(v[13]); - v[14] = rot16(v[14]); - v[10] = add(v[10], v[15]); - v[11] = add(v[11], v[12]); - v[8] = add(v[8], v[13]); - v[9] = add(v[9], v[14]); - v[5] = xor(v[5], v[10]); - v[6] = xor(v[6], v[11]); - v[7] = xor(v[7], v[8]); - v[4] = xor(v[4], v[9]); - v[5] = rot12(v[5]); - v[6] = rot12(v[6]); - v[7] = rot12(v[7]); - v[4] = rot12(v[4]); - v[0] = add(v[0], m[MSG_SCHEDULE[r][9]]); - v[1] = add(v[1], m[MSG_SCHEDULE[r][11]]); - v[2] = add(v[2], m[MSG_SCHEDULE[r][13]]); - v[3] = add(v[3], m[MSG_SCHEDULE[r][15]]); - v[0] = add(v[0], v[5]); - v[1] = add(v[1], v[6]); - v[2] = add(v[2], v[7]); - v[3] = add(v[3], v[4]); - v[15] = xor(v[15], v[0]); - v[12] = xor(v[12], v[1]); - v[13] = xor(v[13], v[2]); - v[14] = xor(v[14], v[3]); - v[15] = rot8(v[15]); - v[12] = rot8(v[12]); - v[13] = rot8(v[13]); - v[14] = rot8(v[14]); - v[10] = add(v[10], v[15]); - v[11] = add(v[11], v[12]); - v[8] = add(v[8], v[13]); - v[9] = add(v[9], v[14]); - v[5] = xor(v[5], v[10]); - v[6] = xor(v[6], v[11]); - v[7] = xor(v[7], v[8]); - v[4] = xor(v[4], v[9]); - v[5] = rot7(v[5]); - v[6] = rot7(v[6]); - v[7] = rot7(v[7]); - v[4] = rot7(v[4]); -} - -#[inline(always)] -unsafe fn transpose_vecs(vecs: &mut [__m128i; DEGREE]) { - // Interleave 32-bit lates. The low unpack is lanes 00/11 and the high is - // 22/33. Note that this doesn't split the vector into two lanes, as the - // AVX2 counterparts do. - let ab_01 = _mm_unpacklo_epi32(vecs[0], vecs[1]); - let ab_23 = _mm_unpackhi_epi32(vecs[0], vecs[1]); - let cd_01 = _mm_unpacklo_epi32(vecs[2], vecs[3]); - let cd_23 = _mm_unpackhi_epi32(vecs[2], vecs[3]); - - // Interleave 64-bit lanes. 
- let abcd_0 = _mm_unpacklo_epi64(ab_01, cd_01); - let abcd_1 = _mm_unpackhi_epi64(ab_01, cd_01); - let abcd_2 = _mm_unpacklo_epi64(ab_23, cd_23); - let abcd_3 = _mm_unpackhi_epi64(ab_23, cd_23); - - vecs[0] = abcd_0; - vecs[1] = abcd_1; - vecs[2] = abcd_2; - vecs[3] = abcd_3; -} - -#[inline(always)] -unsafe fn transpose_msg_vecs(inputs: &[*const u8; DEGREE], block_offset: usize) -> [__m128i; 16] { - let mut vecs = [ - loadu(inputs[0].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 0 * 4 * DEGREE)), - loadu(inputs[0].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 1 * 4 * DEGREE)), - loadu(inputs[0].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 2 * 4 * DEGREE)), - loadu(inputs[0].add(block_offset + 3 * 4 * DEGREE)), - loadu(inputs[1].add(block_offset + 3 * 4 * DEGREE)), - loadu(inputs[2].add(block_offset + 3 * 4 * DEGREE)), - loadu(inputs[3].add(block_offset + 3 * 4 * DEGREE)), - ]; - for prefetch in inputs.iter().take(DEGREE) { - _mm_prefetch(prefetch.add(block_offset + 256) as *const i8, _MM_HINT_T0); - } - for square in array_chunks_mut(&mut vecs) { - transpose_vecs(square); - } - vecs -} - -#[inline(always)] -unsafe fn load_counters(counter: u64, increment_counter: IncrementCounter) -> (__m128i, __m128i) { - let mask = if increment_counter.yes() { !0 } else { 0 }; - ( - set4( - counter_low(counter + (mask & 0)), - counter_low(counter + (mask & 1)), - counter_low(counter + (mask & 2)), - counter_low(counter + (mask & 3)), - ), - set4( - counter_high(counter + (mask & 0)), - counter_high(counter + (mask & 1)), - counter_high(counter + (mask & 2)), - counter_high(counter + (mask & 3)), - ), - ) -} - -#[target_feature(enable = "sse4.1")] -pub unsafe fn hash4( - inputs: &[*const u8; DEGREE], - blocks: usize, - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8; DEGREE * OUT_LEN], -) { - let mut h_vecs = [ - set1(key[0]), - set1(key[1]), - set1(key[2]), - set1(key[3]), - set1(key[4]), - set1(key[5]), - set1(key[6]), - set1(key[7]), - ]; - let (counter_low_vec, counter_high_vec) = load_counters(counter, increment_counter); - let mut block_flags = flags | flags_start; - - for block in 0..blocks { - if block + 1 == blocks { - block_flags |= flags_end; - } - let block_len_vec = set1(BLOCK_LEN as u32); // full blocks only - let block_flags_vec = set1(block_flags as u32); - let msg_vecs = transpose_msg_vecs(inputs, block * BLOCK_LEN); - - // The transposed compression function. Note that inlining this - // manually here improves compile times by a lot, compared to factoring - // it out into its own function and making it #[inline(always)]. Just - // guessing, it might have something to do with loop unrolling. 
- let mut v = [ - h_vecs[0], - h_vecs[1], - h_vecs[2], - h_vecs[3], - h_vecs[4], - h_vecs[5], - h_vecs[6], - h_vecs[7], - set1(IV[0]), - set1(IV[1]), - set1(IV[2]), - set1(IV[3]), - counter_low_vec, - counter_high_vec, - block_len_vec, - block_flags_vec, - ]; - round(&mut v, &msg_vecs, 0); - round(&mut v, &msg_vecs, 1); - round(&mut v, &msg_vecs, 2); - round(&mut v, &msg_vecs, 3); - round(&mut v, &msg_vecs, 4); - round(&mut v, &msg_vecs, 5); - round(&mut v, &msg_vecs, 6); - h_vecs[0] = xor(v[0], v[8]); - h_vecs[1] = xor(v[1], v[9]); - h_vecs[2] = xor(v[2], v[10]); - h_vecs[3] = xor(v[3], v[11]); - h_vecs[4] = xor(v[4], v[12]); - h_vecs[5] = xor(v[5], v[13]); - h_vecs[6] = xor(v[6], v[14]); - h_vecs[7] = xor(v[7], v[15]); - - block_flags = flags; - } - - for square in array_chunks_mut(&mut h_vecs) { - transpose_vecs(square); - } - - // The first four vecs now contain the first half of each output, and the - // second four vecs contain the second half of each output. - storeu(h_vecs[0], out.as_mut_ptr().add(0 * 4 * DEGREE)); - storeu(h_vecs[4], out.as_mut_ptr().add(1 * 4 * DEGREE)); - storeu(h_vecs[1], out.as_mut_ptr().add(2 * 4 * DEGREE)); - storeu(h_vecs[5], out.as_mut_ptr().add(3 * 4 * DEGREE)); - storeu(h_vecs[2], out.as_mut_ptr().add(4 * 4 * DEGREE)); - storeu(h_vecs[6], out.as_mut_ptr().add(5 * 4 * DEGREE)); - storeu(h_vecs[3], out.as_mut_ptr().add(6 * 4 * DEGREE)); - storeu(h_vecs[7], out.as_mut_ptr().add(7 * 4 * DEGREE)); -} - -#[target_feature(enable = "sse4.1")] -unsafe fn hash1( - input: &[u8; N], - key: &CVWords, - counter: u64, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut CVBytes, -) { - debug_assert_eq!(N % BLOCK_LEN, 0, "uneven blocks"); - let mut cv = *key; - let mut block_flags = flags | flags_start; - let mut slice = &input[..]; - while slice.len() >= BLOCK_LEN { - if slice.len() == BLOCK_LEN { - block_flags |= flags_end; - } - compress_in_place( - &mut cv, - slice[0..BLOCK_LEN].try_into().unwrap(), - BLOCK_LEN as u8, - counter, - block_flags, - ); - block_flags = flags; - slice = &slice[BLOCK_LEN..]; - } - *out = core::mem::transmute(cv); // x86 is little-endian -} - -#[target_feature(enable = "sse4.1")] -pub unsafe fn hash_many( - mut inputs: &[&[u8; N]], - key: &CVWords, - mut counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - mut output: &mut [u8], -) { - debug_assert!(output.len() >= inputs.len() * OUT_LEN, "out too short"); - while inputs.len() >= DEGREE && output.len() >= DEGREE * OUT_LEN { - // Safe because the layout of arrays is guaranteed, and because the - // `blocks` count is determined statically from the argument type. 
- let input_ptrs: &[*const u8; DEGREE] = &*(inputs.as_ptr() as *const [*const u8; DEGREE]); - let blocks = N / BLOCK_LEN; - - let (out, rem) = split_array_mut(output); - hash4( - input_ptrs, - blocks, - key, - counter, - increment_counter, - flags, - flags_start, - flags_end, - out, - ); - if increment_counter.yes() { - counter += DEGREE as u64; - } - inputs = &inputs[DEGREE..]; - output = rem; - } - for (&input, output) in inputs.iter().zip(array_chunks_mut(output)) { - hash1(input, key, counter, flags, flags_start, flags_end, output); - if increment_counter.yes() { - counter += 1; - } - } -} - -#[cfg(test)] -mod test { - use super::super::platform; - use super::super::test::*; - use super::*; - - #[test] - fn test_transpose() { - if !platform::sse41_detected() { - return; - } - - #[target_feature(enable = "sse4.1")] - unsafe fn transpose_wrapper(vecs: &mut [__m128i; DEGREE]) { - transpose_vecs(vecs); - } - - let mut matrix = [[0 as u32; DEGREE]; DEGREE]; - for i in 0..DEGREE { - for j in 0..DEGREE { - matrix[i][j] = (i * DEGREE + j) as u32; - } - } - - unsafe { - let mut vecs: [__m128i; DEGREE] = core::mem::transmute(matrix); - transpose_wrapper(&mut vecs); - matrix = core::mem::transmute(vecs); - } - - for i in 0..DEGREE { - for j in 0..DEGREE { - // Reversed indexes from above. - assert_eq!(matrix[j][i], (i * DEGREE + j) as u32); - } - } - } - - #[test] - fn test_compress() { - if !platform::sse41_detected() { - return; - } - test_compress_fn(compress_in_place, compress_xof); - } - - #[test] - fn test_hash_many() { - if !platform::sse41_detected() { - return; - } - test_hash_many_fn(hash_many, hash_many); - } -} diff --git a/narcissus-core/src/blake3/test.rs b/narcissus-core/src/blake3/test.rs deleted file mode 100644 index 04597ef..0000000 --- a/narcissus-core/src/blake3/test.rs +++ /dev/null @@ -1,507 +0,0 @@ -use crate::blake3::reference_impl; -use crate::rand::Pcg64; -use crate::slice::array_chunks; -use crate::FixedVec; - -use super::portable; -use super::{CVBytes, CVWords, IncrementCounter, BLOCK_LEN, CHUNK_LEN, OUT_LEN}; -use core::usize; - -// Interesting input lengths to run tests on. -pub const TEST_CASES: &[usize] = &[ - 0, - 1, - 2, - 3, - 4, - 5, - 6, - 7, - 8, - BLOCK_LEN - 1, - BLOCK_LEN, - BLOCK_LEN + 1, - 2 * BLOCK_LEN - 1, - 2 * BLOCK_LEN, - 2 * BLOCK_LEN + 1, - CHUNK_LEN - 1, - CHUNK_LEN, - CHUNK_LEN + 1, - 2 * CHUNK_LEN, - 2 * CHUNK_LEN + 1, - 3 * CHUNK_LEN, - 3 * CHUNK_LEN + 1, - 4 * CHUNK_LEN, - 4 * CHUNK_LEN + 1, - 5 * CHUNK_LEN, - 5 * CHUNK_LEN + 1, - 6 * CHUNK_LEN, - 6 * CHUNK_LEN + 1, - 7 * CHUNK_LEN, - 7 * CHUNK_LEN + 1, - 8 * CHUNK_LEN, - 8 * CHUNK_LEN + 1, - 16 * CHUNK_LEN, // AVX512's bandwidth - 31 * CHUNK_LEN, // 16 + 8 + 4 + 2 + 1 - 100 * CHUNK_LEN, // subtrees larger than MAX_SIMD_DEGREE chunks -]; - -pub const TEST_CASES_MAX: usize = 100 * CHUNK_LEN; - -// There's a test to make sure these two are equal below. -pub const TEST_KEY: CVBytes = *b"whats the Elvish word for friend"; -pub const TEST_KEY_WORDS: CVWords = [ - 1952540791, 1752440947, 1816469605, 1752394102, 1919907616, 1868963940, 1919295602, 1684956521, -]; - -// Paint the input with a repeating byte pattern. We use a cycle length of 251, -// because that's the largets prime number less than 256. This makes it -// unlikely to swapping any two adjacent input blocks or chunks will give the -// same answer. 
-pub fn paint_test_input(buf: &mut [u8]) { - for (i, b) in buf.iter_mut().enumerate() { - *b = (i % 251) as u8; - } -} - -type CompressInPlaceFn = - unsafe fn(cv: &mut CVWords, block: &[u8; BLOCK_LEN], block_len: u8, counter: u64, flags: u8); - -type CompressXofFn = unsafe fn( - cv: &CVWords, - block: &[u8; BLOCK_LEN], - block_len: u8, - counter: u64, - flags: u8, -) -> [u8; 64]; - -// A shared helper function for platform-specific tests. -pub fn test_compress_fn(compress_in_place_fn: CompressInPlaceFn, compress_xof_fn: CompressXofFn) { - let initial_state = TEST_KEY_WORDS; - let block_len: u8 = 61; - let mut block = [0; BLOCK_LEN]; - paint_test_input(&mut block[..block_len as usize]); - // Use a counter with set bits in both 32-bit words. - let counter = (5u64 << 32) + 6; - let flags = super::CHUNK_END | super::ROOT | super::KEYED_HASH; - - let portable_out = - portable::compress_xof(&initial_state, &block, block_len, counter as u64, flags); - - let mut test_state = initial_state; - unsafe { compress_in_place_fn(&mut test_state, &block, block_len, counter as u64, flags) }; - let test_state_bytes = super::platform::le_bytes_from_words_32(&test_state); - let test_xof = - unsafe { compress_xof_fn(&initial_state, &block, block_len, counter as u64, flags) }; - - assert_eq!(&portable_out[..32], &test_state_bytes[..]); - assert_eq!(&portable_out[..], &test_xof[..]); -} - -type HashManyFn = unsafe fn( - inputs: &[&A], - key: &CVWords, - counter: u64, - increment_counter: IncrementCounter, - flags: u8, - flags_start: u8, - flags_end: u8, - out: &mut [u8], -); - -// A shared helper function for platform-specific tests. -pub fn test_hash_many_fn( - hash_many_chunks_fn: HashManyFn<[u8; CHUNK_LEN]>, - hash_many_parents_fn: HashManyFn<[u8; 2 * OUT_LEN]>, -) { - // Test a few different initial counter values. - // - 0: The base case. - // - u32::MAX: The low word of the counter overflows for all inputs except the first. - // - i32::MAX: *No* overflow. But carry bugs in tricky SIMD code can screw this up, if you XOR - // when you're supposed to ANDNOT... - let initial_counters = [0, u32::MAX as u64, i32::MAX as u64]; - for counter in initial_counters { - // 31 (16 + 8 + 4 + 2 + 1) inputs - const NUM_INPUTS: usize = 31; - let mut input_buf = [0; CHUNK_LEN * NUM_INPUTS]; - super::test::paint_test_input(&mut input_buf); - - // First hash chunks. - let mut chunks = FixedVec::<&[u8; CHUNK_LEN], NUM_INPUTS>::new(); - for chunk in array_chunks(&input_buf).take(NUM_INPUTS) { - chunks.push(chunk); - } - let mut portable_chunks_out = [0; NUM_INPUTS * OUT_LEN]; - portable::hash_many( - &chunks, - &TEST_KEY_WORDS, - counter, - IncrementCounter::Yes, - super::KEYED_HASH, - super::CHUNK_START, - super::CHUNK_END, - &mut portable_chunks_out, - ); - - let mut test_chunks_out = [0; NUM_INPUTS * OUT_LEN]; - unsafe { - hash_many_chunks_fn( - &chunks[..], - &TEST_KEY_WORDS, - counter, - IncrementCounter::Yes, - super::KEYED_HASH, - super::CHUNK_START, - super::CHUNK_END, - &mut test_chunks_out, - ); - } - for n in 0..NUM_INPUTS { - assert_eq!( - &portable_chunks_out[n * OUT_LEN..][..OUT_LEN], - &test_chunks_out[n * OUT_LEN..][..OUT_LEN] - ); - } - - // Then hash parents. 
- let mut parents = FixedVec::<&[u8; 2 * OUT_LEN], NUM_INPUTS>::new(); - for parent in array_chunks(&input_buf).take(NUM_INPUTS) { - parents.push(parent); - } - let mut portable_parents_out = [0; NUM_INPUTS * OUT_LEN]; - portable::hash_many( - &parents, - &TEST_KEY_WORDS, - counter, - IncrementCounter::No, - super::KEYED_HASH | super::PARENT, - 0, - 0, - &mut portable_parents_out, - ); - - let mut test_parents_out = [0; NUM_INPUTS * OUT_LEN]; - unsafe { - hash_many_parents_fn( - &parents[..], - &TEST_KEY_WORDS, - counter, - IncrementCounter::No, - super::KEYED_HASH | super::PARENT, - 0, - 0, - &mut test_parents_out, - ); - } - for n in 0..NUM_INPUTS { - assert_eq!( - &portable_parents_out[n * OUT_LEN..][..OUT_LEN], - &test_parents_out[n * OUT_LEN..][..OUT_LEN] - ); - } - } -} - -#[test] -fn test_key_bytes_equal_key_words() { - assert_eq!( - TEST_KEY_WORDS, - super::platform::words_from_le_bytes_32(&TEST_KEY), - ); -} - -#[test] -fn test_reference_impl_size() { - // Because the Rust compiler optimizes struct layout, it's possible that - // some future version of the compiler will produce a different size. If - // that happens, we can either disable this test, or test for multiple - // expected values. For now, the purpose of this test is to make sure we - // notice if that happens. - assert_eq!(1880, core::mem::size_of::()); -} - -#[test] -fn test_counter_words() { - let counter: u64 = (1 << 32) + 2; - assert_eq!(super::counter_low(counter), 2); - assert_eq!(super::counter_high(counter), 1); -} - -#[test] -fn test_largest_power_of_two_leq() { - let input_output = &[ - // The zero case is nonsensical, but it does work. - (0, 1), - (1, 1), - (2, 2), - (3, 2), - (4, 4), - (5, 4), - (6, 4), - (7, 4), - (8, 8), - // the largest possible usize - (usize::MAX, (usize::MAX >> 1) + 1), - ]; - for &(input, output) in input_output { - assert_eq!( - output, - super::largest_power_of_two_leq(input), - "wrong output for n={}", - input - ); - } -} - -#[test] -fn test_left_len() { - let input_output = &[ - (CHUNK_LEN + 1, CHUNK_LEN), - (2 * CHUNK_LEN - 1, CHUNK_LEN), - (2 * CHUNK_LEN, CHUNK_LEN), - (2 * CHUNK_LEN + 1, 2 * CHUNK_LEN), - (4 * CHUNK_LEN - 1, 2 * CHUNK_LEN), - (4 * CHUNK_LEN, 2 * CHUNK_LEN), - (4 * CHUNK_LEN + 1, 4 * CHUNK_LEN), - ]; - for &(input, output) in input_output { - assert_eq!(super::left_len(input), output); - } -} - -#[test] -fn test_compare_reference_impl() { - const OUT: usize = 303; // more than 64, not a multiple of 4 - let mut input_buf = [0; TEST_CASES_MAX]; - paint_test_input(&mut input_buf); - for &case in TEST_CASES { - let input = &input_buf[..case]; - - // regular - { - let mut reference_hasher = reference_impl::Hasher::new(); - reference_hasher.update(input); - let mut expected_out = [0; OUT]; - reference_hasher.finalize(&mut expected_out); - - // all at once - let test_out = super::hash(input); - let out: &[u8; 32] = &expected_out[0..32].try_into().unwrap(); - assert_eq!(test_out, *out); - // incremental - let mut hasher = super::Hasher::new(); - hasher.update(input); - assert_eq!(hasher.finalize(), *&expected_out[0..32]); - assert_eq!(hasher.finalize(), test_out); - // xof - let mut extended = [0; OUT]; - hasher.finalize_xof().fill(&mut extended); - assert_eq!(extended, expected_out); - } - - // keyed - { - let mut reference_hasher = reference_impl::Hasher::new_keyed(&TEST_KEY); - reference_hasher.update(input); - let mut expected_out = [0; OUT]; - reference_hasher.finalize(&mut expected_out); - - // all at once - let test_out = super::keyed_hash(&TEST_KEY, input); - 
assert_eq!(test_out, expected_out[0..32]); - // incremental - let mut hasher = super::Hasher::new_keyed(&TEST_KEY); - hasher.update(input); - assert_eq!(hasher.finalize(), expected_out[0..32]); - assert_eq!(hasher.finalize(), test_out); - // xof - let mut extended = [0; OUT]; - hasher.finalize_xof().fill(&mut extended); - assert_eq!(extended, expected_out); - } - - // derive_key - { - let context = "BLAKE3 2019-12-27 16:13:59 example context (not the test vector one)"; - let mut reference_hasher = reference_impl::Hasher::new_derive_key(context); - reference_hasher.update(input); - let mut expected_out = [0; OUT]; - reference_hasher.finalize(&mut expected_out); - - // all at once - let test_out = super::derive_key(context, input); - assert_eq!(test_out, expected_out[..32]); - // incremental - let mut hasher = super::Hasher::new_derive_key(context); - hasher.update(input); - assert_eq!(hasher.finalize(), expected_out[0..32]); - assert_eq!(hasher.finalize(), test_out[0..32]); - // xof - let mut extended = [0; OUT]; - hasher.finalize_xof().fill(&mut extended); - assert_eq!(extended, expected_out); - } - } -} - -fn reference_hash(input: &[u8]) -> super::Hash { - let mut hasher = reference_impl::Hasher::new(); - hasher.update(input); - let mut bytes = [0; 32]; - hasher.finalize(&mut bytes); - bytes.into() -} - -#[test] -fn test_compare_update_multiple() { - // Don't use all the long test cases here, since that's unnecessarily slow - // in debug mode. - let mut short_test_cases = TEST_CASES; - while *short_test_cases.last().unwrap() > 4 * CHUNK_LEN { - short_test_cases = &short_test_cases[..short_test_cases.len() - 1]; - } - assert_eq!(*short_test_cases.last().unwrap(), 4 * CHUNK_LEN); - - let mut input_buf = [0; 2 * TEST_CASES_MAX]; - paint_test_input(&mut input_buf); - - for &first_update in short_test_cases { - #[cfg(feature = "std")] - dbg!(first_update); - let first_input = &input_buf[..first_update]; - let mut test_hasher = super::Hasher::new(); - test_hasher.update(first_input); - - for &second_update in short_test_cases { - #[cfg(feature = "std")] - dbg!(second_update); - let second_input = &input_buf[first_update..][..second_update]; - let total_input = &input_buf[..first_update + second_update]; - - // Clone the hasher with first_update bytes already written, so - // that the next iteration can reuse it. - let mut test_hasher = test_hasher.clone(); - test_hasher.update(second_input); - let expected = reference_hash(total_input); - assert_eq!(expected, test_hasher.finalize()); - } - } -} - -#[test] -fn test_fuzz_hasher() { - const INPUT_MAX: usize = 4 * CHUNK_LEN; - let mut input_buf = [0; 3 * INPUT_MAX]; - paint_test_input(&mut input_buf); - - // Don't do too many iterations in debug mode, to keep the tests under a - // second or so. CI should run tests in release mode also. Provide an - // environment variable for specifying a larger number of fuzz iterations. - let num_tests = if cfg!(debug_assertions) { 100 } else { 10_000 }; - - // Use a fixed RNG seed for reproducibility. - let mut rng = Pcg64::new(); - for _num_test in 0..num_tests { - #[cfg(feature = "std")] - dbg!(_num_test); - let mut hasher = super::Hasher::new(); - let mut total_input = 0; - // For each test, write 3 inputs of random length. 
- for _ in 0..3 { - let input_len = rng.next_bound_u64((INPUT_MAX + 1) as u64) as usize; - let input = &input_buf[total_input..][..input_len]; - hasher.update(input); - total_input += input_len; - } - let expected = reference_hash(&input_buf[..total_input]); - assert_eq!(expected, hasher.finalize()); - } -} - -#[test] -fn test_xof_seek() { - let mut out = [0; 533]; - let mut hasher = super::Hasher::new(); - hasher.update(b"foo"); - hasher.finalize_xof().fill(&mut out); - assert_eq!(hasher.finalize().as_bytes(), &out[0..32]); - - let mut reader = hasher.finalize_xof(); - reader.set_position(303); - let mut out2 = [0; 102]; - reader.fill(&mut out2); - assert_eq!(&out[303..][..102], &out2[..]); -} - -#[test] -fn test_msg_schdule_permutation() { - let permutation = [2, 6, 3, 10, 7, 0, 4, 13, 1, 11, 12, 5, 9, 14, 15, 8]; - - let mut generated = [[0; 16]; 7]; - generated[0] = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]; - - for round in 1..7 { - for i in 0..16 { - generated[round][i] = generated[round - 1][permutation[i]]; - } - } - - assert_eq!(generated, super::MSG_SCHEDULE); -} - -#[test] -fn test_reset() { - let mut hasher = super::Hasher::new(); - hasher.update(&[42; 3 * CHUNK_LEN + 7]); - hasher.reset(); - hasher.update(&[42; CHUNK_LEN + 3]); - assert_eq!(hasher.finalize(), super::hash(&[42; CHUNK_LEN + 3])); - - let key = &[99; super::KEY_LEN]; - let mut keyed_hasher = super::Hasher::new_keyed(key); - keyed_hasher.update(&[42; 3 * CHUNK_LEN + 7]); - keyed_hasher.reset(); - keyed_hasher.update(&[42; CHUNK_LEN + 3]); - assert_eq!( - keyed_hasher.finalize(), - super::keyed_hash(key, &[42; CHUNK_LEN + 3]), - ); - - let context = "BLAKE3 2020-02-12 10:20:58 reset test"; - let mut kdf = super::Hasher::new_derive_key(context); - kdf.update(&[42; 3 * CHUNK_LEN + 7]); - kdf.reset(); - kdf.update(&[42; CHUNK_LEN + 3]); - let expected = super::derive_key(context, &[42; CHUNK_LEN + 3]); - assert_eq!(kdf.finalize(), expected); -} - -// This test is a mimized failure case for the Windows SSE2 bug described in -// https://github.com/BLAKE3-team/BLAKE3/issues/206. -// -// Before that issue was fixed, this test would fail on Windows in the following configuration: -// -// cargo test --features=no_avx512,no_avx2,no_sse41 --release -// -// Bugs like this one (stomping on a caller's register) are very sensitive to the details of -// surrounding code, so it's not especially likely that this test will catch another bug (or even -// the same bug) in the future. Still, there's no harm in keeping it. -#[test] -fn test_issue_206_windows_sse2() { - // This stupid loop has to be here to trigger the bug. I don't know why. - for _ in &[0] { - // The length 65 (two blocks) is significant. It doesn't repro with 64 (one block). It also - // doesn't repro with an all-zero input. - let input = &[0xff; 65]; - let expected_hash = [ - 183, 235, 50, 217, 156, 24, 190, 219, 2, 216, 176, 255, 224, 53, 28, 95, 57, 148, 179, - 245, 162, 90, 37, 121, 0, 142, 219, 62, 234, 204, 225, 161, - ]; - - // This throwaway call has to be here to trigger the bug. - super::Hasher::new().update(input); - - // This assert fails when the bug is triggered. - assert_eq!(super::Hasher::new().update(input).finalize(), expected_hash); - } -} diff --git a/narcissus-core/src/lib.rs b/narcissus-core/src/lib.rs index a117e75..2b4380c 100644 --- a/narcissus-core/src/lib.rs +++ b/narcissus-core/src/lib.rs @@ -1,6 +1,5 @@ mod arena; mod bitset; -pub mod blake3; mod fixed_vec; mod libc; pub mod manual_arc; -- 2.49.0