// k12 implements the KangarooTwelve XOF. // // KangarooTwelve is being standardised at the CFRG working group // of the IRTF. This package implements draft 10. // // https://datatracker.ietf.org/doc/draft-irtf-cfrg-kangarootwelve/10/ package k12 import ( "encoding/binary" "github.com/cloudflare/circl/internal/sha3" "github.com/cloudflare/circl/simd/keccakf1600" ) const chunkSize = 8192 // aka B // KangarooTwelve splits the message into chunks of 8192 bytes each. // The first chunk is absorbed directly in a TurboSHAKE128 instance, which // we call the stalk. The subsequent chunks aren't absorbed directly, but // instead their hash is absorbed: they're like leafs on a stalk. // If we have a fast TurboSHAKE128 available, we buffer chunks until we have // enough to do the parallel TurboSHAKE128. If not, we absorb directly into // a separate TurboSHAKE128 state. type State struct { initialTodo int // Bytes left to absorb for the first chunk. stalk sha3.State context []byte // context string "C" provided by the user // buffer of incoming data so we can do parallel TurboSHAKE128: // nil when we haven't absorbed the first chunk yet; // empty if we have, but we do not have a fast parallel TurboSHAKE128; // and chunkSize*lanes in length if we have. buf []byte offset int // offset in buf or bytes written to leaf // Number of chunk hashes ("CV_i") absorbed into the stalk. chunk uint // TurboSHAKE128 instance to compute the leaf in case we don't have // a fast parallel TurboSHAKE128, viz when lanes == 1. leaf *sha3.State lanes uint8 // number of TurboSHAKE128s to compute in parallel } // NewDraft10 creates a new instance of Kangaroo12 draft version -10. func NewDraft10(c []byte) State { var lanes byte = 1 if keccakf1600.IsEnabledX4() { lanes = 4 } else if keccakf1600.IsEnabledX2() { lanes = 2 } return newDraft10(c, lanes) } func newDraft10(c []byte, lanes byte) State { return State{ initialTodo: chunkSize, stalk: sha3.NewTurboShake128(0x07), context: c, lanes: lanes, } } func (s *State) Reset() { s.initialTodo = chunkSize s.stalk.Reset() s.stalk.SwitchDS(0x07) s.buf = nil s.offset = 0 s.chunk = 0 } func (s *State) Clone() State { stalk := s.stalk.Clone().(*sha3.State) ret := State{ initialTodo: s.initialTodo, stalk: *stalk, context: s.context, offset: s.offset, chunk: s.chunk, lanes: s.lanes, } if s.leaf != nil { ret.leaf = s.leaf.Clone().(*sha3.State) } if s.buf != nil { ret.buf = make([]byte, len(s.buf)) copy(ret.buf, s.buf) } return ret } func Draft10Sum(hash []byte, msg []byte, c []byte) { // TODO Tweak number of lanes depending on the length of the message s := NewDraft10(c) _, _ = s.Write(msg) _, _ = s.Read(hash) } func (s *State) Write(p []byte) (int, error) { written := len(p) // The first chunk is written directly to the stalk. if s.initialTodo > 0 { taken := s.initialTodo if len(p) < taken { taken = len(p) } headP := p[:taken] _, _ = s.stalk.Write(headP) s.initialTodo -= taken p = p[taken:] } if len(p) == 0 { return written, nil } // If this is the first bit of data written after the initial chunk, // we're out of the fast-path and allocate some buffers. if s.buf == nil { if s.lanes != 1 { s.buf = make([]byte, int(s.lanes)*chunkSize) } else { // We create the buffer to signal we're past the first chunk, // but do not use it. s.buf = make([]byte, 0) h := sha3.NewTurboShake128(0x0B) s.leaf = &h } _, _ = s.stalk.Write([]byte{0x03, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}) s.stalk.SwitchDS(0x06) } // If we're just using one lane, we don't need to cache in a buffer // for parallel hashing. Instead, we feed directly to TurboSHAKE. if s.lanes == 1 { for len(p) > 0 { // Write to current leaf. to := chunkSize - s.offset if len(p) < to { to = len(p) } _, _ = s.leaf.Write(p[:to]) p = p[to:] s.offset += to // Did we fill the chunk? if s.offset == chunkSize { var cv [32]byte _, _ = s.leaf.Read(cv[:]) _, _ = s.stalk.Write(cv[:]) s.leaf.Reset() s.offset = 0 s.chunk++ } } return written, nil } // If we can't fill all our lanes or the buffer isn't empty, we write the // data to the buffer. if s.offset != 0 || len(p) < len(s.buf) { to := len(s.buf) - s.offset if len(p) < to { to = len(p) } p2 := p[:to] p = p[to:] copy(s.buf[s.offset:], p2) s.offset += to } // Absorb the buffer if we filled it if s.offset == len(s.buf) { s.writeX(s.buf) s.offset = 0 } // Note that at this point we may assume that s.offset = 0 if len(p) != 0 if len(p) != 0 && s.offset != 0 { panic("shouldn't happen") } // Absorb a bunch of chunks at the same time. if len(p) >= int(s.lanes)*chunkSize { p = s.writeX(p) } // Put the remainder in the buffer. if len(p) > 0 { copy(s.buf, p) s.offset = len(p) } return written, nil } // Absorb a multiple of a multiple of lanes * chunkSize. // Returns the remainder. func (s *State) writeX(p []byte) []byte { switch s.lanes { case 4: return s.writeX4(p) default: return s.writeX2(p) } } func (s *State) writeX4(p []byte) []byte { for len(p) >= 4*chunkSize { var x4 keccakf1600.StateX4 a := x4.Initialize(true) for offset := 0; offset < 48*168; offset += 168 { for i := 0; i < 21; i++ { a[i*4] ^= binary.LittleEndian.Uint64( p[8*i+offset:], ) a[i*4+1] ^= binary.LittleEndian.Uint64( p[chunkSize+8*i+offset:], ) a[i*4+2] ^= binary.LittleEndian.Uint64( p[chunkSize*2+8*i+offset:], ) a[i*4+3] ^= binary.LittleEndian.Uint64( p[chunkSize*3+8*i+offset:], ) } x4.Permute() } for i := 0; i < 16; i++ { a[i*4] ^= binary.LittleEndian.Uint64( p[8*i+48*168:], ) a[i*4+1] ^= binary.LittleEndian.Uint64( p[chunkSize+8*i+48*168:], ) a[i*4+2] ^= binary.LittleEndian.Uint64( p[chunkSize*2+8*i+48*168:], ) a[i*4+3] ^= binary.LittleEndian.Uint64( p[chunkSize*3+8*i+48*168:], ) } a[16*4] ^= 0x0b a[16*4+1] ^= 0x0b a[16*4+2] ^= 0x0b a[16*4+3] ^= 0x0b a[20*4] ^= 0x80 << 56 a[20*4+1] ^= 0x80 << 56 a[20*4+2] ^= 0x80 << 56 a[20*4+3] ^= 0x80 << 56 x4.Permute() var buf [32 * 4]byte for i := 0; i < 4; i++ { binary.LittleEndian.PutUint64(buf[8*i:], a[4*i]) binary.LittleEndian.PutUint64(buf[32+8*i:], a[4*i+1]) binary.LittleEndian.PutUint64(buf[32*2+8*i:], a[4*i+2]) binary.LittleEndian.PutUint64(buf[32*3+8*i:], a[4*i+3]) } _, _ = s.stalk.Write(buf[:]) p = p[chunkSize*4:] s.chunk += 4 } return p } func (s *State) writeX2(p []byte) []byte { // TODO On M2 Pro, 1/3 of the time is spent on this function // and LittleEndian.Uint64 excluding the actual permutation. // Rewriting in assembler might be worthwhile. for len(p) >= 2*chunkSize { var x2 keccakf1600.StateX2 a := x2.Initialize(true) for offset := 0; offset < 48*168; offset += 168 { for i := 0; i < 21; i++ { a[i*2] ^= binary.LittleEndian.Uint64( p[8*i+offset:], ) a[i*2+1] ^= binary.LittleEndian.Uint64( p[chunkSize+8*i+offset:], ) } x2.Permute() } for i := 0; i < 16; i++ { a[i*2] ^= binary.LittleEndian.Uint64( p[8*i+48*168:], ) a[i*2+1] ^= binary.LittleEndian.Uint64( p[chunkSize+8*i+48*168:], ) } a[16*2] ^= 0x0b a[16*2+1] ^= 0x0b a[20*2] ^= 0x80 << 56 a[20*2+1] ^= 0x80 << 56 x2.Permute() var buf [32 * 2]byte for i := 0; i < 4; i++ { binary.LittleEndian.PutUint64(buf[8*i:], a[2*i]) binary.LittleEndian.PutUint64(buf[32+8*i:], a[2*i+1]) } _, _ = s.stalk.Write(buf[:]) p = p[chunkSize*2:] s.chunk += 2 } return p } func (s *State) Read(p []byte) (int, error) { if s.stalk.IsAbsorbing() { // Write context string C _, _ = s.Write(s.context) // Write length_encode( |C| ) var buf [9]byte binary.BigEndian.PutUint64(buf[:8], uint64(len(s.context))) // Find first non-zero digit in big endian encoding of context length i := 0 for buf[i] == 0 && i < 8 { i++ } buf[8] = byte(8 - i) // number of bytes to represent |C| _, _ = s.Write(buf[i:]) // We need to write the chunk number if we're past the first chunk. if s.buf != nil { // Write last remaining chunk(s) var cv [32]byte if s.lanes == 1 { if s.offset != 0 { _, _ = s.leaf.Read(cv[:]) _, _ = s.stalk.Write(cv[:]) s.chunk++ } } else { remainingBuf := s.buf[:s.offset] for len(remainingBuf) > 0 { h := sha3.NewTurboShake128(0x0B) to := chunkSize if len(remainingBuf) < to { to = len(remainingBuf) } _, _ = h.Write(remainingBuf[:to]) _, _ = h.Read(cv[:]) _, _ = s.stalk.Write(cv[:]) s.chunk++ remainingBuf = remainingBuf[to:] } } // Write length_encode( chunk ) binary.BigEndian.PutUint64(buf[:8], uint64(s.chunk)) // Find first non-zero digit in big endian encoding of number of chunks i = 0 for buf[i] == 0 && i < 8 { i++ } buf[8] = byte(8 - i) // number of bytes to represent number of chunks. _, _ = s.stalk.Write(buf[i:]) _, _ = s.stalk.Write([]byte{0xff, 0xff}) } } return s.stalk.Read(p) }