From ca14eaf77c86bd5492329d2be6f1a82afe7802f5 Mon Sep 17 00:00:00 2001
From: Dmitri Shuralyov
Date: Thu, 21 Nov 2024 19:54:12 -0500
Subject: [PATCH] all: update golang.org/x/net [generated]

Part of keeping Go's vendored dependencies and generated code up to
date. This updates h2_bundle.go with unencrypted HTTP/2 support.

For #36905.
For #67816.

[git-generate]
cd src
go get golang.org/x/net@v0.31.0
go mod tidy
go mod vendor
cd cmd
go get golang.org/x/net@v0.31.0
go mod tidy
go mod vendor
go generate -run=bundle std

Change-Id: I2b77f651b990f260fbe7d551c7a819518f1c983f
Reviewed-on: https://go-review.googlesource.com/c/go/+/631035
Reviewed-by: Dmitri Shuralyov
LUCI-TryBot-Result: Go LUCI
Reviewed-by: Damien Neil
Auto-Submit: Dmitri Shuralyov
---
 src/cmd/go.mod                                |     4 +-
 src/cmd/go.sum                                |     8 +-
 src/cmd/vendor/golang.org/x/term/README.md    |    11 +-
 .../vendor/golang.org/x/term/term_windows.go  |     1 +
 src/cmd/vendor/modules.txt                    |     4 +-
 src/go.mod                                    |     6 +-
 src/go.sum                                    |    12 +-
 src/net/http/h2_bundle.go                     |   851 +-
 .../x/crypto/chacha20/chacha_noasm.go         |     2 +-
 .../{chacha_ppc64le.go => chacha_ppc64x.go}   |     2 +-
 .../{chacha_ppc64le.s => chacha_ppc64x.s}     |   114 +-
 .../chacha20poly1305/chacha20poly1305_amd64.s | 11503 +++++++++++++---
 .../x/crypto/internal/poly1305/mac_noasm.go   |     2 +-
 .../x/crypto/internal/poly1305/sum_amd64.s    |   133 +-
 .../{sum_ppc64le.go => sum_ppc64x.go}         |     2 +-
 .../poly1305/{sum_ppc64le.s => sum_ppc64x.s}  |    30 +-
 src/vendor/golang.org/x/net/route/address.go  |    27 +-
 src/vendor/modules.txt                        |     6 +-
 18 files changed, 10124 insertions(+), 2594 deletions(-)
 rename src/vendor/golang.org/x/crypto/chacha20/{chacha_ppc64le.go => chacha_ppc64x.go} (89%)
 rename src/vendor/golang.org/x/crypto/chacha20/{chacha_ppc64le.s => chacha_ppc64x.s} (76%)
 rename src/vendor/golang.org/x/crypto/internal/poly1305/{sum_ppc64le.go => sum_ppc64x.go} (95%)
 rename src/vendor/golang.org/x/crypto/internal/poly1305/{sum_ppc64le.s => sum_ppc64x.s} (89%)

diff --git a/src/cmd/go.mod b/src/cmd/go.mod
index 8488e0a47addff..b441fe55a929e8 100644
--- a/src/cmd/go.mod
+++ b/src/cmd/go.mod
@@ -10,12 +10,12 @@ require (
 	golang.org/x/sync v0.9.0
 	golang.org/x/sys v0.27.0
 	golang.org/x/telemetry v0.0.0-20240828202201-a797f331ea97
-	golang.org/x/term v0.22.1-0.20240716160707-d4346f0be292
+	golang.org/x/term v0.26.0
 	golang.org/x/tools v0.27.0
 )
 
 require (
 	github.com/ianlancetaylor/demangle v0.0.0-20240912202439-0a2b6291aafd // indirect
-	golang.org/x/text v0.19.0 // indirect
+	golang.org/x/text v0.20.0 // indirect
 	rsc.io/markdown v0.0.0-20240306144322-0bf8f97ee8ef // indirect
 )
diff --git a/src/cmd/go.sum b/src/cmd/go.sum
index 7ad2cb7f9447a0..685fd76a0f896d 100644
--- a/src/cmd/go.sum
+++ b/src/cmd/go.sum
@@ -18,10 +18,10 @@ golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s=
 golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
 golang.org/x/telemetry v0.0.0-20240828202201-a797f331ea97 h1:5xPN7d0u5VdgF2gFFXUDaeD3NP1pPgFMHocnCQGAh5M=
 golang.org/x/telemetry v0.0.0-20240828202201-a797f331ea97/go.mod h1:m7R/r+o5h7UvF2JD9n2iLSGY4v8v+zNSyTJ6xynLrqs=
-golang.org/x/term v0.22.1-0.20240716160707-d4346f0be292 h1:BOrQi08eIX3cDgGcMgFONf27MxXigcYa9x+iW5JuCXw=
-golang.org/x/term v0.22.1-0.20240716160707-d4346f0be292/go.mod h1:F3qCibpT5AMpCRfhfT53vVJwhLtIVHhB9XDjfFvnMI4=
-golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM=
-golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY=
+golang.org/x/term v0.26.0 h1:WEQa6V3Gja/BhNxg540hBip/kkaYtRg3cxg4oXSw4AU=
+golang.org/x/term v0.26.0/go.mod h1:Si5m1o57C5nBNQo5z1iq+XDijt21BDBDp2bK0QI8e3E=
+golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug=
+golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4=
 golang.org/x/tools v0.27.0 h1:qEKojBykQkQ4EynWy4S8Weg69NumxKdn40Fce3uc/8o=
 golang.org/x/tools v0.27.0/go.mod h1:sUi0ZgbwW9ZPAq26Ekut+weQPR5eIM6GQLQ1Yjm1H0Q=
 rsc.io/markdown v0.0.0-20240306144322-0bf8f97ee8ef h1:mqLYrXCXYEZOop9/Dbo6RPX11539nwiCNBb1icVPmw8=
diff --git a/src/cmd/vendor/golang.org/x/term/README.md b/src/cmd/vendor/golang.org/x/term/README.md
index d03d0aefef685c..05ff623f94f1d7 100644
--- a/src/cmd/vendor/golang.org/x/term/README.md
+++ b/src/cmd/vendor/golang.org/x/term/README.md
@@ -4,16 +4,13 @@
 
 This repository provides Go terminal and console support packages.
 
-## Download/Install
-
-The easiest way to install is to run `go get -u golang.org/x/term`. You can
-also manually git clone the repository to `$GOPATH/src/golang.org/x/term`.
-
 ## Report Issues / Send Patches
 
 This repository uses Gerrit for code changes. To learn how to submit changes to
-this repository, see https://golang.org/doc/contribute.html.
+this repository, see https://go.dev/doc/contribute.
+
+The git repository is https://go.googlesource.com/term.
 
 The main issue tracker for the term repository is located at
-https://github.com/golang/go/issues. Prefix your issue with "x/term:" in the
+https://go.dev/issues. Prefix your issue with "x/term:" in the
 subject line, so it is easy to find.
diff --git a/src/cmd/vendor/golang.org/x/term/term_windows.go b/src/cmd/vendor/golang.org/x/term/term_windows.go
index 465f560604e41a..df6bf948e14070 100644
--- a/src/cmd/vendor/golang.org/x/term/term_windows.go
+++ b/src/cmd/vendor/golang.org/x/term/term_windows.go
@@ -26,6 +26,7 @@ func makeRaw(fd int) (*State, error) {
 		return nil, err
 	}
 	raw := st &^ (windows.ENABLE_ECHO_INPUT | windows.ENABLE_PROCESSED_INPUT | windows.ENABLE_LINE_INPUT | windows.ENABLE_PROCESSED_OUTPUT)
+	raw |= windows.ENABLE_VIRTUAL_TERMINAL_INPUT
 	if err := windows.SetConsoleMode(windows.Handle(fd), raw); err != nil {
 		return nil, err
 	}
diff --git a/src/cmd/vendor/modules.txt b/src/cmd/vendor/modules.txt
index cc2ce3df7e7e70..6ea68b2153e4f3 100644
--- a/src/cmd/vendor/modules.txt
+++ b/src/cmd/vendor/modules.txt
@@ -60,10 +60,10 @@ golang.org/x/telemetry/internal/crashmonitor
 golang.org/x/telemetry/internal/mmap
 golang.org/x/telemetry/internal/telemetry
 golang.org/x/telemetry/internal/upload
-# golang.org/x/term v0.22.1-0.20240716160707-d4346f0be292
+# golang.org/x/term v0.26.0
 ## explicit; go 1.18
 golang.org/x/term
-# golang.org/x/text v0.19.0
+# golang.org/x/text v0.20.0
 ## explicit; go 1.18
 golang.org/x/text/cases
 golang.org/x/text/internal
diff --git a/src/go.mod b/src/go.mod
index 7a7b99150fc297..47ef05be87c09a 100644
--- a/src/go.mod
+++ b/src/go.mod
@@ -3,11 +3,11 @@ module std
 go 1.24
 
 require (
-	golang.org/x/crypto v0.25.1-0.20240722173533-bb80217080b0
-	golang.org/x/net v0.27.1-0.20240722181819-765c7e89b3bd
+	golang.org/x/crypto v0.29.0
+	golang.org/x/net v0.31.0
 )
 
 require (
 	golang.org/x/sys v0.27.0 // indirect
-	golang.org/x/text v0.19.0 // indirect
+	golang.org/x/text v0.20.0 // indirect
 )
diff --git a/src/go.sum b/src/go.sum
index 26a27ac8099c78..9d72b7d6542c13 100644
--- a/src/go.sum
+++ b/src/go.sum
@@ -1,8 +1,8 @@
-golang.org/x/crypto v0.25.1-0.20240722173533-bb80217080b0 h1:wxHbFWyu21uEPJJnYaSDaHSWbvnZ9gLSSOPwnEc3lLM=
-golang.org/x/crypto v0.25.1-0.20240722173533-bb80217080b0/go.mod h1:T+wALwcMOSE0kXgUAnPAHqTLW+XHgcELELW8VaDgm/M=
-golang.org/x/net v0.27.1-0.20240722181819-765c7e89b3bd h1:pHzwejE8Zkb94bG4nA+fUeskKPFp1HPldrhv62dabro=
-golang.org/x/net v0.27.1-0.20240722181819-765c7e89b3bd/go.mod h1:dDi0PyhWNoiUOrAS8uXv/vnScO4wnHQO4mj9fn/RytE=
+golang.org/x/crypto v0.29.0 h1:L5SG1JTTXupVV3n6sUqMTeWbjAyfPwoda2DLX8J8FrQ=
+golang.org/x/crypto v0.29.0/go.mod h1:+F4F4N5hv6v38hfeYwTdx20oUvLLc+QfrE9Ax9HtgRg=
+golang.org/x/net v0.31.0 h1:68CPQngjLL0r2AlUKiSxtQFKvzRVbnzLwMUn5SzcLHo=
+golang.org/x/net v0.31.0/go.mod h1:P4fl1q7dY2hnZFxEk4pPSkDHF+QqjitcnDjUQyMM+pM=
 golang.org/x/sys v0.27.0 h1:wBqf8DvsY9Y/2P8gAfPDEYNuS30J4lPHJxXSb/nJZ+s=
 golang.org/x/sys v0.27.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA=
-golang.org/x/text v0.19.0 h1:kTxAhCbGbxhK0IwgSKiMO5awPoDQ0RpfiVYBfK860YM=
-golang.org/x/text v0.19.0/go.mod h1:BuEKDfySbSR4drPmRPG/7iBdf8hvFMuRexcpahXilzY=
+golang.org/x/text v0.20.0 h1:gK/Kv2otX8gz+wn7Rmb3vT96ZwuoxnQlY+HlJVj7Qug=
+golang.org/x/text v0.20.0/go.mod h1:D4IsuqiFMhST5bX19pQ9ikHC2GsaKyk/oF+pn3ducp4=
diff --git a/src/net/http/h2_bundle.go b/src/net/http/h2_bundle.go
index af86c1430dbf6c..6b40923a867a15 100644
--- a/src/net/http/h2_bundle.go
+++ b/src/net/http/h2_bundle.go
@@ -883,7 +883,7 @@ func (c *http2dialCall) dial(ctx context.Context, addr string) {
 // This code decides which ones live or die.
 // The return value used is whether c was used.
 // c is never closed.
-func (p *http2clientConnPool) addConnIfNeeded(key string, t *http2Transport, c *tls.Conn) (used bool, err error) {
+func (p *http2clientConnPool) addConnIfNeeded(key string, t *http2Transport, c net.Conn) (used bool, err error) {
 	p.mu.Lock()
 	for _, cc := range p.conns[key] {
 		if cc.CanTakeNewRequest() {
@@ -919,8 +919,8 @@ type http2addConnCall struct {
 	err  error
 }
 
-func (c *http2addConnCall) run(t *http2Transport, key string, tc *tls.Conn) {
-	cc, err := t.NewClientConn(tc)
+func (c *http2addConnCall) run(t *http2Transport, key string, nc net.Conn) {
+	cc, err := t.NewClientConn(nc)
 
 	p := c.p
 	p.mu.Lock()
@@ -1035,6 +1035,169 @@ func http2shouldRetryDial(call *http2dialCall, req *Request) bool {
 	return call.ctx.Err() != nil
 }
 
+// http2Config is a package-internal version of net/http.HTTP2Config.
+//
+// http.HTTP2Config was added in Go 1.24.
+// When running with a version of net/http that includes HTTP2Config,
+// we merge the configuration with the fields in Transport or Server
+// to produce an http2Config.
+//
+// Zero valued fields in http2Config are interpreted as in the
+// net/http.HTTP2Config documentation.
+//
+// Precedence order for reconciling configurations is:
+//
+//   - Use the net/http.{Server,Transport}.HTTP2Config value, when non-zero.
+//   - Otherwise use the http2.{Server,Transport} value.
+//   - If the resulting value is zero or out of range, use a default.
+type http2http2Config struct {
+	MaxConcurrentStreams         uint32
+	MaxDecoderHeaderTableSize    uint32
+	MaxEncoderHeaderTableSize    uint32
+	MaxReadFrameSize             uint32
+	MaxUploadBufferPerConnection int32
+	MaxUploadBufferPerStream     int32
+	SendPingTimeout              time.Duration
+	PingTimeout                  time.Duration
+	WriteByteTimeout             time.Duration
+	PermitProhibitedCipherSuites bool
+	CountError                   func(errType string)
+}
+
+// configFromServer merges configuration settings from
+// net/http.Server.HTTP2Config and http2.Server.
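+//
+// For example (an illustrative sketch, not defaults; the variable names
+// here are hypothetical): a server set up as
+//
+//	h1 := &Server{HTTP2: &HTTP2Config{MaxConcurrentStreams: 500}}
+//	h2 := &http2Server{MaxConcurrentStreams: 250}
+//
+// merges to MaxConcurrentStreams == 500, since the net/http value takes
+// precedence; if both fields were zero, http2setConfigDefaults would fall
+// back to http2defaultMaxStreams.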
+func http2configFromServer(h1 *Server, h2 *http2Server) http2http2Config {
+	conf := http2http2Config{
+		MaxConcurrentStreams:         h2.MaxConcurrentStreams,
+		MaxEncoderHeaderTableSize:    h2.MaxEncoderHeaderTableSize,
+		MaxDecoderHeaderTableSize:    h2.MaxDecoderHeaderTableSize,
+		MaxReadFrameSize:             h2.MaxReadFrameSize,
+		MaxUploadBufferPerConnection: h2.MaxUploadBufferPerConnection,
+		MaxUploadBufferPerStream:     h2.MaxUploadBufferPerStream,
+		SendPingTimeout:              h2.ReadIdleTimeout,
+		PingTimeout:                  h2.PingTimeout,
+		WriteByteTimeout:             h2.WriteByteTimeout,
+		PermitProhibitedCipherSuites: h2.PermitProhibitedCipherSuites,
+		CountError:                   h2.CountError,
+	}
+	http2fillNetHTTPServerConfig(&conf, h1)
+	http2setConfigDefaults(&conf, true)
+	return conf
+}
+
+// configFromTransport merges configuration settings from h2 and h2.t1.HTTP2
+// (the net/http Transport).
+func http2configFromTransport(h2 *http2Transport) http2http2Config {
+	conf := http2http2Config{
+		MaxEncoderHeaderTableSize: h2.MaxEncoderHeaderTableSize,
+		MaxDecoderHeaderTableSize: h2.MaxDecoderHeaderTableSize,
+		MaxReadFrameSize:          h2.MaxReadFrameSize,
+		SendPingTimeout:           h2.ReadIdleTimeout,
+		PingTimeout:               h2.PingTimeout,
+		WriteByteTimeout:          h2.WriteByteTimeout,
+	}
+
+	// Unlike most config fields, where out-of-range values revert to the default,
+	// Transport.MaxReadFrameSize clips.
+	if conf.MaxReadFrameSize < http2minMaxFrameSize {
+		conf.MaxReadFrameSize = http2minMaxFrameSize
+	} else if conf.MaxReadFrameSize > http2maxFrameSize {
+		conf.MaxReadFrameSize = http2maxFrameSize
+	}
+
+	if h2.t1 != nil {
+		http2fillNetHTTPTransportConfig(&conf, h2.t1)
+	}
+	http2setConfigDefaults(&conf, false)
+	return conf
+}
+
+func http2setDefault[T ~int | ~int32 | ~uint32 | ~int64](v *T, minval, maxval, defval T) {
+	if *v < minval || *v > maxval {
+		*v = defval
+	}
+}
+
+func http2setConfigDefaults(conf *http2http2Config, server bool) {
+	http2setDefault(&conf.MaxConcurrentStreams, 1, math.MaxUint32, http2defaultMaxStreams)
+	http2setDefault(&conf.MaxEncoderHeaderTableSize, 1, math.MaxUint32, http2initialHeaderTableSize)
+	http2setDefault(&conf.MaxDecoderHeaderTableSize, 1, math.MaxUint32, http2initialHeaderTableSize)
+	if server {
+		http2setDefault(&conf.MaxUploadBufferPerConnection, http2initialWindowSize, math.MaxInt32, 1<<20)
+	} else {
+		http2setDefault(&conf.MaxUploadBufferPerConnection, http2initialWindowSize, math.MaxInt32, http2transportDefaultConnFlow)
+	}
+	if server {
+		http2setDefault(&conf.MaxUploadBufferPerStream, 1, math.MaxInt32, 1<<20)
+	} else {
+		http2setDefault(&conf.MaxUploadBufferPerStream, 1, math.MaxInt32, http2transportDefaultStreamFlow)
+	}
+	http2setDefault(&conf.MaxReadFrameSize, http2minMaxFrameSize, http2maxFrameSize, http2defaultMaxReadFrameSize)
+	http2setDefault(&conf.PingTimeout, 1, math.MaxInt64, 15*time.Second)
+}
+
+// adjustHTTP1MaxHeaderSize converts a limit in bytes on the size of an HTTP/1 header
+// to an HTTP/2 MAX_HEADER_LIST_SIZE value.
+func http2adjustHTTP1MaxHeaderSize(n int64) int64 {
+	// http2's count is in a slightly different unit and includes 32 bytes per pair.
+	// So, take the net/http.Server value and pad it up a bit, assuming 10 headers.
+	const perFieldOverhead = 32 // per http2 spec
+	const typicalHeaders = 10   // conservative
+	return n + typicalHeaders*perFieldOverhead
+}
+
+// fillNetHTTPServerConfig sets fields in conf from srv.HTTP2.
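+// Only fields that are actually set in srv.HTTP2 take effect:
+// fillNetHTTPConfig checks each field for a non-zero (or non-nil) value
+// before copying it into conf, so a partially populated HTTP2Config
+// overrides only what it specifies.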
+func http2fillNetHTTPServerConfig(conf *http2http2Config, srv *Server) {
+	http2fillNetHTTPConfig(conf, srv.HTTP2)
+}
+
+// fillNetHTTPTransportConfig sets fields in conf from tr.HTTP2.
+func http2fillNetHTTPTransportConfig(conf *http2http2Config, tr *Transport) {
+	http2fillNetHTTPConfig(conf, tr.HTTP2)
+}
+
+func http2fillNetHTTPConfig(conf *http2http2Config, h2 *HTTP2Config) {
+	if h2 == nil {
+		return
+	}
+	if h2.MaxConcurrentStreams != 0 {
+		conf.MaxConcurrentStreams = uint32(h2.MaxConcurrentStreams)
+	}
+	if h2.MaxEncoderHeaderTableSize != 0 {
+		conf.MaxEncoderHeaderTableSize = uint32(h2.MaxEncoderHeaderTableSize)
+	}
+	if h2.MaxDecoderHeaderTableSize != 0 {
+		conf.MaxDecoderHeaderTableSize = uint32(h2.MaxDecoderHeaderTableSize)
+	}
+	if h2.MaxReadFrameSize != 0 {
+		conf.MaxReadFrameSize = uint32(h2.MaxReadFrameSize)
+	}
+	if h2.MaxReceiveBufferPerConnection != 0 {
+		conf.MaxUploadBufferPerConnection = int32(h2.MaxReceiveBufferPerConnection)
+	}
+	if h2.MaxReceiveBufferPerStream != 0 {
+		conf.MaxUploadBufferPerStream = int32(h2.MaxReceiveBufferPerStream)
+	}
+	if h2.SendPingTimeout != 0 {
+		conf.SendPingTimeout = h2.SendPingTimeout
+	}
+	if h2.PingTimeout != 0 {
+		conf.PingTimeout = h2.PingTimeout
+	}
+	if h2.WriteByteTimeout != 0 {
+		conf.WriteByteTimeout = h2.WriteByteTimeout
+	}
+	if h2.PermitProhibitedCipherSuites {
+		conf.PermitProhibitedCipherSuites = true
+	}
+	if h2.CountError != nil {
+		conf.CountError = h2.CountError
+	}
+}
+
 // Buffer chunks are allocated from a pool to reduce pressure on GC.
 // The maximum wasted space per dataBuffer is 2x the largest size class,
 // which happens when the dataBuffer has multiple chunks and there is
@@ -3550,13 +3713,19 @@ func (cw http2closeWaiter) Wait() {
 // Its buffered writer is lazily allocated as needed, to minimize
 // idle memory usage with many connections.
 type http2bufferedWriter struct {
-	_  http2incomparable
-	w  io.Writer     // immutable
-	bw *bufio.Writer // non-nil when data is buffered
+	_           http2incomparable
+	group       http2synctestGroupInterface // immutable
+	conn        net.Conn                    // immutable
+	bw          *bufio.Writer               // non-nil when data is buffered
+	byteTimeout time.Duration               // immutable, WriteByteTimeout
 }
 
-func http2newBufferedWriter(w io.Writer) *http2bufferedWriter {
-	return &http2bufferedWriter{w: w}
+func http2newBufferedWriter(group http2synctestGroupInterface, conn net.Conn, timeout time.Duration) *http2bufferedWriter {
+	return &http2bufferedWriter{
+		group:       group,
+		conn:        conn,
+		byteTimeout: timeout,
+	}
 }
 
 // bufWriterPoolBufferSize is the size of bufio.Writer's
@@ -3583,7 +3752,7 @@ func (w *http2bufferedWriter) Available() int {
 func (w *http2bufferedWriter) Write(p []byte) (n int, err error) {
 	if w.bw == nil {
 		bw := http2bufWriterPool.Get().(*bufio.Writer)
-		bw.Reset(w.w)
+		bw.Reset((*http2bufferedWriterTimeoutWriter)(w))
 		w.bw = bw
 	}
 	return w.bw.Write(p)
@@ -3601,6 +3770,38 @@ func (w *http2bufferedWriter) Flush() error {
 	return err
 }
 
+type http2bufferedWriterTimeoutWriter http2bufferedWriter
+
+func (w *http2bufferedWriterTimeoutWriter) Write(p []byte) (n int, err error) {
+	return http2writeWithByteTimeout(w.group, w.conn, w.byteTimeout, p)
+}
+
+// writeWithByteTimeout writes to conn.
+// If more than timeout passes without any bytes being written to the connection,
+// the write fails.
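+//
+// For example (illustrative): with timeout = 5 * time.Second, a peer that
+// keeps accepting at least one byte every 5 seconds can extend a large
+// write indefinitely, while a peer that accepts nothing for 5 seconds
+// causes the write to fail with a deadline error.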
+func http2writeWithByteTimeout(group http2synctestGroupInterface, conn net.Conn, timeout time.Duration, p []byte) (n int, err error) {
+	if timeout <= 0 {
+		return conn.Write(p)
+	}
+	for {
+		var now time.Time
+		if group == nil {
+			now = time.Now()
+		} else {
+			now = group.Now()
+		}
+		conn.SetWriteDeadline(now.Add(timeout))
+		nn, err := conn.Write(p[n:])
+		n += nn
+		if n == len(p) || nn == 0 || !errors.Is(err, os.ErrDeadlineExceeded) {
+			// Either we finished the write, made no progress, or hit the deadline.
+			// Whichever it is, we're done now.
+			conn.SetWriteDeadline(time.Time{})
+			return n, err
+		}
+	}
+}
+
 func http2mustUint31(v int32) uint32 {
 	if v < 0 || v > 2147483647 {
 		panic("out of range")
@@ -3882,10 +4083,14 @@ func (p *http2pipe) Done() <-chan struct{} {
 }
 
 const (
-	http2prefaceTimeout        = 10 * time.Second
-	http2firstSettingsTimeout  = 2 * time.Second // should be in-flight with preface anyway
-	http2handlerChunkWriteSize = 4 << 10
-	http2defaultMaxStreams     = 250 // TODO: make this 100 as the GFE seems to?
+	http2prefaceTimeout         = 10 * time.Second
+	http2firstSettingsTimeout   = 2 * time.Second // should be in-flight with preface anyway
+	http2handlerChunkWriteSize  = 4 << 10
+	http2defaultMaxStreams      = 250 // TODO: make this 100 as the GFE seems to?
+
+	// maxQueuedControlFrames is the maximum number of control frames like
+	// SETTINGS, PING and RST_STREAM that will be queued for writing before
+	// the connection is closed to prevent memory exhaustion attacks.
 	http2maxQueuedControlFrames = 10000
 )
 
@@ -3957,6 +4162,22 @@ type http2Server struct {
 	// If zero or negative, there is no timeout.
 	IdleTimeout time.Duration
 
+	// ReadIdleTimeout is the timeout after which a health check using a ping
+	// frame will be carried out if no frame is received on the connection.
+	// If zero, no health check is performed.
+	ReadIdleTimeout time.Duration
+
+	// PingTimeout is the timeout after which the connection will be closed
+	// if a response to a ping is not received.
+	// If zero, a default of 15 seconds is used.
+	PingTimeout time.Duration
+
+	// WriteByteTimeout is the timeout after which a connection will be
+	// closed if no data can be written to it. The timeout begins when data is
+	// available to write, and is extended whenever any bytes are written.
+	// If zero or negative, there is no timeout.
+	WriteByteTimeout time.Duration
+
 	// MaxUploadBufferPerConnection is the size of the initial flow
 	// control window for each connection. The HTTP/2 spec does not
 	// allow this to be smaller than 65535 or larger than 2^32-1.
@@ -4019,57 +4240,6 @@ func (s *http2Server) afterFunc(d time.Duration, f func()) http2timer { return http2timeTimer{time.AfterFunc(d, f)} } -func (s *http2Server) initialConnRecvWindowSize() int32 { - if s.MaxUploadBufferPerConnection >= http2initialWindowSize { - return s.MaxUploadBufferPerConnection - } - return 1 << 20 -} - -func (s *http2Server) initialStreamRecvWindowSize() int32 { - if s.MaxUploadBufferPerStream > 0 { - return s.MaxUploadBufferPerStream - } - return 1 << 20 -} - -func (s *http2Server) maxReadFrameSize() uint32 { - if v := s.MaxReadFrameSize; v >= http2minMaxFrameSize && v <= http2maxFrameSize { - return v - } - return http2defaultMaxReadFrameSize -} - -func (s *http2Server) maxConcurrentStreams() uint32 { - if v := s.MaxConcurrentStreams; v > 0 { - return v - } - return http2defaultMaxStreams -} - -func (s *http2Server) maxDecoderHeaderTableSize() uint32 { - if v := s.MaxDecoderHeaderTableSize; v > 0 { - return v - } - return http2initialHeaderTableSize -} - -func (s *http2Server) maxEncoderHeaderTableSize() uint32 { - if v := s.MaxEncoderHeaderTableSize; v > 0 { - return v - } - return http2initialHeaderTableSize -} - -// maxQueuedControlFrames is the maximum number of control frames like -// SETTINGS, PING and RST_STREAM that will be queued for writing before -// the connection is closed to prevent memory exhaustion attacks. -func (s *http2Server) maxQueuedControlFrames() int { - // TODO: if anybody asks, add a Server field, and remember to define the - // behavior of negative values. - return http2maxQueuedControlFrames -} - type http2serverInternalState struct { mu sync.Mutex activeConns map[*http2serverConn]struct{} @@ -4166,7 +4336,7 @@ func http2ConfigureServer(s *Server, conf *http2Server) error { if s.TLSNextProto == nil { s.TLSNextProto = map[string]func(*Server, *tls.Conn, Handler){} } - protoHandler := func(hs *Server, c *tls.Conn, h Handler) { + protoHandler := func(hs *Server, c net.Conn, h Handler, sawClientPreface bool) { if http2testHookOnConn != nil { http2testHookOnConn() } @@ -4183,12 +4353,31 @@ func http2ConfigureServer(s *Server, conf *http2Server) error { ctx = bc.BaseContext() } conf.ServeConn(c, &http2ServeConnOpts{ - Context: ctx, - Handler: h, - BaseConfig: hs, + Context: ctx, + Handler: h, + BaseConfig: hs, + SawClientPreface: sawClientPreface, }) } - s.TLSNextProto[http2NextProtoTLS] = protoHandler + s.TLSNextProto[http2NextProtoTLS] = func(hs *Server, c *tls.Conn, h Handler) { + protoHandler(hs, c, h, false) + } + // The "unencrypted_http2" TLSNextProto key is used to pass off non-TLS HTTP/2 conns. + // + // A connection passed in this method has already had the HTTP/2 preface read from it. 
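+	//
+	// The *tls.Conn received here is only a carrier: its underlying
+	// net.Conn is expected to provide an UnencryptedNetConn method (see
+	// unencryptedNetConnFromTLSConn below), which unwraps the actual
+	// unencrypted connection.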
+ s.TLSNextProto[http2nextProtoUnencryptedHTTP2] = func(hs *Server, c *tls.Conn, h Handler) { + nc, err := http2unencryptedNetConnFromTLSConn(c) + if err != nil { + if lg := hs.ErrorLog; lg != nil { + lg.Print(err) + } else { + log.Print(err) + } + go c.Close() + return + } + protoHandler(hs, nc, h, true) + } return nil } @@ -4270,13 +4459,15 @@ func (s *http2Server) serveConn(c net.Conn, opts *http2ServeConnOpts, newf func( baseCtx, cancel := http2serverConnBaseContext(c, opts) defer cancel() + http1srv := opts.baseConfig() + conf := http2configFromServer(http1srv, s) sc := &http2serverConn{ srv: s, - hs: opts.baseConfig(), + hs: http1srv, conn: c, baseCtx: baseCtx, remoteAddrStr: c.RemoteAddr().String(), - bw: http2newBufferedWriter(c), + bw: http2newBufferedWriter(s.group, c, conf.WriteByteTimeout), handler: opts.handler(), streams: make(map[uint32]*http2stream), readFrameCh: make(chan http2readFrameResult), @@ -4286,9 +4477,12 @@ func (s *http2Server) serveConn(c net.Conn, opts *http2ServeConnOpts, newf func( bodyReadCh: make(chan http2bodyReadMsg), // buffering doesn't matter either way doneServing: make(chan struct{}), clientMaxStreams: math.MaxUint32, // Section 6.5.2: "Initially, there is no limit to this value" - advMaxStreams: s.maxConcurrentStreams(), + advMaxStreams: conf.MaxConcurrentStreams, initialStreamSendWindowSize: http2initialWindowSize, + initialStreamRecvWindowSize: conf.MaxUploadBufferPerStream, maxFrameSize: http2initialMaxFrameSize, + pingTimeout: conf.PingTimeout, + countErrorFunc: conf.CountError, serveG: http2newGoroutineLock(), pushEnabled: true, sawClientPreface: opts.SawClientPreface, @@ -4321,15 +4515,15 @@ func (s *http2Server) serveConn(c net.Conn, opts *http2ServeConnOpts, newf func( sc.flow.add(http2initialWindowSize) sc.inflow.init(http2initialWindowSize) sc.hpackEncoder = hpack.NewEncoder(&sc.headerWriteBuf) - sc.hpackEncoder.SetMaxDynamicTableSizeLimit(s.maxEncoderHeaderTableSize()) + sc.hpackEncoder.SetMaxDynamicTableSizeLimit(conf.MaxEncoderHeaderTableSize) fr := http2NewFramer(sc.bw, c) - if s.CountError != nil { - fr.countError = s.CountError + if conf.CountError != nil { + fr.countError = conf.CountError } - fr.ReadMetaHeaders = hpack.NewDecoder(s.maxDecoderHeaderTableSize(), nil) + fr.ReadMetaHeaders = hpack.NewDecoder(conf.MaxDecoderHeaderTableSize, nil) fr.MaxHeaderListSize = sc.maxHeaderListSize() - fr.SetMaxReadFrameSize(s.maxReadFrameSize()) + fr.SetMaxReadFrameSize(conf.MaxReadFrameSize) sc.framer = fr if tc, ok := c.(http2connectionStater); ok { @@ -4362,7 +4556,7 @@ func (s *http2Server) serveConn(c net.Conn, opts *http2ServeConnOpts, newf func( // So for now, do nothing here again. } - if !s.PermitProhibitedCipherSuites && http2isBadCipher(sc.tlsState.CipherSuite) { + if !conf.PermitProhibitedCipherSuites && http2isBadCipher(sc.tlsState.CipherSuite) { // "Endpoints MAY choose to generate a connection error // (Section 5.4.1) of type INADEQUATE_SECURITY if one of // the prohibited cipher suites are negotiated." 
@@ -4399,7 +4593,7 @@ func (s *http2Server) serveConn(c net.Conn, opts *http2ServeConnOpts, newf func( opts.UpgradeRequest = nil } - sc.serve() + sc.serve(conf) } func http2serverConnBaseContext(c net.Conn, opts *http2ServeConnOpts) (ctx context.Context, cancel func()) { @@ -4439,6 +4633,7 @@ type http2serverConn struct { tlsState *tls.ConnectionState // shared by all handlers, like net/http remoteAddrStr string writeSched http2WriteScheduler + countErrorFunc func(errType string) // Everything following is owned by the serve loop; use serveG.check(): serveG http2goroutineLock // used to verify funcs are on serve() @@ -4458,6 +4653,7 @@ type http2serverConn struct { streams map[uint32]*http2stream unstartedHandlers []http2unstartedHandler initialStreamSendWindowSize int32 + initialStreamRecvWindowSize int32 maxFrameSize int32 peerMaxHeaderListSize uint32 // zero means unknown (default) canonHeader map[string]string // http2-lower-case -> Go-Canonical-Case @@ -4468,9 +4664,14 @@ type http2serverConn struct { inGoAway bool // we've started to or sent GOAWAY inFrameScheduleLoop bool // whether we're in the scheduleFrameWrite loop needToSendGoAway bool // we need to schedule a GOAWAY frame write + pingSent bool + sentPingData [8]byte goAwayCode http2ErrCode shutdownTimer http2timer // nil until used idleTimer http2timer // nil if unused + readIdleTimeout time.Duration + pingTimeout time.Duration + readIdleTimer http2timer // nil if unused // Owned by the writeFrameAsync goroutine: headerWriteBuf bytes.Buffer @@ -4485,11 +4686,7 @@ func (sc *http2serverConn) maxHeaderListSize() uint32 { if n <= 0 { n = DefaultMaxHeaderBytes } - // http2's count is in a slightly different unit and includes 32 bytes per pair. - // So, take the net/http.Server value and pad it up a bit, assuming 10 headers. - const perFieldOverhead = 32 // per http2 spec - const typicalHeaders = 10 // conservative - return uint32(n + typicalHeaders*perFieldOverhead) + return uint32(http2adjustHTTP1MaxHeaderSize(int64(n))) } func (sc *http2serverConn) curOpenStreams() uint32 { @@ -4756,7 +4953,7 @@ func (sc *http2serverConn) notePanic() { } } -func (sc *http2serverConn) serve() { +func (sc *http2serverConn) serve(conf http2http2Config) { sc.serveG.check() defer sc.notePanic() defer sc.conn.Close() @@ -4770,18 +4967,18 @@ func (sc *http2serverConn) serve() { sc.writeFrame(http2FrameWriteRequest{ write: http2writeSettings{ - {http2SettingMaxFrameSize, sc.srv.maxReadFrameSize()}, + {http2SettingMaxFrameSize, conf.MaxReadFrameSize}, {http2SettingMaxConcurrentStreams, sc.advMaxStreams}, {http2SettingMaxHeaderListSize, sc.maxHeaderListSize()}, - {http2SettingHeaderTableSize, sc.srv.maxDecoderHeaderTableSize()}, - {http2SettingInitialWindowSize, uint32(sc.srv.initialStreamRecvWindowSize())}, + {http2SettingHeaderTableSize, conf.MaxDecoderHeaderTableSize}, + {http2SettingInitialWindowSize, uint32(sc.initialStreamRecvWindowSize)}, }, }) sc.unackedSettings++ // Each connection starts with initialWindowSize inflow tokens. // If a higher value is configured, we add more tokens. 
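+	// For example (illustrative): with conf.MaxUploadBufferPerConnection
+	// left at the server default of 1<<20, the connection starts with
+	// 65535 tokens and this sends a WINDOW_UPDATE for the remaining
+	// 1048576-65535 = 983041 tokens.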
-	if diff := sc.srv.initialConnRecvWindowSize() - http2initialWindowSize; diff > 0 {
+	if diff := conf.MaxUploadBufferPerConnection - http2initialWindowSize; diff > 0 {
 		sc.sendWindowUpdate(nil, int(diff))
 	}
 
@@ -4801,11 +4998,18 @@ func (sc *http2serverConn) serve() {
 		defer sc.idleTimer.Stop()
 	}
 
+	if conf.SendPingTimeout > 0 {
+		sc.readIdleTimeout = conf.SendPingTimeout
+		sc.readIdleTimer = sc.srv.afterFunc(conf.SendPingTimeout, sc.onReadIdleTimer)
+		defer sc.readIdleTimer.Stop()
+	}
+
 	go sc.readFrames() // closed by defer sc.conn.Close above
 
 	settingsTimer := sc.srv.afterFunc(http2firstSettingsTimeout, sc.onSettingsTimer)
 	defer settingsTimer.Stop()
 
+	lastFrameTime := sc.srv.now()
 	loopNum := 0
 	for {
 		loopNum++
@@ -4819,6 +5023,7 @@ func (sc *http2serverConn) serve() {
 		case res := <-sc.wroteFrameCh:
 			sc.wroteFrame(res)
 		case res := <-sc.readFrameCh:
+			lastFrameTime = sc.srv.now()
 			// Process any written frames before reading new frames from the client since a
 			// written frame could have triggered a new stream to be started.
 			if sc.writingFrameAsync {
@@ -4850,6 +5055,8 @@ func (sc *http2serverConn) serve() {
 			case http2idleTimerMsg:
 				sc.vlogf("connection is idle")
 				sc.goAway(http2ErrCodeNo)
+			case http2readIdleTimerMsg:
+				sc.handlePingTimer(lastFrameTime)
 			case http2shutdownTimerMsg:
 				sc.vlogf("GOAWAY close timer fired; closing conn from %v", sc.conn.RemoteAddr())
 				return
@@ -4872,7 +5079,7 @@ func (sc *http2serverConn) serve() {
 		// If the peer is causing us to generate a lot of control frames,
 		// but not reading them from us, assume they are trying to make us
 		// run out of memory.
-		if sc.queuedControlFrames > sc.srv.maxQueuedControlFrames() {
+		if sc.queuedControlFrames > http2maxQueuedControlFrames {
 			sc.vlogf("http2: too many control frames in send queue, closing connection")
 			return
 		}
@@ -4888,12 +5095,39 @@
 	}
 }
 
+func (sc *http2serverConn) handlePingTimer(lastFrameReadTime time.Time) {
+	if sc.pingSent {
+		sc.vlogf("timeout waiting for PING response")
+		sc.conn.Close()
+		return
+	}
+
+	pingAt := lastFrameReadTime.Add(sc.readIdleTimeout)
+	now := sc.srv.now()
+	if pingAt.After(now) {
+		// We received frames since arming the ping timer.
+		// Reset it for the next possible timeout.
+		sc.readIdleTimer.Reset(pingAt.Sub(now))
+		return
+	}
+
+	sc.pingSent = true
+	// Ignore crypto/rand.Read errors: It generally can't fail, and worst case
+	// if it does is we send a PING frame containing 0s.
+	_, _ = rand.Read(sc.sentPingData[:])
+	sc.writeFrame(http2FrameWriteRequest{
+		write: &http2writePing{data: sc.sentPingData},
+	})
+	sc.readIdleTimer.Reset(sc.pingTimeout)
+}
+
 type http2serverMessage int
 
 // Message values sent to serveMsgCh.
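+// Each message is a distinct pointer value, so the serve loop can tell
+// the timer and shutdown messages apart by identity alone.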
var ( http2settingsTimerMsg = new(http2serverMessage) http2idleTimerMsg = new(http2serverMessage) + http2readIdleTimerMsg = new(http2serverMessage) http2shutdownTimerMsg = new(http2serverMessage) http2gracefulShutdownMsg = new(http2serverMessage) http2handlerDoneMsg = new(http2serverMessage) @@ -4903,6 +5137,8 @@ func (sc *http2serverConn) onSettingsTimer() { sc.sendServeMsg(http2settingsTime func (sc *http2serverConn) onIdleTimer() { sc.sendServeMsg(http2idleTimerMsg) } +func (sc *http2serverConn) onReadIdleTimer() { sc.sendServeMsg(http2readIdleTimerMsg) } + func (sc *http2serverConn) onShutdownTimer() { sc.sendServeMsg(http2shutdownTimerMsg) } func (sc *http2serverConn) sendServeMsg(msg interface{}) { @@ -5155,6 +5391,10 @@ func (sc *http2serverConn) wroteFrame(res http2frameWriteResult) { sc.writingFrame = false sc.writingFrameAsync = false + if res.err != nil { + sc.conn.Close() + } + wr := res.wr if http2writeEndsStream(wr.write) { @@ -5429,6 +5669,11 @@ func (sc *http2serverConn) processFrame(f http2Frame) error { func (sc *http2serverConn) processPing(f *http2PingFrame) error { sc.serveG.check() if f.IsAck() { + if sc.pingSent && sc.sentPingData == f.Data { + // This is a response to a PING we sent. + sc.pingSent = false + sc.readIdleTimer.Reset(sc.readIdleTimeout) + } // 6.7 PING: " An endpoint MUST NOT respond to PING frames // containing this flag." return nil @@ -5995,7 +6240,7 @@ func (sc *http2serverConn) newStream(id, pusherID uint32, state http2streamState st.cw.Init() st.flow.conn = &sc.flow // link to conn-level counter st.flow.add(sc.initialStreamSendWindowSize) - st.inflow.init(sc.srv.initialStreamRecvWindowSize()) + st.inflow.init(sc.initialStreamRecvWindowSize) if sc.hs.WriteTimeout > 0 { st.writeDeadline = sc.srv.afterFunc(sc.hs.WriteTimeout, st.onWriteTimeout) } @@ -6690,6 +6935,11 @@ func (w *http2responseWriter) SetWriteDeadline(deadline time.Time) error { return nil } +func (w *http2responseWriter) EnableFullDuplex() error { + // We always support full duplex responses, so this is a no-op. + return nil +} + func (w *http2responseWriter) Flush() { w.FlushError() } @@ -7136,7 +7386,7 @@ func (sc *http2serverConn) countError(name string, err error) error { if sc == nil || sc.srv == nil { return err } - f := sc.srv.CountError + f := sc.countErrorFunc if f == nil { return err } @@ -7339,6 +7589,20 @@ func (t *http2Transport) markNewGoroutine() { } } +func (t *http2Transport) now() time.Time { + if t != nil && t.http2transportTestHooks != nil { + return t.http2transportTestHooks.group.Now() + } + return time.Now() +} + +func (t *http2Transport) timeSince(when time.Time) time.Duration { + if t != nil && t.http2transportTestHooks != nil { + return t.now().Sub(when) + } + return time.Since(when) +} + // newTimer creates a new time.Timer, or a synthetic timer in tests. 
func (t *http2Transport) newTimer(d time.Duration) http2timer { if t.http2transportTestHooks != nil { @@ -7363,40 +7627,26 @@ func (t *http2Transport) contextWithTimeout(ctx context.Context, d time.Duration } func (t *http2Transport) maxHeaderListSize() uint32 { - if t.MaxHeaderListSize == 0 { + n := int64(t.MaxHeaderListSize) + if t.t1 != nil && t.t1.MaxResponseHeaderBytes != 0 { + n = t.t1.MaxResponseHeaderBytes + if n > 0 { + n = http2adjustHTTP1MaxHeaderSize(n) + } + } + if n <= 0 { return 10 << 20 } - if t.MaxHeaderListSize == 0xffffffff { + if n >= 0xffffffff { return 0 } - return t.MaxHeaderListSize -} - -func (t *http2Transport) maxFrameReadSize() uint32 { - if t.MaxReadFrameSize == 0 { - return 0 // use the default provided by the peer - } - if t.MaxReadFrameSize < http2minMaxFrameSize { - return http2minMaxFrameSize - } - if t.MaxReadFrameSize > http2maxFrameSize { - return http2maxFrameSize - } - return t.MaxReadFrameSize + return uint32(n) } func (t *http2Transport) disableCompression() bool { return t.DisableCompression || (t.t1 != nil && t.t1.DisableCompression) } -func (t *http2Transport) pingTimeout() time.Duration { - if t.PingTimeout == 0 { - return 15 * time.Second - } - return t.PingTimeout - -} - // ConfigureTransport configures a net/http HTTP/1 Transport to use HTTP/2. // It returns an error if t1 has already been HTTP/2-enabled. // @@ -7432,8 +7682,8 @@ func http2configureTransports(t1 *Transport) (*http2Transport, error) { if !http2strSliceContains(t1.TLSClientConfig.NextProtos, "http/1.1") { t1.TLSClientConfig.NextProtos = append(t1.TLSClientConfig.NextProtos, "http/1.1") } - upgradeFn := func(authority string, c *tls.Conn) RoundTripper { - addr := http2authorityAddr("https", authority) + upgradeFn := func(scheme, authority string, c net.Conn) RoundTripper { + addr := http2authorityAddr(scheme, authority) if used, err := connPool.addConnIfNeeded(addr, t2, c); err != nil { go c.Close() return http2erringRoundTripper{err} @@ -7444,18 +7694,37 @@ func http2configureTransports(t1 *Transport) (*http2Transport, error) { // was unknown) go c.Close() } + if scheme == "http" { + return (*http2unencryptedTransport)(t2) + } return t2 } - if m := t1.TLSNextProto; len(m) == 0 { - t1.TLSNextProto = map[string]func(string, *tls.Conn) RoundTripper{ - "h2": upgradeFn, + if t1.TLSNextProto == nil { + t1.TLSNextProto = make(map[string]func(string, *tls.Conn) RoundTripper) + } + t1.TLSNextProto[http2NextProtoTLS] = func(authority string, c *tls.Conn) RoundTripper { + return upgradeFn("https", authority, c) + } + // The "unencrypted_http2" TLSNextProto key is used to pass off non-TLS HTTP/2 conns. + t1.TLSNextProto[http2nextProtoUnencryptedHTTP2] = func(authority string, c *tls.Conn) RoundTripper { + nc, err := http2unencryptedNetConnFromTLSConn(c) + if err != nil { + go c.Close() + return http2erringRoundTripper{err} } - } else { - m["h2"] = upgradeFn + return upgradeFn("http", authority, nc) } return t2, nil } +// unencryptedTransport is a Transport with a RoundTrip method that +// always permits http:// URLs. 
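+//
+// upgradeFn returns it in place of the usual *http2Transport for
+// connections handed off with the "http" scheme, so that retried
+// requests keep permission to use unencrypted HTTP/2.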
+type http2unencryptedTransport http2Transport + +func (t *http2unencryptedTransport) RoundTrip(req *Request) (*Response, error) { + return (*http2Transport)(t).RoundTripOpt(req, http2RoundTripOpt{allowHTTP: true}) +} + func (t *http2Transport) connPool() http2ClientConnPool { t.connPoolOnce.Do(t.initConnPool) return t.connPoolOrDef @@ -7475,7 +7744,7 @@ type http2ClientConn struct { t *http2Transport tconn net.Conn // usually *tls.Conn, except specialized impls tlsState *tls.ConnectionState // nil only for specialized impls - reused uint32 // whether conn is being reused; atomic + atomicReused uint32 // whether conn is being reused; atomic singleUse bool // whether being used for a single http.Request getConnCalled bool // used by clientConnPool @@ -7506,11 +7775,22 @@ type http2ClientConn struct { lastActive time.Time lastIdle time.Time // time last idle // Settings from peer: (also guarded by wmu) - maxFrameSize uint32 - maxConcurrentStreams uint32 - peerMaxHeaderListSize uint64 - peerMaxHeaderTableSize uint32 - initialWindowSize uint32 + maxFrameSize uint32 + maxConcurrentStreams uint32 + peerMaxHeaderListSize uint64 + peerMaxHeaderTableSize uint32 + initialWindowSize uint32 + initialStreamRecvWindowSize int32 + readIdleTimeout time.Duration + pingTimeout time.Duration + + // pendingResets is the number of RST_STREAM frames we have sent to the peer, + // without confirming that the peer has received them. When we send a RST_STREAM, + // we bundle it with a PING frame, unless a PING is already in flight. We count + // the reset stream against the connection's concurrency limit until we get + // a PING response. This limits the number of requests we'll try to send to a + // completely unresponsive connection. + pendingResets int // reqHeaderMu is a 1-element semaphore channel controlling access to sending new requests. // Write to reqHeaderMu to lock it, read from it to unlock. @@ -7568,12 +7848,12 @@ type http2clientStream struct { sentHeaders bool // owned by clientConnReadLoop: - firstByte bool // got the first response byte - pastHeaders bool // got first MetaHeadersFrame (actual headers) - pastTrailers bool // got optional second MetaHeadersFrame (trailers) - num1xx uint8 // number of 1xx responses seen - readClosed bool // peer sent an END_STREAM flag - readAborted bool // read loop reset the stream + firstByte bool // got the first response byte + pastHeaders bool // got first MetaHeadersFrame (actual headers) + pastTrailers bool // got optional second MetaHeadersFrame (trailers) + readClosed bool // peer sent an END_STREAM flag + readAborted bool // read loop reset the stream + totalHeaderSize int64 // total size of 1xx headers seen trailer Header // accumulated trailers resTrailer *Header // client's Response.Trailer @@ -7635,6 +7915,7 @@ func (cs *http2clientStream) closeReqBodyLocked() { } type http2stickyErrWriter struct { + group http2synctestGroupInterface conn net.Conn timeout time.Duration err *error @@ -7644,22 +7925,9 @@ func (sew http2stickyErrWriter) Write(p []byte) (n int, err error) { if *sew.err != nil { return 0, *sew.err } - for { - if sew.timeout != 0 { - sew.conn.SetWriteDeadline(time.Now().Add(sew.timeout)) - } - nn, err := sew.conn.Write(p[n:]) - n += nn - if n < len(p) && nn > 0 && errors.Is(err, os.ErrDeadlineExceeded) { - // Keep extending the deadline so long as we're making progress. 
- continue - } - if sew.timeout != 0 { - sew.conn.SetWriteDeadline(time.Time{}) - } - *sew.err = err - return n, err - } + n, err = http2writeWithByteTimeout(sew.group, sew.conn, sew.timeout, p) + *sew.err = err + return n, err } // noCachedConnError is the concrete type of ErrNoCachedConn, which @@ -7691,6 +7959,8 @@ type http2RoundTripOpt struct { // no cached connection is available, RoundTripOpt // will return ErrNoCachedConn. OnlyCachedConn bool + + allowHTTP bool // allow http:// URLs } func (t *http2Transport) RoundTrip(req *Request) (*Response, error) { @@ -7723,7 +7993,14 @@ func http2authorityAddr(scheme string, authority string) (addr string) { // RoundTripOpt is like RoundTrip, but takes options. func (t *http2Transport) RoundTripOpt(req *Request, opt http2RoundTripOpt) (*Response, error) { - if !(req.URL.Scheme == "https" || (req.URL.Scheme == "http" && t.AllowHTTP)) { + switch req.URL.Scheme { + case "https": + // Always okay. + case "http": + if !t.AllowHTTP && !opt.allowHTTP { + return nil, errors.New("http2: unencrypted HTTP/2 not enabled") + } + default: return nil, errors.New("http2: unsupported scheme") } @@ -7734,7 +8011,7 @@ func (t *http2Transport) RoundTripOpt(req *Request, opt http2RoundTripOpt) (*Res t.vlogf("http2: Transport failed to get client conn for %s: %v", addr, err) return nil, err } - reused := !atomic.CompareAndSwapUint32(&cc.reused, 0, 1) + reused := !atomic.CompareAndSwapUint32(&cc.atomicReused, 0, 1) http2traceGotConn(req, cc, reused) res, err := cc.RoundTrip(req) if err != nil && retry <= 6 { @@ -7759,6 +8036,22 @@ func (t *http2Transport) RoundTripOpt(req *Request, opt http2RoundTripOpt) (*Res } } } + if err == http2errClientConnNotEstablished { + // This ClientConn was created recently, + // this is the first request to use it, + // and the connection is closed and not usable. + // + // In this state, cc.idleTimer will remove the conn from the pool + // when it fires. Stop the timer and remove it here so future requests + // won't try to use this connection. + // + // If the timer has already fired and we're racing it, the redundant + // call to MarkDead is harmless. 
+ if cc.idleTimer != nil { + cc.idleTimer.Stop() + } + t.connPool().MarkDead(cc) + } if err != nil { t.vlogf("RoundTrip failure: %v", err) return nil, err @@ -7777,9 +8070,10 @@ func (t *http2Transport) CloseIdleConnections() { } var ( - http2errClientConnClosed = errors.New("http2: client conn is closed") - http2errClientConnUnusable = errors.New("http2: client conn not usable") - http2errClientConnGotGoAway = errors.New("http2: Transport received Server's graceful shutdown GOAWAY") + http2errClientConnClosed = errors.New("http2: client conn is closed") + http2errClientConnUnusable = errors.New("http2: client conn not usable") + http2errClientConnNotEstablished = errors.New("http2: client conn could not be established") + http2errClientConnGotGoAway = errors.New("http2: Transport received Server's graceful shutdown GOAWAY") ) // shouldRetryRequest is called by RoundTrip when a request fails to get @@ -7895,44 +8189,37 @@ func (t *http2Transport) expectContinueTimeout() time.Duration { return t.t1.ExpectContinueTimeout } -func (t *http2Transport) maxDecoderHeaderTableSize() uint32 { - if v := t.MaxDecoderHeaderTableSize; v > 0 { - return v - } - return http2initialHeaderTableSize -} - -func (t *http2Transport) maxEncoderHeaderTableSize() uint32 { - if v := t.MaxEncoderHeaderTableSize; v > 0 { - return v - } - return http2initialHeaderTableSize -} - func (t *http2Transport) NewClientConn(c net.Conn) (*http2ClientConn, error) { return t.newClientConn(c, t.disableKeepAlives()) } func (t *http2Transport) newClientConn(c net.Conn, singleUse bool) (*http2ClientConn, error) { + conf := http2configFromTransport(t) cc := &http2ClientConn{ - t: t, - tconn: c, - readerDone: make(chan struct{}), - nextStreamID: 1, - maxFrameSize: 16 << 10, // spec default - initialWindowSize: 65535, // spec default - maxConcurrentStreams: http2initialMaxConcurrentStreams, // "infinite", per spec. Use a smaller value until we have received server settings. - peerMaxHeaderListSize: 0xffffffffffffffff, // "infinite", per spec. Use 2^64-1 instead. - streams: make(map[uint32]*http2clientStream), - singleUse: singleUse, - wantSettingsAck: true, - pings: make(map[[8]byte]chan struct{}), - reqHeaderMu: make(chan struct{}, 1), - } + t: t, + tconn: c, + readerDone: make(chan struct{}), + nextStreamID: 1, + maxFrameSize: 16 << 10, // spec default + initialWindowSize: 65535, // spec default + initialStreamRecvWindowSize: conf.MaxUploadBufferPerStream, + maxConcurrentStreams: http2initialMaxConcurrentStreams, // "infinite", per spec. Use a smaller value until we have received server settings. + peerMaxHeaderListSize: 0xffffffffffffffff, // "infinite", per spec. Use 2^64-1 instead. + streams: make(map[uint32]*http2clientStream), + singleUse: singleUse, + wantSettingsAck: true, + readIdleTimeout: conf.SendPingTimeout, + pingTimeout: conf.PingTimeout, + pings: make(map[[8]byte]chan struct{}), + reqHeaderMu: make(chan struct{}, 1), + lastActive: t.now(), + } + var group http2synctestGroupInterface if t.http2transportTestHooks != nil { t.markNewGoroutine() t.http2transportTestHooks.newclientconn(cc) c = cc.tconn + group = t.group } if http2VerboseLogs { t.vlogf("http2: Transport creating client conn %p to %v", cc, c.RemoteAddr()) @@ -7944,24 +8231,23 @@ func (t *http2Transport) newClientConn(c net.Conn, singleUse bool) (*http2Client // TODO: adjust this writer size to account for frame size + // MTU + crypto/tls record padding. 
cc.bw = bufio.NewWriter(http2stickyErrWriter{ + group: group, conn: c, - timeout: t.WriteByteTimeout, + timeout: conf.WriteByteTimeout, err: &cc.werr, }) cc.br = bufio.NewReader(c) cc.fr = http2NewFramer(cc.bw, cc.br) - if t.maxFrameReadSize() != 0 { - cc.fr.SetMaxReadFrameSize(t.maxFrameReadSize()) - } + cc.fr.SetMaxReadFrameSize(conf.MaxReadFrameSize) if t.CountError != nil { cc.fr.countError = t.CountError } - maxHeaderTableSize := t.maxDecoderHeaderTableSize() + maxHeaderTableSize := conf.MaxDecoderHeaderTableSize cc.fr.ReadMetaHeaders = hpack.NewDecoder(maxHeaderTableSize, nil) cc.fr.MaxHeaderListSize = t.maxHeaderListSize() cc.henc = hpack.NewEncoder(&cc.hbuf) - cc.henc.SetMaxDynamicTableSizeLimit(t.maxEncoderHeaderTableSize()) + cc.henc.SetMaxDynamicTableSizeLimit(conf.MaxEncoderHeaderTableSize) cc.peerMaxHeaderTableSize = http2initialHeaderTableSize if cs, ok := c.(http2connectionStater); ok { @@ -7971,11 +8257,9 @@ func (t *http2Transport) newClientConn(c net.Conn, singleUse bool) (*http2Client initialSettings := []http2Setting{ {ID: http2SettingEnablePush, Val: 0}, - {ID: http2SettingInitialWindowSize, Val: http2transportDefaultStreamFlow}, - } - if max := t.maxFrameReadSize(); max != 0 { - initialSettings = append(initialSettings, http2Setting{ID: http2SettingMaxFrameSize, Val: max}) + {ID: http2SettingInitialWindowSize, Val: uint32(cc.initialStreamRecvWindowSize)}, } + initialSettings = append(initialSettings, http2Setting{ID: http2SettingMaxFrameSize, Val: conf.MaxReadFrameSize}) if max := t.maxHeaderListSize(); max != 0 { initialSettings = append(initialSettings, http2Setting{ID: http2SettingMaxHeaderListSize, Val: max}) } @@ -7985,8 +8269,8 @@ func (t *http2Transport) newClientConn(c net.Conn, singleUse bool) (*http2Client cc.bw.Write(http2clientPreface) cc.fr.WriteSettings(initialSettings...) - cc.fr.WriteWindowUpdate(0, http2transportDefaultConnFlow) - cc.inflow.init(http2transportDefaultConnFlow + http2initialWindowSize) + cc.fr.WriteWindowUpdate(0, uint32(conf.MaxUploadBufferPerConnection)) + cc.inflow.init(conf.MaxUploadBufferPerConnection + http2initialWindowSize) cc.bw.Flush() if cc.werr != nil { cc.Close() @@ -8004,7 +8288,7 @@ func (t *http2Transport) newClientConn(c net.Conn, singleUse bool) (*http2Client } func (cc *http2ClientConn) healthCheck() { - pingTimeout := cc.t.pingTimeout() + pingTimeout := cc.pingTimeout // We don't need to periodically ping in the health check, because the readLoop of ClientConn will // trigger the healthCheck again if there is no frame received. ctx, cancel := cc.t.contextWithTimeout(context.Background(), pingTimeout) @@ -8132,7 +8416,7 @@ func (cc *http2ClientConn) State() http2ClientConnState { return http2ClientConnState{ Closed: cc.closed, Closing: cc.closing || cc.singleUse || cc.doNotReuse || cc.goAway != nil, - StreamsActive: len(cc.streams), + StreamsActive: len(cc.streams) + cc.pendingResets, StreamsReserved: cc.streamsReserved, StreamsPending: cc.pendingRequests, LastIdle: cc.lastIdle, @@ -8164,16 +8448,38 @@ func (cc *http2ClientConn) idleStateLocked() (st http2clientConnIdleState) { // writing it. maxConcurrentOkay = true } else { - maxConcurrentOkay = int64(len(cc.streams)+cc.streamsReserved+1) <= int64(cc.maxConcurrentStreams) + // We can take a new request if the total of + // - active streams; + // - reservation slots for new streams; and + // - streams for which we have sent a RST_STREAM and a PING, + // but received no subsequent frame + // is less than the concurrency limit. 
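+		// For example (illustrative): with maxConcurrentStreams = 100,
+		// 90 active streams, 5 reserved slots, and 5 unacknowledged
+		// RST_STREAM+PING pairs fill the limit, and this connection
+		// reports that it cannot take another request until a PING
+		// response clears pendingResets.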
+ maxConcurrentOkay = cc.currentRequestCountLocked() < int(cc.maxConcurrentStreams) } st.canTakeNewRequest = cc.goAway == nil && !cc.closed && !cc.closing && maxConcurrentOkay && !cc.doNotReuse && int64(cc.nextStreamID)+2*int64(cc.pendingRequests) < math.MaxInt32 && !cc.tooIdleLocked() + + // If this connection has never been used for a request and is closed, + // then let it take a request (which will fail). + // + // This avoids a situation where an error early in a connection's lifetime + // goes unreported. + if cc.nextStreamID == 1 && cc.streamsReserved == 0 && cc.closed { + st.canTakeNewRequest = true + } + return } +// currentRequestCountLocked reports the number of concurrency slots currently in use, +// including active streams, reserved slots, and reset streams waiting for acknowledgement. +func (cc *http2ClientConn) currentRequestCountLocked() int { + return len(cc.streams) + cc.streamsReserved + cc.pendingResets +} + func (cc *http2ClientConn) canTakeNewRequestLocked() bool { st := cc.idleStateLocked() return st.canTakeNewRequest @@ -8186,7 +8492,7 @@ func (cc *http2ClientConn) tooIdleLocked() bool { // times are compared based on their wall time. We don't want // to reuse a connection that's been sitting idle during // VM/laptop suspend if monotonic time was also frozen. - return cc.idleTimeout != 0 && !cc.lastIdle.IsZero() && time.Since(cc.lastIdle.Round(0)) > cc.idleTimeout + return cc.idleTimeout != 0 && !cc.lastIdle.IsZero() && cc.t.timeSince(cc.lastIdle.Round(0)) > cc.idleTimeout } // onIdleTimeout is called from a time.AfterFunc goroutine. It will @@ -8750,6 +9056,7 @@ func (cs *http2clientStream) cleanupWriteRequest(err error) { cs.reqBodyClosed = make(chan struct{}) } bodyClosed := cs.reqBodyClosed + closeOnIdle := cc.singleUse || cc.doNotReuse || cc.t.disableKeepAlives() || cc.goAway != nil cc.mu.Unlock() if mustCloseBody { cs.reqBody.Close() @@ -8774,16 +9081,40 @@ func (cs *http2clientStream) cleanupWriteRequest(err error) { if cs.sentHeaders { if se, ok := err.(http2StreamError); ok { if se.Cause != http2errFromPeer { - cc.writeStreamReset(cs.ID, se.Code, err) + cc.writeStreamReset(cs.ID, se.Code, false, err) } } else { - cc.writeStreamReset(cs.ID, http2ErrCodeCancel, err) + // We're cancelling an in-flight request. + // + // This could be due to the server becoming unresponsive. + // To avoid sending too many requests on a dead connection, + // we let the request continue to consume a concurrency slot + // until we can confirm the server is still responding. + // We do this by sending a PING frame along with the RST_STREAM + // (unless a ping is already in flight). + // + // For simplicity, we don't bother tracking the PING payload: + // We reset cc.pendingResets any time we receive a PING ACK. + // + // We skip this if the conn is going to be closed on idle, + // because it's short lived and will probably be closed before + // we get the ping response. + ping := false + if !closeOnIdle { + cc.mu.Lock() + if cc.pendingResets == 0 { + ping = true + } + cc.pendingResets++ + cc.mu.Unlock() + } + cc.writeStreamReset(cs.ID, http2ErrCodeCancel, ping, err) } } cs.bufPipe.CloseWithError(err) // no-op if already closed } else { if cs.sentHeaders && !cs.sentEndStream { - cc.writeStreamReset(cs.ID, http2ErrCodeNo, nil) + cc.writeStreamReset(cs.ID, http2ErrCodeNo, false, nil) } cs.bufPipe.CloseWithError(http2errRequestCanceled) } @@ -8805,12 +9136,17 @@ func (cs *http2clientStream) cleanupWriteRequest(err error) { // Must hold cc.mu. 
func (cc *http2ClientConn) awaitOpenSlotForStreamLocked(cs *http2clientStream) error { for { - cc.lastActive = time.Now() + if cc.closed && cc.nextStreamID == 1 && cc.streamsReserved == 0 { + // This is the very first request sent to this connection. + // Return a fatal error which aborts the retry loop. + return http2errClientConnNotEstablished + } + cc.lastActive = cc.t.now() if cc.closed || !cc.canTakeNewRequestLocked() { return http2errClientConnUnusable } cc.lastIdle = time.Time{} - if int64(len(cc.streams)) < int64(cc.maxConcurrentStreams) { + if cc.currentRequestCountLocked() < int(cc.maxConcurrentStreams) { return nil } cc.pendingRequests++ @@ -9337,7 +9673,7 @@ type http2resAndError struct { func (cc *http2ClientConn) addStreamLocked(cs *http2clientStream) { cs.flow.add(int32(cc.initialWindowSize)) cs.flow.setConnFlow(&cc.flow) - cs.inflow.init(http2transportDefaultStreamFlow) + cs.inflow.init(cc.initialStreamRecvWindowSize) cs.ID = cc.nextStreamID cc.nextStreamID += 2 cc.streams[cs.ID] = cs @@ -9353,10 +9689,10 @@ func (cc *http2ClientConn) forgetStreamID(id uint32) { if len(cc.streams) != slen-1 { panic("forgetting unknown stream id") } - cc.lastActive = time.Now() + cc.lastActive = cc.t.now() if len(cc.streams) == 0 && cc.idleTimer != nil { cc.idleTimer.Reset(cc.idleTimeout) - cc.lastIdle = time.Now() + cc.lastIdle = cc.t.now() } // Wake up writeRequestBody via clientStream.awaitFlowControl and // wake up RoundTrip if there is a pending request. @@ -9416,7 +9752,6 @@ func http2isEOFOrNetReadError(err error) bool { func (rl *http2clientConnReadLoop) cleanup() { cc := rl.cc - cc.t.connPool().MarkDead(cc) defer cc.closeConn() defer close(cc.readerDone) @@ -9440,6 +9775,24 @@ func (rl *http2clientConnReadLoop) cleanup() { } cc.closed = true + // If the connection has never been used, and has been open for only a short time, + // leave it in the connection pool for a little while. + // + // This avoids a situation where new connections are constantly created, + // added to the pool, fail, and are removed from the pool, without any error + // being surfaced to the user. + const unusedWaitTime = 5 * time.Second + idleTime := cc.t.now().Sub(cc.lastActive) + if atomic.LoadUint32(&cc.atomicReused) == 0 && idleTime < unusedWaitTime { + cc.idleTimer = cc.t.afterFunc(unusedWaitTime-idleTime, func() { + cc.t.connPool().MarkDead(cc) + }) + } else { + cc.mu.Unlock() // avoid any deadlocks in MarkDead + cc.t.connPool().MarkDead(cc) + cc.mu.Lock() + } + for _, cs := range cc.streams { select { case <-cs.peerClosed: @@ -9483,7 +9836,7 @@ func (cc *http2ClientConn) countReadFrameError(err error) { func (rl *http2clientConnReadLoop) run() error { cc := rl.cc gotSettings := false - readIdleTimeout := cc.t.ReadIdleTimeout + readIdleTimeout := cc.readIdleTimeout var t http2timer if readIdleTimeout != 0 { t = cc.t.afterFunc(readIdleTimeout, cc.healthCheck) @@ -9667,15 +10020,34 @@ func (rl *http2clientConnReadLoop) handleResponse(cs *http2clientStream, f *http if f.StreamEnded() { return nil, errors.New("1xx informational response with END_STREAM flag") } - cs.num1xx++ - const max1xxResponses = 5 // arbitrary bound on number of informational responses, same as net/http - if cs.num1xx > max1xxResponses { - return nil, errors.New("http2: too many 1xx informational responses") - } if fn := cs.get1xxTraceFunc(); fn != nil { + // If the 1xx response is being delivered to the user, + // then they're responsible for limiting the number + // of responses. 
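+			// (fn is typically the request's
+			// httptrace.ClientTrace.Got1xxResponse hook; an error
+			// returned from it aborts the request.)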
if err := fn(statusCode, textproto.MIMEHeader(header)); err != nil { return nil, err } + } else { + // If the user didn't examine the 1xx response, then we + // limit the size of all 1xx headers. + // + // This differs a bit from the HTTP/1 implementation, which + // limits the size of all 1xx headers plus the final response. + // Use the larger limit of MaxHeaderListSize and + // net/http.Transport.MaxResponseHeaderBytes. + limit := int64(cs.cc.t.maxHeaderListSize()) + if t1 := cs.cc.t.t1; t1 != nil && t1.MaxResponseHeaderBytes > limit { + limit = t1.MaxResponseHeaderBytes + } + for _, h := range f.Fields { + cs.totalHeaderSize += int64(h.Size()) + } + if cs.totalHeaderSize > limit { + if http2VerboseLogs { + log.Printf("http2: 1xx informational responses too large") + } + return nil, errors.New("header list too large") + } } if statusCode == 100 { http2traceGot100Continue(cs.trace) @@ -10219,6 +10591,11 @@ func (rl *http2clientConnReadLoop) processPing(f *http2PingFrame) error { close(c) delete(cc.pings, f.Data) } + if cc.pendingResets > 0 { + // See clientStream.cleanupWriteRequest. + cc.pendingResets = 0 + cc.cond.Broadcast() + } return nil } cc := rl.cc @@ -10241,13 +10618,20 @@ func (rl *http2clientConnReadLoop) processPushPromise(f *http2PushPromiseFrame) return http2ConnectionError(http2ErrCodeProtocol) } -func (cc *http2ClientConn) writeStreamReset(streamID uint32, code http2ErrCode, err error) { +// writeStreamReset sends a RST_STREAM frame. +// When ping is true, it also sends a PING frame with a random payload. +func (cc *http2ClientConn) writeStreamReset(streamID uint32, code http2ErrCode, ping bool, err error) { // TODO: map err to more interesting error codes, once the // HTTP community comes up with some. But currently for // RST_STREAM there's no equivalent to GOAWAY frame's debug // data, and the error codes are all pretty vague ("cancel"). cc.wmu.Lock() cc.fr.WriteRSTStream(streamID, code) + if ping { + var payload [8]byte + rand.Read(payload[:]) + cc.fr.WritePing(false, payload) + } cc.bw.Flush() cc.wmu.Unlock() } @@ -10404,7 +10788,7 @@ func http2traceGotConn(req *Request, cc *http2ClientConn, reused bool) { cc.mu.Lock() ci.WasIdle = len(cc.streams) == 0 && reused if ci.WasIdle && !cc.lastActive.IsZero() { - ci.IdleTime = time.Since(cc.lastActive) + ci.IdleTime = cc.t.timeSince(cc.lastActive) } cc.mu.Unlock() @@ -10472,6 +10856,27 @@ func (t *http2Transport) dialTLSWithContext(ctx context.Context, network, addr s return tlsCn, nil } +const http2nextProtoUnencryptedHTTP2 = "unencrypted_http2" + +// unencryptedNetConnFromTLSConn retrieves a net.Conn wrapped in a *tls.Conn. +// +// TLSNextProto functions accept a *tls.Conn. +// +// When passing an unencrypted HTTP/2 connection to a TLSNextProto function, +// we pass a *tls.Conn with an underlying net.Conn containing the unencrypted connection. +// To be extra careful about mistakes (accidentally dropping TLS encryption in a place +// where we want it), the tls.Conn contains a net.Conn with an UnencryptedNetConn method +// that returns the actual connection we want to use. +func http2unencryptedNetConnFromTLSConn(tc *tls.Conn) (net.Conn, error) { + conner, ok := tc.NetConn().(interface { + UnencryptedNetConn() net.Conn + }) + if !ok { + return nil, errors.New("http2: TLS conn unexpectedly found in unencrypted handoff") + } + return conner.UnencryptedNetConn(), nil +} + // writeFramer is implemented by any type that is used to write frames. 
 // writeFramer is implemented by any type that is used to write frames.
 type http2writeFramer interface {
 	writeFrame(http2writeContext) error
@@ -10588,6 +10993,18 @@ func (se http2StreamError) writeFrame(ctx http2writeContext) error {
 
 func (se http2StreamError) staysWithinBuffer(max int) bool { return http2frameHeaderLen+4 <= max }
 
+type http2writePing struct {
+	data [8]byte
+}
+
+func (w http2writePing) writeFrame(ctx http2writeContext) error {
+	return ctx.Framer().WritePing(false, w.data)
+}
+
+func (w http2writePing) staysWithinBuffer(max int) bool {
+	return http2frameHeaderLen+len(w.data) <= max
+}
+
 type http2writePingAck struct{ pf *http2PingFrame }
 
 func (w http2writePingAck) writeFrame(ctx http2writeContext) error {
diff --git a/src/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go b/src/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go
index db42e6676ab35b..c709b728477d10 100644
--- a/src/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go
+++ b/src/vendor/golang.org/x/crypto/chacha20/chacha_noasm.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build (!arm64 && !s390x && !ppc64le) || !gc || purego
+//go:build (!arm64 && !s390x && !ppc64 && !ppc64le) || !gc || purego
 
 package chacha20
 
diff --git a/src/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.go b/src/vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.go
similarity index 89%
rename from src/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.go
rename to src/vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.go
index 3a4287f9900e40..bd183d9ba1243e 100644
--- a/src/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.go
+++ b/src/vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build gc && !purego
+//go:build gc && !purego && (ppc64 || ppc64le)
 
 package chacha20
 
diff --git a/src/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s b/src/vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.s
similarity index 76%
rename from src/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s
rename to src/vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.s
index c672ccf6986b08..a660b4112fafd9 100644
--- a/src/vendor/golang.org/x/crypto/chacha20/chacha_ppc64le.s
+++ b/src/vendor/golang.org/x/crypto/chacha20/chacha_ppc64x.s
@@ -19,7 +19,7 @@
 // The differences in this and the original implementation are
 // due to the calling conventions and initialization of constants.
 
-//go:build gc && !purego
+//go:build gc && !purego && (ppc64 || ppc64le)
 
 #include "textflag.h"
 
@@ -36,32 +36,68 @@
 // for VPERMXOR
 #define MASK	R18
 
-DATA consts<>+0x00(SB)/8, $0x3320646e61707865
-DATA consts<>+0x08(SB)/8, $0x6b20657479622d32
-DATA consts<>+0x10(SB)/8, $0x0000000000000001
-DATA consts<>+0x18(SB)/8, $0x0000000000000000
-DATA consts<>+0x20(SB)/8, $0x0000000000000004
-DATA consts<>+0x28(SB)/8, $0x0000000000000000
-DATA consts<>+0x30(SB)/8, $0x0a0b08090e0f0c0d
-DATA consts<>+0x38(SB)/8, $0x0203000106070405
-DATA consts<>+0x40(SB)/8, $0x090a0b080d0e0f0c
-DATA consts<>+0x48(SB)/8, $0x0102030005060704
-DATA consts<>+0x50(SB)/8, $0x6170786561707865
-DATA consts<>+0x58(SB)/8, $0x6170786561707865
-DATA consts<>+0x60(SB)/8, $0x3320646e3320646e
-DATA consts<>+0x68(SB)/8, $0x3320646e3320646e
-DATA consts<>+0x70(SB)/8, $0x79622d3279622d32
-DATA consts<>+0x78(SB)/8, $0x79622d3279622d32
-DATA consts<>+0x80(SB)/8, $0x6b2065746b206574
-DATA consts<>+0x88(SB)/8, $0x6b2065746b206574
-DATA consts<>+0x90(SB)/8, $0x0000000100000000
-DATA consts<>+0x98(SB)/8, $0x0000000300000002
-DATA consts<>+0xa0(SB)/8, $0x5566774411223300
-DATA consts<>+0xa8(SB)/8, $0xddeeffcc99aabb88
-DATA consts<>+0xb0(SB)/8, $0x6677445522330011
-DATA consts<>+0xb8(SB)/8, $0xeeffccddaabb8899
+DATA consts<>+0x00(SB)/4, $0x61707865
+DATA consts<>+0x04(SB)/4, $0x3320646e
+DATA consts<>+0x08(SB)/4, $0x79622d32
+DATA consts<>+0x0c(SB)/4, $0x6b206574
+DATA consts<>+0x10(SB)/4, $0x00000001
+DATA consts<>+0x14(SB)/4, $0x00000000
+DATA consts<>+0x18(SB)/4, $0x00000000
+DATA consts<>+0x1c(SB)/4, $0x00000000
+DATA consts<>+0x20(SB)/4, $0x00000004
+DATA consts<>+0x24(SB)/4, $0x00000000
+DATA consts<>+0x28(SB)/4, $0x00000000
+DATA consts<>+0x2c(SB)/4, $0x00000000
+DATA consts<>+0x30(SB)/4, $0x0e0f0c0d
+DATA consts<>+0x34(SB)/4, $0x0a0b0809
+DATA consts<>+0x38(SB)/4, $0x06070405
+DATA consts<>+0x3c(SB)/4, $0x02030001
+DATA consts<>+0x40(SB)/4, $0x0d0e0f0c
+DATA consts<>+0x44(SB)/4, $0x090a0b08
+DATA consts<>+0x48(SB)/4, $0x05060704
+DATA consts<>+0x4c(SB)/4, $0x01020300
+DATA consts<>+0x50(SB)/4, $0x61707865
+DATA consts<>+0x54(SB)/4, $0x61707865
+DATA consts<>+0x58(SB)/4, $0x61707865
+DATA consts<>+0x5c(SB)/4, $0x61707865
+DATA consts<>+0x60(SB)/4, $0x3320646e
+DATA consts<>+0x64(SB)/4, $0x3320646e
+DATA consts<>+0x68(SB)/4, $0x3320646e
+DATA consts<>+0x6c(SB)/4, $0x3320646e
+DATA consts<>+0x70(SB)/4, $0x79622d32
+DATA consts<>+0x74(SB)/4, $0x79622d32
+DATA consts<>+0x78(SB)/4, $0x79622d32
+DATA consts<>+0x7c(SB)/4, $0x79622d32
+DATA consts<>+0x80(SB)/4, $0x6b206574
+DATA consts<>+0x84(SB)/4, $0x6b206574
+DATA consts<>+0x88(SB)/4, $0x6b206574
+DATA consts<>+0x8c(SB)/4, $0x6b206574
+DATA consts<>+0x90(SB)/4, $0x00000000
+DATA consts<>+0x94(SB)/4, $0x00000001
+DATA consts<>+0x98(SB)/4, $0x00000002
+DATA consts<>+0x9c(SB)/4, $0x00000003
+DATA consts<>+0xa0(SB)/4, $0x11223300
+DATA consts<>+0xa4(SB)/4, $0x55667744
+DATA consts<>+0xa8(SB)/4, $0x99aabb88
+DATA consts<>+0xac(SB)/4, $0xddeeffcc
+DATA consts<>+0xb0(SB)/4, $0x22330011
+DATA consts<>+0xb4(SB)/4, $0x66774455
+DATA consts<>+0xb8(SB)/4, $0xaabb8899
+DATA consts<>+0xbc(SB)/4, $0xeeffccdd
 GLOBL consts<>(SB), RODATA, $0xc0
 
+#ifdef GOARCH_ppc64
+#define BE_XXBRW_INIT() \
+	LVSL (R0)(R0), V24 \
+	VSPLTISB $3, V25 \
+	VXOR V24, V25, V24 \
+
+#define BE_XXBRW(vr) VPERM vr, vr, V24, vr
+#else
+#define BE_XXBRW_INIT()
+#define BE_XXBRW(vr)
+#endif
+
 //func chaCha20_ctr32_vsx(out, inp *byte, len int, key *[8]uint32, counter *uint32)
 TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
 	MOVD out+0(FP), OUT
@@ -94,6 +130,8 @@ TEXT ·chaCha20_ctr32_vsx(SB),NOSPLIT,$64-40
 	// Clear V27
 	VXOR V27, V27, V27
 
+	BE_XXBRW_INIT()
+
 	// V28
 	LXVW4X (CONSTBASE)(R11), VS60
 
@@ -299,6 +337,11 @@ loop_vsx:
 	VADDUWM V8, V18, V8
 	VADDUWM V12, V19, V12
 
+	BE_XXBRW(V0)
+	BE_XXBRW(V4)
+	BE_XXBRW(V8)
+	BE_XXBRW(V12)
+
 	CMPU LEN, $64
 	BLT tail_vsx
 
@@ -327,6 +370,11 @@ loop_vsx:
 	VADDUWM V9, V18, V8
 	VADDUWM V13, V19, V12
 
+	BE_XXBRW(V0)
+	BE_XXBRW(V4)
+	BE_XXBRW(V8)
+	BE_XXBRW(V12)
+
 	CMPU LEN, $64
 	BLT tail_vsx
 
@@ -334,8 +382,8 @@ loop_vsx:
 	LXVW4X (INP)(R8), VS60
 	LXVW4X (INP)(R9), VS61
 	LXVW4X (INP)(R10), VS62
-	VXOR V27, V0, V27
 
+	VXOR V27, V0, V27
 	VXOR V28, V4, V28
 	VXOR V29, V8, V29
 	VXOR V30, V12, V30
@@ -354,6 +402,11 @@ loop_vsx:
 	VADDUWM V10, V18, V8
 	VADDUWM V14, V19, V12
 
+	BE_XXBRW(V0)
+	BE_XXBRW(V4)
+	BE_XXBRW(V8)
+	BE_XXBRW(V12)
+
 	CMPU LEN, $64
 	BLT tail_vsx
 
@@ -381,6 +434,11 @@ loop_vsx:
 	VADDUWM V11, V18, V8
 	VADDUWM V15, V19, V12
 
+	BE_XXBRW(V0)
+	BE_XXBRW(V4)
+	BE_XXBRW(V8)
+	BE_XXBRW(V12)
+
 	CMPU LEN, $64
 	BLT tail_vsx
 
@@ -408,9 +466,9 @@ loop_vsx:
 
 done_vsx:
 	// Increment counter by number of 64 byte blocks
-	MOVD (CNT), R14
+	MOVWZ (CNT), R14
 	ADD BLOCKS, R14
-	MOVD R14, (CNT)
+	MOVWZ R14, (CNT)
 	RET
 
 tail_vsx:
diff --git a/src/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s b/src/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s
index 731d2ac6dbc11d..fd5ee845f9faac 100644
--- a/src/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s
+++ b/src/vendor/golang.org/x/crypto/chacha20poly1305/chacha20poly1305_amd64.s
@@ -1,2715 +1,9762 @@
-// Copyright 2016 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
-
-// This file was originally from https://golang.org/cl/24717 by Vlad Krasnov of CloudFlare.
+// Code generated by command: go run chacha20poly1305_amd64_asm.go -out ../chacha20poly1305_amd64.s -pkg chacha20poly1305. DO NOT EDIT.
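The new header marks this file as machine-generated: the hand-written, macro-based listing being removed below was replaced by the output of an avo generator (the chacha20poly1305_amd64_asm.go program named in the header), which is why every macro in the new listing appears fully expanded and every immediate is spelled out in hex. For orientation, an avo generator is an ordinary Go program that emits Plan 9 assembly; the sketch below is avo's canonical add example from github.com/mmcloughlin/avo, not the actual chacha20poly1305 generator:

//go:build ignore

// Minimal avo generator sketch. Running it writes expanded assembly to the
// file named by -out, mirroring the command recorded in the header above.
package main

import . "github.com/mmcloughlin/avo/build"

func main() {
	TEXT("Add", NOSPLIT, "func(x, y uint64) uint64")
	Doc("Add adds x and y.")
	x := Load(Param("x"), GP64()) // load args into general-purpose registers
	y := Load(Param("y"), GP64())
	ADDQ(x, y)
	Store(y, ReturnIndex(0)) // store the sum into the return slot
	RET()
	Generate()
}

The trade-off visible in the rest of this diff is deliberate: the generated form is far longer, but it can be regenerated mechanically and reviewed against its Go source instead of being edited by hand.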
//go:build gc && !purego #include "textflag.h" -// General register allocation -#define oup DI -#define inp SI -#define inl BX -#define adp CX // free to reuse, after we hash the additional data -#define keyp R8 // free to reuse, when we copy the key to stack -#define itr2 R9 // general iterator -#define itr1 CX // general iterator -#define acc0 R10 -#define acc1 R11 -#define acc2 R12 -#define t0 R13 -#define t1 R14 -#define t2 R15 -#define t3 R8 -// Register and stack allocation for the SSE code -#define rStore (0*16)(BP) -#define sStore (1*16)(BP) -#define state1Store (2*16)(BP) -#define state2Store (3*16)(BP) -#define tmpStore (4*16)(BP) -#define ctr0Store (5*16)(BP) -#define ctr1Store (6*16)(BP) -#define ctr2Store (7*16)(BP) -#define ctr3Store (8*16)(BP) -#define A0 X0 -#define A1 X1 -#define A2 X2 -#define B0 X3 -#define B1 X4 -#define B2 X5 -#define C0 X6 -#define C1 X7 -#define C2 X8 -#define D0 X9 -#define D1 X10 -#define D2 X11 -#define T0 X12 -#define T1 X13 -#define T2 X14 -#define T3 X15 -#define A3 T0 -#define B3 T1 -#define C3 T2 -#define D3 T3 -// Register and stack allocation for the AVX2 code -#define rsStoreAVX2 (0*32)(BP) -#define state1StoreAVX2 (1*32)(BP) -#define state2StoreAVX2 (2*32)(BP) -#define ctr0StoreAVX2 (3*32)(BP) -#define ctr1StoreAVX2 (4*32)(BP) -#define ctr2StoreAVX2 (5*32)(BP) -#define ctr3StoreAVX2 (6*32)(BP) -#define tmpStoreAVX2 (7*32)(BP) // 256 bytes on stack -#define AA0 Y0 -#define AA1 Y5 -#define AA2 Y6 -#define AA3 Y7 -#define BB0 Y14 -#define BB1 Y9 -#define BB2 Y10 -#define BB3 Y11 -#define CC0 Y12 -#define CC1 Y13 -#define CC2 Y8 -#define CC3 Y15 -#define DD0 Y4 -#define DD1 Y1 -#define DD2 Y2 -#define DD3 Y3 -#define TT0 DD3 -#define TT1 AA3 -#define TT2 BB3 -#define TT3 CC3 -// ChaCha20 constants -DATA ·chacha20Constants<>+0x00(SB)/4, $0x61707865 -DATA ·chacha20Constants<>+0x04(SB)/4, $0x3320646e -DATA ·chacha20Constants<>+0x08(SB)/4, $0x79622d32 -DATA ·chacha20Constants<>+0x0c(SB)/4, $0x6b206574 -DATA ·chacha20Constants<>+0x10(SB)/4, $0x61707865 -DATA ·chacha20Constants<>+0x14(SB)/4, $0x3320646e -DATA ·chacha20Constants<>+0x18(SB)/4, $0x79622d32 -DATA ·chacha20Constants<>+0x1c(SB)/4, $0x6b206574 -// <<< 16 with PSHUFB -DATA ·rol16<>+0x00(SB)/8, $0x0504070601000302 -DATA ·rol16<>+0x08(SB)/8, $0x0D0C0F0E09080B0A -DATA ·rol16<>+0x10(SB)/8, $0x0504070601000302 -DATA ·rol16<>+0x18(SB)/8, $0x0D0C0F0E09080B0A -// <<< 8 with PSHUFB -DATA ·rol8<>+0x00(SB)/8, $0x0605040702010003 -DATA ·rol8<>+0x08(SB)/8, $0x0E0D0C0F0A09080B -DATA ·rol8<>+0x10(SB)/8, $0x0605040702010003 -DATA ·rol8<>+0x18(SB)/8, $0x0E0D0C0F0A09080B - -DATA ·avx2InitMask<>+0x00(SB)/8, $0x0 -DATA ·avx2InitMask<>+0x08(SB)/8, $0x0 -DATA ·avx2InitMask<>+0x10(SB)/8, $0x1 -DATA ·avx2InitMask<>+0x18(SB)/8, $0x0 - -DATA ·avx2IncMask<>+0x00(SB)/8, $0x2 -DATA ·avx2IncMask<>+0x08(SB)/8, $0x0 -DATA ·avx2IncMask<>+0x10(SB)/8, $0x2 -DATA ·avx2IncMask<>+0x18(SB)/8, $0x0 -// Poly1305 key clamp -DATA ·polyClampMask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF -DATA ·polyClampMask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC -DATA ·polyClampMask<>+0x10(SB)/8, $0xFFFFFFFFFFFFFFFF -DATA ·polyClampMask<>+0x18(SB)/8, $0xFFFFFFFFFFFFFFFF - -DATA ·sseIncMask<>+0x00(SB)/8, $0x1 -DATA ·sseIncMask<>+0x08(SB)/8, $0x0 -// To load/store the last < 16 bytes in a buffer -DATA ·andMask<>+0x00(SB)/8, $0x00000000000000ff -DATA ·andMask<>+0x08(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x10(SB)/8, $0x000000000000ffff -DATA ·andMask<>+0x18(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x20(SB)/8, $0x0000000000ffffff -DATA ·andMask<>+0x28(SB)/8, 
$0x0000000000000000 -DATA ·andMask<>+0x30(SB)/8, $0x00000000ffffffff -DATA ·andMask<>+0x38(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x40(SB)/8, $0x000000ffffffffff -DATA ·andMask<>+0x48(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x50(SB)/8, $0x0000ffffffffffff -DATA ·andMask<>+0x58(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x60(SB)/8, $0x00ffffffffffffff -DATA ·andMask<>+0x68(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x70(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0x78(SB)/8, $0x0000000000000000 -DATA ·andMask<>+0x80(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0x88(SB)/8, $0x00000000000000ff -DATA ·andMask<>+0x90(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0x98(SB)/8, $0x000000000000ffff -DATA ·andMask<>+0xa0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xa8(SB)/8, $0x0000000000ffffff -DATA ·andMask<>+0xb0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xb8(SB)/8, $0x00000000ffffffff -DATA ·andMask<>+0xc0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xc8(SB)/8, $0x000000ffffffffff -DATA ·andMask<>+0xd0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xd8(SB)/8, $0x0000ffffffffffff -DATA ·andMask<>+0xe0(SB)/8, $0xffffffffffffffff -DATA ·andMask<>+0xe8(SB)/8, $0x00ffffffffffffff - -GLOBL ·chacha20Constants<>(SB), (NOPTR+RODATA), $32 -GLOBL ·rol16<>(SB), (NOPTR+RODATA), $32 -GLOBL ·rol8<>(SB), (NOPTR+RODATA), $32 -GLOBL ·sseIncMask<>(SB), (NOPTR+RODATA), $16 -GLOBL ·avx2IncMask<>(SB), (NOPTR+RODATA), $32 -GLOBL ·avx2InitMask<>(SB), (NOPTR+RODATA), $32 -GLOBL ·polyClampMask<>(SB), (NOPTR+RODATA), $32 -GLOBL ·andMask<>(SB), (NOPTR+RODATA), $240 -// No PALIGNR in Go ASM yet (but VPALIGNR is present). -#define shiftB0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X3, X3 -#define shiftB1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x04 // PALIGNR $4, X4, X4 -#define shiftB2Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X5, X5 -#define shiftB3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x04 // PALIGNR $4, X13, X13 -#define shiftC0Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X6, X6 -#define shiftC1Left BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x08 // PALIGNR $8, X7, X7 -#define shiftC2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc0; BYTE $0x08 // PALIGNR $8, X8, X8 -#define shiftC3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xf6; BYTE $0x08 // PALIGNR $8, X14, X14 -#define shiftD0Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x0c // PALIGNR $12, X9, X9 -#define shiftD1Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x0c // PALIGNR $12, X10, X10 -#define shiftD2Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X11, X11 -#define shiftD3Left BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x0c // PALIGNR $12, X15, X15 -#define shiftB0Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x0c // PALIGNR $12, X3, X3 -#define shiftB1Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xe4; BYTE $0x0c // PALIGNR $12, X4, X4 -#define shiftB2Right BYTE $0x66; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR $12, X5, X5 -#define shiftB3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xed; BYTE $0x0c // PALIGNR 
$12, X13, X13 -#define shiftC0Right shiftC0Left -#define shiftC1Right shiftC1Left -#define shiftC2Right shiftC2Left -#define shiftC3Right shiftC3Left -#define shiftD0Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xc9; BYTE $0x04 // PALIGNR $4, X9, X9 -#define shiftD1Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xd2; BYTE $0x04 // PALIGNR $4, X10, X10 -#define shiftD2Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xdb; BYTE $0x04 // PALIGNR $4, X11, X11 -#define shiftD3Right BYTE $0x66; BYTE $0x45; BYTE $0x0f; BYTE $0x3a; BYTE $0x0f; BYTE $0xff; BYTE $0x04 // PALIGNR $4, X15, X15 - -// Some macros - -// ROL rotates the uint32s in register R left by N bits, using temporary T. -#define ROL(N, R, T) \ - MOVO R, T; PSLLL $(N), T; PSRLL $(32-(N)), R; PXOR T, R - -// ROL16 rotates the uint32s in register R left by 16, using temporary T if needed. -#ifdef GOAMD64_v2 -#define ROL16(R, T) PSHUFB ·rol16<>(SB), R -#else -#define ROL16(R, T) ROL(16, R, T) -#endif - -// ROL8 rotates the uint32s in register R left by 8, using temporary T if needed. -#ifdef GOAMD64_v2 -#define ROL8(R, T) PSHUFB ·rol8<>(SB), R -#else -#define ROL8(R, T) ROL(8, R, T) -#endif - -#define chachaQR(A, B, C, D, T) \ - PADDD B, A; PXOR A, D; ROL16(D, T) \ - PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $12, T; PSRLL $20, B; PXOR T, B \ - PADDD B, A; PXOR A, D; ROL8(D, T) \ - PADDD D, C; PXOR C, B; MOVO B, T; PSLLL $7, T; PSRLL $25, B; PXOR T, B - -#define chachaQR_AVX2(A, B, C, D, T) \ - VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol16<>(SB), D, D \ - VPADDD D, C, C; VPXOR C, B, B; VPSLLD $12, B, T; VPSRLD $20, B, B; VPXOR T, B, B \ - VPADDD B, A, A; VPXOR A, D, D; VPSHUFB ·rol8<>(SB), D, D \ - VPADDD D, C, C; VPXOR C, B, B; VPSLLD $7, B, T; VPSRLD $25, B, B; VPXOR T, B, B - -#define polyAdd(S) ADDQ S, acc0; ADCQ 8+S, acc1; ADCQ $1, acc2 -#define polyMulStage1 MOVQ (0*8)(BP), AX; MOVQ AX, t2; MULQ acc0; MOVQ AX, t0; MOVQ DX, t1; MOVQ (0*8)(BP), AX; MULQ acc1; IMULQ acc2, t2; ADDQ AX, t1; ADCQ DX, t2 -#define polyMulStage2 MOVQ (1*8)(BP), AX; MOVQ AX, t3; MULQ acc0; ADDQ AX, t1; ADCQ $0, DX; MOVQ DX, acc0; MOVQ (1*8)(BP), AX; MULQ acc1; ADDQ AX, t2; ADCQ $0, DX -#define polyMulStage3 IMULQ acc2, t3; ADDQ acc0, t2; ADCQ DX, t3 -#define polyMulReduceStage MOVQ t0, acc0; MOVQ t1, acc1; MOVQ t2, acc2; ANDQ $3, acc2; MOVQ t2, t0; ANDQ $-4, t0; MOVQ t3, t1; SHRQ $2, t3, t2; SHRQ $2, t3; ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $0, acc2; ADDQ t2, acc0; ADCQ t3, acc1; ADCQ $0, acc2 - -#define polyMulStage1_AVX2 MOVQ (0*8)(BP), DX; MOVQ DX, t2; MULXQ acc0, t0, t1; IMULQ acc2, t2; MULXQ acc1, AX, DX; ADDQ AX, t1; ADCQ DX, t2 -#define polyMulStage2_AVX2 MOVQ (1*8)(BP), DX; MULXQ acc0, acc0, AX; ADDQ acc0, t1; MULXQ acc1, acc1, t3; ADCQ acc1, t2; ADCQ $0, t3 -#define polyMulStage3_AVX2 IMULQ acc2, DX; ADDQ AX, t2; ADCQ DX, t3 - -#define polyMul polyMulStage1; polyMulStage2; polyMulStage3; polyMulReduceStage -#define polyMulAVX2 polyMulStage1_AVX2; polyMulStage2_AVX2; polyMulStage3_AVX2; polyMulReduceStage -// ---------------------------------------------------------------------------- + +// func polyHashADInternal<>() TEXT polyHashADInternal<>(SB), NOSPLIT, $0 - // adp points to beginning of additional data - // itr2 holds ad length - XORQ acc0, acc0 - XORQ acc1, acc1 - XORQ acc2, acc2 - CMPQ itr2, $13 - JNE hashADLoop - -openFastTLSAD: - // Special treatment for the TLS case of 13 bytes - MOVQ (adp), acc0 - MOVQ 5(adp), acc1 - SHRQ $24, acc1 - MOVQ $1, acc2 - polyMul + // Hack: 
Must declare #define macros inside of a function due to Avo constraints + // ROL rotates the uint32s in register R left by N bits, using temporary T. + #define ROL(N, R, T) \ + MOVO R, T; \ + PSLLL $(N), T; \ + PSRLL $(32-(N)), R; \ + PXOR T, R + + // ROL8 rotates the uint32s in register R left by 8, using temporary T if needed. + #ifdef GOAMD64_v2 + #define ROL8(R, T) PSHUFB ·rol8<>(SB), R + #else + #define ROL8(R, T) ROL(8, R, T) + #endif + + // ROL16 rotates the uint32s in register R left by 16, using temporary T if needed. + #ifdef GOAMD64_v2 + #define ROL16(R, T) PSHUFB ·rol16<>(SB), R + #else + #define ROL16(R, T) ROL(16, R, T) + #endif + XORQ R10, R10 + XORQ R11, R11 + XORQ R12, R12 + CMPQ R9, $0x0d + JNE hashADLoop + MOVQ (CX), R10 + MOVQ 5(CX), R11 + SHRQ $0x18, R11 + MOVQ $0x00000001, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 RET hashADLoop: // Hash in 16 byte chunks - CMPQ itr2, $16 - JB hashADTail - polyAdd(0(adp)) - LEAQ (1*16)(adp), adp - SUBQ $16, itr2 - polyMul - JMP hashADLoop + CMPQ R9, $0x10 + JB hashADTail + ADDQ (CX), R10 + ADCQ 8(CX), R11 + ADCQ $0x01, R12 + LEAQ 16(CX), CX + SUBQ $0x10, R9 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + JMP hashADLoop hashADTail: - CMPQ itr2, $0 + CMPQ R9, $0x00 JE hashADDone // Hash last < 16 byte tail - XORQ t0, t0 - XORQ t1, t1 - XORQ t2, t2 - ADDQ itr2, adp + XORQ R13, R13 + XORQ R14, R14 + XORQ R15, R15 + ADDQ R9, CX hashADTailLoop: - SHLQ $8, t0, t1 - SHLQ $8, t0 - MOVB -1(adp), t2 - XORQ t2, t0 - DECQ adp - DECQ itr2 - JNE hashADTailLoop - -hashADTailFinish: - ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 - polyMul - - // Finished AD + SHLQ $0x08, R13, R14 + SHLQ $0x08, R13 + MOVB -1(CX), R15 + XORQ R15, R13 + DECQ CX + DECQ R9 + JNE hashADTailLoop + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + hashADDone: RET -// 
---------------------------------------------------------------------------- -// func chacha20Poly1305Open(dst, key, src, ad []byte) bool -TEXT ·chacha20Poly1305Open(SB), 0, $288-97 +// func chacha20Poly1305Open(dst []byte, key []uint32, src []byte, ad []byte) bool +// Requires: AVX, AVX2, BMI2, CMOV, SSE2 +TEXT ·chacha20Poly1305Open(SB), $288-97 // For aligned stack access MOVQ SP, BP - ADDQ $32, BP + ADDQ $0x20, BP ANDQ $-32, BP - MOVQ dst+0(FP), oup - MOVQ key+24(FP), keyp - MOVQ src+48(FP), inp - MOVQ src_len+56(FP), inl - MOVQ ad+72(FP), adp + MOVQ dst_base+0(FP), DI + MOVQ key_base+24(FP), R8 + MOVQ src_base+48(FP), SI + MOVQ src_len+56(FP), BX + MOVQ ad_base+72(FP), CX // Check for AVX2 support - CMPB ·useAVX2(SB), $1 + CMPB ·useAVX2+0(SB), $0x01 JE chacha20Poly1305Open_AVX2 // Special optimization, for very short buffers - CMPQ inl, $128 - JBE openSSE128 // About 16% faster + CMPQ BX, $0x80 + JBE openSSE128 // For long buffers, prepare the poly key first - MOVOU ·chacha20Constants<>(SB), A0 - MOVOU (1*16)(keyp), B0 - MOVOU (2*16)(keyp), C0 - MOVOU (3*16)(keyp), D0 - MOVO D0, T1 + MOVOU ·chacha20Constants<>+0(SB), X0 + MOVOU 16(R8), X3 + MOVOU 32(R8), X6 + MOVOU 48(R8), X9 + MOVO X9, X13 // Store state on stack for future use - MOVO B0, state1Store - MOVO C0, state2Store - MOVO D0, ctr3Store - MOVQ $10, itr2 + MOVO X3, 32(BP) + MOVO X6, 48(BP) + MOVO X9, 128(BP) + MOVQ $0x0000000a, R9 openSSEPreparePolyKey: - chachaQR(A0, B0, C0, D0, T0) - shiftB0Left; shiftC0Left; shiftD0Left - chachaQR(A0, B0, C0, D0, T0) - shiftB0Right; shiftC0Right; shiftD0Right - DECQ itr2 - JNE openSSEPreparePolyKey + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + DECQ R9 + JNE openSSEPreparePolyKey // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0 + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL 32(BP), X3 // Clamp and store the key - PAND ·polyClampMask<>(SB), A0 - MOVO A0, rStore; MOVO B0, sStore + PAND ·polyClampMask<>+0(SB), X0 + MOVO X0, (BP) + MOVO X3, 16(BP) // Hash AAD - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) openSSEMainLoop: - CMPQ inl, $256 + CMPQ BX, $0x00000100 JB openSSEMainLoopDone // Load state, increment counter blocks - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL 
·sseIncMask<>(SB), D2 - MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X2, X12 + MOVO X5, X13 + MOVO X8, X14 + MOVO X11, X15 + PADDL ·sseIncMask<>+0(SB), X15 // Store counters - MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store + MOVO X9, 80(BP) + MOVO X10, 96(BP) + MOVO X11, 112(BP) + MOVO X15, 128(BP) - // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16 - MOVQ $4, itr1 - MOVQ inp, itr2 + // There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we hash + // 2 blocks, and for the remaining 4 only 1 block - for a total of 16 + MOVQ $0x00000004, CX + MOVQ SI, R9 openSSEInternalLoop: - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyAdd(0(itr2)) - shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left - shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left - shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left - polyMulStage1 - polyMulStage2 - LEAQ (2*8)(itr2), itr2 - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - polyMulStage3 - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyMulReduceStage - shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right - shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right - shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right - DECQ itr1 - JGE openSSEInternalLoop - - polyAdd(0(itr2)) - polyMul - LEAQ (2*8)(itr2), itr2 - - CMPQ itr1, $-6 - JG openSSEInternalLoop + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE 
$0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x0c + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + LEAQ 16(R9), R9 + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + 
BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x04 + DECQ CX + JGE openSSEInternalLoop + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 + CMPQ CX, $-6 + JG openSSEInternalLoop // Add in the state - PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 - PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 - PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 - PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 + PADDD ·chacha20Constants<>+0(SB), X0 + PADDD ·chacha20Constants<>+0(SB), X1 + PADDD ·chacha20Constants<>+0(SB), X2 + PADDD ·chacha20Constants<>+0(SB), X12 + PADDD 32(BP), X3 + PADDD 32(BP), X4 + PADDD 32(BP), X5 + PADDD 32(BP), X13 + PADDD 48(BP), X6 + PADDD 48(BP), X7 + PADDD 48(BP), X8 + PADDD 48(BP), X14 + PADDD 80(BP), X9 + PADDD 96(BP), X10 + PADDD 112(BP), X11 + PADDD 128(BP), X15 // Load - xor - store - MOVO D3, tmpStore - MOVOU (0*16)(inp), D3; PXOR D3, A0; MOVOU A0, (0*16)(oup) - MOVOU (1*16)(inp), D3; PXOR D3, B0; MOVOU B0, (1*16)(oup) - MOVOU (2*16)(inp), D3; PXOR D3, C0; MOVOU C0, (2*16)(oup) - MOVOU (3*16)(inp), D3; PXOR D3, D0; MOVOU D0, (3*16)(oup) - MOVOU (4*16)(inp), D0; PXOR D0, A1; MOVOU A1, (4*16)(oup) - MOVOU (5*16)(inp), D0; PXOR D0, B1; MOVOU B1, (5*16)(oup) - MOVOU (6*16)(inp), D0; PXOR D0, C1; MOVOU C1, (6*16)(oup) - MOVOU (7*16)(inp), D0; PXOR D0, D1; MOVOU D1, (7*16)(oup) - MOVOU (8*16)(inp), D0; PXOR D0, A2; MOVOU A2, (8*16)(oup) - MOVOU (9*16)(inp), D0; PXOR D0, B2; MOVOU B2, (9*16)(oup) - MOVOU (10*16)(inp), D0; PXOR D0, C2; MOVOU C2, (10*16)(oup) - MOVOU (11*16)(inp), D0; PXOR D0, D2; MOVOU D2, (11*16)(oup) - MOVOU (12*16)(inp), D0; PXOR D0, A3; MOVOU A3, (12*16)(oup) - MOVOU (13*16)(inp), D0; PXOR D0, B3; MOVOU B3, (13*16)(oup) - MOVOU (14*16)(inp), D0; PXOR D0, C3; MOVOU C3, (14*16)(oup) - MOVOU (15*16)(inp), D0; PXOR tmpStore, D0; MOVOU D0, (15*16)(oup) - LEAQ 256(inp), inp - LEAQ 256(oup), oup - SUBQ $256, inl + MOVO X15, 64(BP) + MOVOU (SI), X15 + PXOR X15, X0 + MOVOU X0, (DI) + MOVOU 16(SI), X15 + PXOR X15, X3 + MOVOU X3, 16(DI) + MOVOU 32(SI), X15 + PXOR X15, X6 + MOVOU X6, 32(DI) + MOVOU 48(SI), X15 + PXOR X15, X9 + MOVOU X9, 48(DI) + MOVOU 64(SI), X9 + PXOR X9, X1 + MOVOU X1, 64(DI) + MOVOU 80(SI), X9 + PXOR X9, X4 + MOVOU X4, 80(DI) + MOVOU 96(SI), X9 + PXOR X9, X7 + MOVOU X7, 96(DI) + MOVOU 112(SI), X9 + PXOR X9, X10 + MOVOU X10, 112(DI) + MOVOU 128(SI), X9 + PXOR X9, X2 + MOVOU X2, 128(DI) + MOVOU 144(SI), X9 + PXOR X9, X5 + MOVOU X5, 144(DI) + MOVOU 160(SI), X9 + PXOR X9, X8 + MOVOU X8, 160(DI) + MOVOU 176(SI), X9 + PXOR X9, X11 + MOVOU X11, 176(DI) + MOVOU 192(SI), X9 + PXOR X9, X12 + MOVOU X12, 192(DI) + MOVOU 208(SI), X9 + PXOR X9, X13 + MOVOU X13, 208(DI) + MOVOU 
224(SI), X9 + PXOR X9, X14 + MOVOU X14, 224(DI) + MOVOU 240(SI), X9 + PXOR 64(BP), X9 + MOVOU X9, 240(DI) + LEAQ 256(SI), SI + LEAQ 256(DI), DI + SUBQ $0x00000100, BX JMP openSSEMainLoop openSSEMainLoopDone: // Handle the various tail sizes efficiently - TESTQ inl, inl + TESTQ BX, BX JE openSSEFinalize - CMPQ inl, $64 + CMPQ BX, $0x40 JBE openSSETail64 - CMPQ inl, $128 + CMPQ BX, $0x80 JBE openSSETail128 - CMPQ inl, $192 + CMPQ BX, $0xc0 JBE openSSETail192 JMP openSSETail256 openSSEFinalize: // Hash in the PT, AAD lengths - ADDQ ad_len+80(FP), acc0; ADCQ src_len+56(FP), acc1; ADCQ $1, acc2 - polyMul + ADDQ ad_len+80(FP), R10 + ADCQ src_len+56(FP), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Final reduce - MOVQ acc0, t0 - MOVQ acc1, t1 - MOVQ acc2, t2 - SUBQ $-5, acc0 - SBBQ $-1, acc1 - SBBQ $3, acc2 - CMOVQCS t0, acc0 - CMOVQCS t1, acc1 - CMOVQCS t2, acc2 + MOVQ R10, R13 + MOVQ R11, R14 + MOVQ R12, R15 + SUBQ $-5, R10 + SBBQ $-1, R11 + SBBQ $0x03, R12 + CMOVQCS R13, R10 + CMOVQCS R14, R11 + CMOVQCS R15, R12 // Add in the "s" part of the key - ADDQ 0+sStore, acc0 - ADCQ 8+sStore, acc1 + ADDQ 16(BP), R10 + ADCQ 24(BP), R11 // Finally, constant time compare to the tag at the end of the message XORQ AX, AX - MOVQ $1, DX - XORQ (0*8)(inp), acc0 - XORQ (1*8)(inp), acc1 - ORQ acc1, acc0 + MOVQ $0x00000001, DX + XORQ (SI), R10 + XORQ 8(SI), R11 + ORQ R11, R10 CMOVQEQ DX, AX // Return true iff tags are equal MOVB AX, ret+96(FP) RET -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 129 bytes openSSE128: - // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks - MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 - MOVQ $10, itr2 + MOVOU ·chacha20Constants<>+0(SB), X0 + MOVOU 16(R8), X3 + MOVOU 32(R8), X6 + MOVOU 48(R8), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X3, X13 + MOVO X6, X14 + MOVO X10, X15 + MOVQ $0x0000000a, R9 openSSE128InnerCipherLoop: - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Left; shiftB1Left; shiftB2Left - shiftC0Left; shiftC1Left; shiftC2Left - shiftD0Left; shiftD1Left; shiftD2Left - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Right; shiftB1Right; shiftB2Right - shiftC0Right; shiftC1Right; shiftC2Right - shiftD0Right; shiftD1Right; shiftD2Right - DECQ itr2 - JNE openSSE128InnerCipherLoop + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL 
$0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + DECQ R9 + JNE openSSE128InnerCipherLoop // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 - PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 - PADDL T2, C1; PADDL T2, C2 - PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2 + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 
+ PADDL ·chacha20Constants<>+0(SB), X2 + PADDL X13, X3 + PADDL X13, X4 + PADDL X13, X5 + PADDL X14, X7 + PADDL X14, X8 + PADDL X15, X10 + PADDL ·sseIncMask<>+0(SB), X15 + PADDL X15, X11 // Clamp and store the key - PAND ·polyClampMask<>(SB), A0 - MOVOU A0, rStore; MOVOU B0, sStore + PAND ·polyClampMask<>+0(SB), X0 + MOVOU X0, (BP) + MOVOU X3, 16(BP) // Hash - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) openSSE128Open: - CMPQ inl, $16 + CMPQ BX, $0x10 JB openSSETail16 - SUBQ $16, inl + SUBQ $0x10, BX // Load for hashing - polyAdd(0(inp)) + ADDQ (SI), R10 + ADCQ 8(SI), R11 + ADCQ $0x01, R12 // Load for decryption - MOVOU (inp), T0; PXOR T0, A1; MOVOU A1, (oup) - LEAQ (1*16)(inp), inp - LEAQ (1*16)(oup), oup - polyMul + MOVOU (SI), X12 + PXOR X12, X1 + MOVOU X1, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Shift the stream "left" - MOVO B1, A1 - MOVO C1, B1 - MOVO D1, C1 - MOVO A2, D1 - MOVO B2, A2 - MOVO C2, B2 - MOVO D2, C2 + MOVO X4, X1 + MOVO X7, X4 + MOVO X10, X7 + MOVO X2, X10 + MOVO X5, X2 + MOVO X8, X5 + MOVO X11, X8 JMP openSSE128Open openSSETail16: - TESTQ inl, inl + TESTQ BX, BX JE openSSEFinalize // We can safely load the CT from the end, because it is padded with the MAC - MOVQ inl, itr2 - SHLQ $4, itr2 - LEAQ ·andMask<>(SB), t0 - MOVOU (inp), T0 - ADDQ inl, inp - PAND -16(t0)(itr2*1), T0 - MOVO T0, 0+tmpStore - MOVQ T0, t0 - MOVQ 8+tmpStore, t1 - PXOR A1, T0 + MOVQ BX, R9 + SHLQ $0x04, R9 + LEAQ ·andMask<>+0(SB), R13 + MOVOU (SI), X12 + ADDQ BX, SI + PAND -16(R13)(R9*1), X12 + MOVO X12, 64(BP) + MOVQ X12, R13 + MOVQ 72(BP), R14 + PXOR X1, X12 // We can only store one byte at a time, since plaintext can be shorter than 16 bytes openSSETail16Store: - MOVQ T0, t3 - MOVB t3, (oup) - PSRLDQ $1, T0 - INCQ oup - DECQ inl + MOVQ X12, R8 + MOVB R8, (DI) + PSRLDQ $0x01, X12 + INCQ DI + DECQ BX JNE openSSETail16Store - ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 - polyMul + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 JMP openSSEFinalize -// ---------------------------------------------------------------------------- -// Special optimization for the last 64 bytes of ciphertext openSSETail64: - // Need to decrypt up to 64 bytes - prepare single block - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, 
ctr0Store - XORQ itr2, itr2 - MOVQ inl, itr1 - CMPQ itr1, $16 - JB openSSETail64LoopB + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 80(BP) + XORQ R9, R9 + MOVQ BX, CX + CMPQ CX, $0x10 + JB openSSETail64LoopB openSSETail64LoopA: - // Perform ChaCha rounds, while hashing the remaining input - polyAdd(0(inp)(itr2*1)) - polyMul - SUBQ $16, itr1 + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + SUBQ $0x10, CX openSSETail64LoopB: - ADDQ $16, itr2 - chachaQR(A0, B0, C0, D0, T0) - shiftB0Left; shiftC0Left; shiftD0Left - chachaQR(A0, B0, C0, D0, T0) - shiftB0Right; shiftC0Right; shiftD0Right - - CMPQ itr1, $16 - JAE openSSETail64LoopA - - CMPQ itr2, $160 - JNE openSSETail64LoopB - - PADDL ·chacha20Constants<>(SB), A0; PADDL state1Store, B0; PADDL state2Store, C0; PADDL ctr0Store, D0 + ADDQ $0x10, R9 + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + CMPQ CX, $0x10 + JAE openSSETail64LoopA + CMPQ R9, $0xa0 + JNE openSSETail64LoopB + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL 32(BP), X3 + PADDL 48(BP), X6 + PADDL 80(BP), X9 openSSETail64DecLoop: - CMPQ inl, $16 + CMPQ BX, $0x10 JB openSSETail64DecLoopDone - SUBQ $16, inl - MOVOU (inp), T0 - PXOR T0, A0 - MOVOU A0, (oup) - LEAQ 16(inp), inp - LEAQ 16(oup), oup - MOVO B0, A0 - MOVO C0, B0 - MOVO D0, C0 + SUBQ $0x10, BX + MOVOU (SI), X12 + PXOR X12, X0 + MOVOU X0, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + MOVO X3, X0 + MOVO X6, X3 + MOVO X9, X6 JMP openSSETail64DecLoop openSSETail64DecLoopDone: - MOVO A0, A1 + MOVO X0, X1 JMP openSSETail16 -// ---------------------------------------------------------------------------- -// Special optimization for the last 128 bytes of ciphertext openSSETail128: - // Need to decrypt up to 128 bytes - prepare two blocks - MOVO ·chacha20Constants<>(SB), A1; MOVO state1Store, B1; MOVO state2Store, C1; MOVO ctr3Store, D1; PADDL 
·sseIncMask<>(SB), D1; MOVO D1, ctr0Store - MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr1Store - XORQ itr2, itr2 - MOVQ inl, itr1 - ANDQ $-16, itr1 + MOVO ·chacha20Constants<>+0(SB), X1 + MOVO 32(BP), X4 + MOVO 48(BP), X7 + MOVO 128(BP), X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 80(BP) + MOVO X1, X0 + MOVO X4, X3 + MOVO X7, X6 + MOVO X10, X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 96(BP) + XORQ R9, R9 + MOVQ BX, CX + ANDQ $-16, CX openSSETail128LoopA: - // Perform ChaCha rounds, while hashing the remaining input - polyAdd(0(inp)(itr2*1)) - polyMul + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 openSSETail128LoopB: - ADDQ $16, itr2 - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) - shiftB0Left; shiftC0Left; shiftD0Left - shiftB1Left; shiftC1Left; shiftD1Left - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) - shiftB0Right; shiftC0Right; shiftD0Right - shiftB1Right; shiftC1Right; shiftD1Right - - CMPQ itr2, itr1 - JB openSSETail128LoopA - - CMPQ itr2, $160 - JNE openSSETail128LoopB - - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1 - PADDL state1Store, B0; PADDL state1Store, B1 - PADDL state2Store, C0; PADDL state2Store, C1 - PADDL ctr1Store, D0; PADDL ctr0Store, D1 - - MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 - PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 - MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup) - - SUBQ $64, inl - LEAQ 64(inp), inp - LEAQ 64(oup), oup - JMP openSSETail64DecLoop - -// ---------------------------------------------------------------------------- -// Special optimization for the last 192 bytes of ciphertext + ADDQ $0x10, R9 + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL 
$0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + CMPQ R9, CX + JB openSSETail128LoopA + CMPQ R9, $0xa0 + JNE openSSETail128LoopB + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL 32(BP), X3 + PADDL 32(BP), X4 + PADDL 48(BP), X6 + PADDL 48(BP), X7 + PADDL 96(BP), X9 + PADDL 80(BP), X10 + MOVOU (SI), X12 + MOVOU 16(SI), X13 + MOVOU 32(SI), X14 + MOVOU 48(SI), X15 + PXOR X12, X1 + PXOR X13, X4 + PXOR X14, X7 + PXOR X15, X10 + MOVOU X1, (DI) + MOVOU X4, 16(DI) + MOVOU X7, 32(DI) + MOVOU X10, 48(DI) + SUBQ $0x40, BX + LEAQ 64(SI), SI + LEAQ 64(DI), DI + JMP openSSETail64DecLoop + openSSETail192: - // Need to decrypt up to 192 bytes - prepare three blocks - MOVO ·chacha20Constants<>(SB), A2; MOVO state1Store, B2; MOVO state2Store, C2; MOVO ctr3Store, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr0Store - MOVO A2, A1; MOVO B2, B1; MOVO C2, C1; MOVO D2, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store - MOVO A1, A0; MOVO B1, B0; MOVO C1, C0; MOVO D1, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr2Store - - MOVQ inl, itr1 - MOVQ $160, itr2 - CMPQ itr1, $160 - CMOVQGT itr2, itr1 - ANDQ $-16, itr1 - XORQ itr2, itr2 + MOVO ·chacha20Constants<>+0(SB), X2 + MOVO 32(BP), X5 + MOVO 48(BP), X8 + MOVO 128(BP), X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X11, 80(BP) + MOVO X2, X1 + MOVO X5, X4 + MOVO X8, X7 + MOVO X11, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 96(BP) + MOVO X1, X0 + MOVO X4, X3 + MOVO X7, X6 + MOVO X10, X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 112(BP) + MOVQ BX, CX + MOVQ $0x000000a0, R9 + CMPQ CX, $0xa0 + CMOVQGT R9, CX + ANDQ $-16, CX + XORQ R9, R9 openSSLTail192LoopA: - // Perform ChaCha rounds, while hashing the remaining input - polyAdd(0(inp)(itr2*1)) - polyMul + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 openSSLTail192LoopB: - ADDQ $16, itr2 - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Left; shiftC0Left; shiftD0Left - shiftB1Left; shiftC1Left; shiftD1Left - shiftB2Left; 
shiftC2Left; shiftD2Left - - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Right; shiftC0Right; shiftD0Right - shiftB1Right; shiftC1Right; shiftD1Right - shiftB2Right; shiftC2Right; shiftD2Right - - CMPQ itr2, itr1 - JB openSSLTail192LoopA - - CMPQ itr2, $160 - JNE openSSLTail192LoopB - - CMPQ inl, $176 - JB openSSLTail192Store - - polyAdd(160(inp)) - polyMul - - CMPQ inl, $192 - JB openSSLTail192Store - - polyAdd(176(inp)) - polyMul + ADDQ $0x10, R9 + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c 
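(The long runs of BYTE directives here hand-encode SSSE3 PALIGNR instructions — 66 0F 3A 0F is the opcode, the final byte the rotate amount — replacing the shiftB/C/D macros of the old source. They rotate the b, c, and d rows of each ChaCha state block between the column and diagonal half-rounds. A minimal pure-Go sketch of that diagonalization step, with illustrative names only; the vendored code keeps every row in an XMM register:

    package sketch

    // diagonalize rotates rows 1, 2, 3 of a 4x4 ChaCha state left by
    // 1, 2, 3 lanes (PALIGNR $4, $8, $12), so the next quarter rounds
    // act on diagonals; the inverse ($12, $8, $4) restores column order.
    func diagonalize(state *[16]uint32) {
        rotateRow := func(row, lanes int) {
            var tmp [4]uint32
            for i := range tmp {
                tmp[i] = state[4*row+(i+lanes)%4]
            }
            copy(state[4*row:], tmp[:])
        }
        rotateRow(1, 1) // b row, PALIGNR $4
        rotateRow(2, 2) // c row, PALIGNR $8
        rotateRow(3, 3) // d row, PALIGNR $12
    }
)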
+ BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + CMPQ R9, CX + JB openSSLTail192LoopA + CMPQ R9, $0xa0 + JNE openSSLTail192LoopB + CMPQ BX, $0xb0 + JB openSSLTail192Store + ADDQ 160(SI), R10 + ADCQ 168(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + CMPQ BX, $0xc0 + JB openSSLTail192Store + ADDQ 176(SI), R10 + ADCQ 184(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 openSSLTail192Store: - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 - PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 - PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 - PADDL ctr2Store, D0; PADDL ctr1Store, D1; PADDL ctr0Store, D2 - - MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 - PXOR T0, A2; PXOR T1, B2; PXOR T2, C2; PXOR T3, D2 - MOVOU A2, (0*16)(oup); MOVOU B2, (1*16)(oup); MOVOU C2, (2*16)(oup); MOVOU D2, (3*16)(oup) - - MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3 - PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 - MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) - - SUBQ $128, inl - LEAQ 128(inp), inp - LEAQ 128(oup), oup - JMP openSSETail64DecLoop - -// ---------------------------------------------------------------------------- -// Special optimization for the last 256 bytes of ciphertext + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL ·chacha20Constants<>+0(SB), X2 + PADDL 32(BP), X3 + PADDL 32(BP), X4 + PADDL 32(BP), X5 + PADDL 48(BP), X6 + PADDL 48(BP), X7 + PADDL 48(BP), X8 + PADDL 112(BP), X9 + PADDL 96(BP), X10 + PADDL 80(BP), X11 + MOVOU (SI), X12 + MOVOU 16(SI), X13 + MOVOU 32(SI), X14 + MOVOU 48(SI), X15 + PXOR X12, X2 + PXOR X13, X5 + PXOR X14, X8 + PXOR X15, X11 + MOVOU X2, (DI) + MOVOU X5, 16(DI) + MOVOU X8, 32(DI) + MOVOU X11, 48(DI) + MOVOU 64(SI), X12 + MOVOU 80(SI), X13 + MOVOU 96(SI), X14 + MOVOU 112(SI), X15 + PXOR X12, X1 + PXOR X13, X4 + PXOR X14, X7 + PXOR X15, X10 + MOVOU X1, 64(DI) + MOVOU X4, 80(DI) + MOVOU X7, 96(DI) + MOVOU X10, 112(DI) + SUBQ $0x80, BX + LEAQ 128(SI), SI + LEAQ 128(DI), DI + JMP openSSETail64DecLoop + openSSETail256: - // Need to decrypt up to 256 
bytes - prepare four blocks - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X2, X12 + MOVO X5, X13 + MOVO X8, X14 + MOVO X11, X15 + PADDL ·sseIncMask<>+0(SB), X15 // Store counters - MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store - XORQ itr2, itr2 + MOVO X9, 80(BP) + MOVO X10, 96(BP) + MOVO X11, 112(BP) + MOVO X15, 128(BP) + XORQ R9, R9 openSSETail256Loop: - // This loop inteleaves 8 ChaCha quarter rounds with 1 poly multiplication - polyAdd(0(inp)(itr2*1)) - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left - shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left - shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left - polyMulStage1 - polyMulStage2 - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyMulStage3 - polyMulReduceStage - shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right - shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right - shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right - ADDQ $2*8, itr2 - CMPQ itr2, $160 - JB openSSETail256Loop - MOVQ inl, itr1 - ANDQ $-16, itr1 + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + 
BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x0c + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + 
BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x04 + ADDQ $0x10, R9 + CMPQ R9, $0xa0 + JB openSSETail256Loop + MOVQ BX, CX + ANDQ $-16, CX openSSETail256HashLoop: - polyAdd(0(inp)(itr2*1)) - polyMul - ADDQ $2*8, itr2 - CMPQ itr2, itr1 - JB openSSETail256HashLoop + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ $0x10, R9 + CMPQ R9, CX + JB openSSETail256HashLoop // Add in the state - PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 - PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 - PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 - PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 - MOVO D3, tmpStore + PADDD ·chacha20Constants<>+0(SB), X0 + PADDD ·chacha20Constants<>+0(SB), X1 + PADDD ·chacha20Constants<>+0(SB), X2 + PADDD ·chacha20Constants<>+0(SB), X12 + PADDD 32(BP), X3 + PADDD 32(BP), X4 + PADDD 32(BP), X5 + PADDD 32(BP), X13 + PADDD 48(BP), X6 + PADDD 48(BP), X7 + PADDD 48(BP), X8 + PADDD 48(BP), X14 + PADDD 80(BP), X9 + PADDD 96(BP), X10 + PADDD 112(BP), X11 + PADDD 128(BP), X15 + MOVO X15, 64(BP) // Load - xor - store - MOVOU (0*16)(inp), D3; PXOR D3, A0 - MOVOU (1*16)(inp), D3; PXOR D3, B0 - MOVOU (2*16)(inp), D3; PXOR D3, C0 - MOVOU (3*16)(inp), D3; PXOR D3, D0 - MOVOU A0, (0*16)(oup) - MOVOU B0, (1*16)(oup) - MOVOU C0, (2*16)(oup) - MOVOU D0, (3*16)(oup) - MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 - PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 - MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) - MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0 - PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 - MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup) - LEAQ 192(inp), inp - LEAQ 192(oup), oup - SUBQ $192, inl - MOVO A3, A0 - MOVO B3, B0 - MOVO C3, C0 - MOVO tmpStore, D0 - - JMP openSSETail64DecLoop - -// ---------------------------------------------------------------------------- -// ------------------------- AVX2 Code ---------------------------------------- + MOVOU (SI), X15 + PXOR X15, X0 + MOVOU 16(SI), X15 + PXOR X15, X3 + MOVOU 32(SI), X15 + PXOR X15, X6 + MOVOU 48(SI), X15 + PXOR X15, X9 + MOVOU X0, (DI) + MOVOU X3, 16(DI) + MOVOU X6, 32(DI) + MOVOU X9, 48(DI) + MOVOU 64(SI), X0 + MOVOU 80(SI), X3 + MOVOU 96(SI), X6 + MOVOU 112(SI), X9 + PXOR X0, X1 + PXOR X3, X4 + PXOR X6, X7 + PXOR X9, X10 + MOVOU X1, 64(DI) + MOVOU X4, 80(DI) + MOVOU X7, 96(DI) + MOVOU X10, 112(DI) + MOVOU 128(SI), X0 + MOVOU 144(SI), X3 + MOVOU 160(SI), X6 + MOVOU 176(SI), X9 + PXOR X0, X2 + PXOR X3, X5 + PXOR X6, X8 + PXOR X9, X11 + MOVOU X2, 128(DI) + MOVOU X5, 
144(DI) + MOVOU X8, 160(DI) + MOVOU X11, 176(DI) + LEAQ 192(SI), SI + LEAQ 192(DI), DI + SUBQ $0xc0, BX + MOVO X12, X0 + MOVO X13, X3 + MOVO X14, X6 + MOVO 64(BP), X9 + JMP openSSETail64DecLoop + chacha20Poly1305Open_AVX2: VZEROUPPER - VMOVDQU ·chacha20Constants<>(SB), AA0 - BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 - BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 - BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 - VPADDD ·avx2InitMask<>(SB), DD0, DD0 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + BYTE $0xc4 + BYTE $0x42 + BYTE $0x7d + BYTE $0x5a + BYTE $0x70 + BYTE $0x10 + BYTE $0xc4 + BYTE $0x42 + BYTE $0x7d + BYTE $0x5a + BYTE $0x60 + BYTE $0x20 + BYTE $0xc4 + BYTE $0xc2 + BYTE $0x7d + BYTE $0x5a + BYTE $0x60 + BYTE $0x30 + VPADDD ·avx2InitMask<>+0(SB), Y4, Y4 // Special optimization, for very short buffers - CMPQ inl, $192 + CMPQ BX, $0xc0 JBE openAVX2192 - CMPQ inl, $320 + CMPQ BX, $0x00000140 JBE openAVX2320 // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream - VMOVDQA BB0, state1StoreAVX2 - VMOVDQA CC0, state2StoreAVX2 - VMOVDQA DD0, ctr3StoreAVX2 - MOVQ $10, itr2 + VMOVDQA Y14, 32(BP) + VMOVDQA Y12, 64(BP) + VMOVDQA Y4, 192(BP) + MOVQ $0x0000000a, R9 openAVX2PreparePolyKey: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 - DECQ itr2 - JNE openAVX2PreparePolyKey - - VPADDD ·chacha20Constants<>(SB), AA0, AA0 - VPADDD state1StoreAVX2, BB0, BB0 - VPADDD state2StoreAVX2, CC0, CC0 - VPADDD ctr3StoreAVX2, DD0, DD0 - - VPERM2I128 $0x02, AA0, BB0, TT0 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x04, Y4, Y4, Y4 + DECQ R9 + JNE openAVX2PreparePolyKey + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD 32(BP), Y14, Y14 + VPADDD 64(BP), Y12, Y12 + VPADDD 192(BP), Y4, Y4 + VPERM2I128 $0x02, Y0, Y14, Y3 // Clamp and store poly key - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for the first 64 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - VPERM2I128 $0x13, CC0, DD0, BB0 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 // Hash AD + first 64 bytes - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) - XORQ itr1, itr1 + XORQ CX, CX openAVX2InitialHash64: - polyAdd(0(inp)(itr1*1)) - polyMulAVX2 - ADDQ $16, itr1 - 
CMPQ itr1, $64 - JNE openAVX2InitialHash64 + ADDQ (SI)(CX*1), R10 + ADCQ 8(SI)(CX*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ $0x10, CX + CMPQ CX, $0x40 + JNE openAVX2InitialHash64 // Decrypt the first 64 bytes - VPXOR (0*32)(inp), AA0, AA0 - VPXOR (1*32)(inp), BB0, BB0 - VMOVDQU AA0, (0*32)(oup) - VMOVDQU BB0, (1*32)(oup) - LEAQ (2*32)(inp), inp - LEAQ (2*32)(oup), oup - SUBQ $64, inl + VPXOR (SI), Y0, Y0 + VPXOR 32(SI), Y14, Y14 + VMOVDQU Y0, (DI) + VMOVDQU Y14, 32(DI) + LEAQ 64(SI), SI + LEAQ 64(DI), DI + SUBQ $0x40, BX openAVX2MainLoop: - CMPQ inl, $512 + CMPQ BX, $0x00000200 JB openAVX2MainLoopDone // Load state, increment counter blocks, store the incremented counters - VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 - XORQ itr1, itr1 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + XORQ CX, CX openAVX2InternalLoop: - // Lets just say this spaghetti loop interleaves 2 quarter rounds with 3 poly multiplications - // Effectively per 512 bytes of stream we hash 480 bytes of ciphertext - polyAdd(0*8(inp)(itr1*1)) - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - polyMulStage1_AVX2 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - polyMulStage2_AVX2 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyMulStage3_AVX2 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulReduceStage - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR 
AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - polyAdd(2*8(inp)(itr1*1)) - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - polyMulStage1_AVX2 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulStage2_AVX2 - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - polyMulStage3_AVX2 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - polyMulReduceStage - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyAdd(4*8(inp)(itr1*1)) - LEAQ (6*8)(itr1), itr1 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulStage1_AVX2 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - polyMulStage2_AVX2 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - polyMulStage3_AVX2 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulReduceStage - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 - CMPQ itr1, $480 + ADDQ (SI)(CX*1), R10 + ADCQ 8(SI)(CX*1), R11 + ADCQ $0x01, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB 
·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + ADDQ 16(SI)(CX*1), R10 + ADCQ 24(SI)(CX*1), R11 + ADCQ $0x01, R12 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 32(SI)(CX*1), R10 + ADCQ 40(SI)(CX*1), R11 + ADCQ $0x01, R12 + LEAQ 48(CX), CX + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD 
$0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + CMPQ CX, $0x000001e0 JNE openAVX2InternalLoop - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - VMOVDQA CC3, tmpStoreAVX2 + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VMOVDQA Y15, 224(BP) // We only hashed 480 of the 512 bytes available - hash the remaining 32 here - polyAdd(480(inp)) - polyMulAVX2 - VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 - VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 - VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; 
VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 - VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) + ADDQ 480(SI), R10 + ADCQ 488(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPERM2I128 $0x02, Y0, Y14, Y15 + VPERM2I128 $0x13, Y0, Y14, Y14 + VPERM2I128 $0x02, Y12, Y4, Y0 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPXOR (SI), Y15, Y15 + VPXOR 32(SI), Y0, Y0 + VPXOR 64(SI), Y14, Y14 + VPXOR 96(SI), Y12, Y12 + VMOVDQU Y15, (DI) + VMOVDQU Y0, 32(DI) + VMOVDQU Y14, 64(DI) + VMOVDQU Y12, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 128(SI), Y0, Y0 + VPXOR 160(SI), Y14, Y14 + VPXOR 192(SI), Y12, Y12 + VPXOR 224(SI), Y4, Y4 + VMOVDQU Y0, 128(DI) + VMOVDQU Y14, 160(DI) + VMOVDQU Y12, 192(DI) + VMOVDQU Y4, 224(DI) // and here - polyAdd(496(inp)) - polyMulAVX2 - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 - VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) - VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 - VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0 - VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup) - LEAQ (32*16)(inp), inp - LEAQ (32*16)(oup), oup - SUBQ $(32*16), inl + ADDQ 496(SI), R10 + ADCQ 504(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 256(SI), Y0, Y0 + VPXOR 288(SI), Y14, Y14 + VPXOR 320(SI), Y12, Y12 + VPXOR 352(SI), Y4, Y4 + VMOVDQU Y0, 256(DI) + VMOVDQU Y14, 288(DI) + VMOVDQU Y12, 320(DI) + VMOVDQU Y4, 352(DI) + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, 224(BP), Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, 224(BP), Y3, Y4 + VPXOR 384(SI), Y0, Y0 + VPXOR 416(SI), Y14, Y14 + VPXOR 448(SI), Y12, Y12 + VPXOR 480(SI), Y4, Y4 + VMOVDQU Y0, 384(DI) + VMOVDQU Y14, 416(DI) + VMOVDQU Y12, 448(DI) + VMOVDQU Y4, 480(DI) + LEAQ 512(SI), SI + LEAQ 512(DI), DI + SUBQ $0x00000200, BX JMP openAVX2MainLoop 
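(Every PADDD/PXOR/PSHUFB/PSLLD+PSRLD cluster in the loop above is one lane-parallel ChaCha20 quarter round — the rol16/rol8 shuffle tables implement the 16- and 8-bit rotates, the shift pairs implement 12 and 7 — and the interleaved scalar MULXQ arithmetic is the Poly1305 update, so the loop hashes 480 of every 512 ciphertext bytes in-line and the remaining 32 just after, as the retained comments note. For cross-checking, the scalar reference of the quarter round, a sketch rather than the vendored implementation:

    package sketch

    import "math/bits"

    // quarterRound is the RFC 8439 ChaCha20 quarter round that the
    // vector code above applies to four or eight blocks at once.
    func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
        a += b
        d = bits.RotateLeft32(d^a, 16)
        c += d
        b = bits.RotateLeft32(b^c, 12)
        a += b
        d = bits.RotateLeft32(d^a, 8)
        c += d
        b = bits.RotateLeft32(b^c, 7)
        return a, b, c, d
    }
)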
openAVX2MainLoopDone: // Handle the various tail sizes efficiently - TESTQ inl, inl + TESTQ BX, BX JE openSSEFinalize - CMPQ inl, $128 + CMPQ BX, $0x80 JBE openAVX2Tail128 - CMPQ inl, $256 + CMPQ BX, $0x00000100 JBE openAVX2Tail256 - CMPQ inl, $384 + CMPQ BX, $0x00000180 JBE openAVX2Tail384 JMP openAVX2Tail512 -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 193 bytes openAVX2192: - // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks - VMOVDQA AA0, AA1 - VMOVDQA BB0, BB1 - VMOVDQA CC0, CC1 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA AA0, AA2 - VMOVDQA BB0, BB2 - VMOVDQA CC0, CC2 - VMOVDQA DD0, DD2 - VMOVDQA DD1, TT3 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VMOVDQA Y4, Y2 + VMOVDQA Y1, Y15 + MOVQ $0x0000000a, R9 openAVX2192InnerCipherLoop: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 - DECQ itr2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + DECQ R9 JNE openAVX2192InnerCipherLoop - VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1 - VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1 - VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1 - VPADDD DD2, DD0, DD0; VPADDD 
TT3, DD1, DD1 - VPERM2I128 $0x02, AA0, BB0, TT0 + VPADDD Y6, Y0, Y0 + VPADDD Y6, Y5, Y5 + VPADDD Y10, Y14, Y14 + VPADDD Y10, Y9, Y9 + VPADDD Y8, Y12, Y12 + VPADDD Y8, Y13, Y13 + VPADDD Y2, Y4, Y4 + VPADDD Y15, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y3 // Clamp and store poly key - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for up to 192 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - VPERM2I128 $0x13, CC0, DD0, BB0 - VPERM2I128 $0x02, AA1, BB1, CC0 - VPERM2I128 $0x02, CC1, DD1, DD0 - VPERM2I128 $0x13, AA1, BB1, AA1 - VPERM2I128 $0x13, CC1, DD1, BB1 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 openAVX2ShortOpen: // Hash - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) openAVX2ShortOpenLoop: - CMPQ inl, $32 + CMPQ BX, $0x20 JB openAVX2ShortTail32 - SUBQ $32, inl + SUBQ $0x20, BX // Load for hashing - polyAdd(0*8(inp)) - polyMulAVX2 - polyAdd(2*8(inp)) - polyMulAVX2 + ADDQ (SI), R10 + ADCQ 8(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ 16(SI), R10 + ADCQ 24(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Load for decryption - VPXOR (inp), AA0, AA0 - VMOVDQU AA0, (oup) - LEAQ (1*32)(inp), inp - LEAQ (1*32)(oup), oup + VPXOR (SI), Y0, Y0 + VMOVDQU Y0, (DI) + LEAQ 32(SI), SI + LEAQ 32(DI), DI // Shift stream left - VMOVDQA BB0, AA0 - VMOVDQA CC0, BB0 - VMOVDQA DD0, CC0 - VMOVDQA AA1, DD0 - VMOVDQA BB1, AA1 - VMOVDQA CC1, BB1 - VMOVDQA DD1, CC1 - VMOVDQA AA2, DD1 - VMOVDQA BB2, AA2 + VMOVDQA Y14, Y0 + VMOVDQA Y12, Y14 + VMOVDQA Y4, Y12 + VMOVDQA Y5, Y4 + VMOVDQA Y9, Y5 + VMOVDQA Y13, Y9 + VMOVDQA Y1, Y13 + VMOVDQA Y6, Y1 + VMOVDQA Y10, Y6 JMP openAVX2ShortOpenLoop openAVX2ShortTail32: - CMPQ inl, $16 - VMOVDQA A0, A1 + CMPQ BX, $0x10 + VMOVDQA X0, X1 JB openAVX2ShortDone - - SUBQ $16, inl + SUBQ $0x10, BX // Load for hashing - polyAdd(0*8(inp)) - polyMulAVX2 + ADDQ (SI), R10 + ADCQ 8(SI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 
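(The scalar ADDQ/ADCQ and MULQ/MULXQ runs threaded through this file are the Poly1305 accumulator update that the old polyAdd/polyMul macros expanded to; the ANDQ $0x03 / ANDQ $-4 / SHRQ $0x02 tail is the reduction modulo 2^130 - 5. A hedged pure-Go sketch of one block update — limb names are illustrative, not from the vendored code; x/crypto's portable version lives in internal/poly1305/sum_generic.go:

    package sketch

    import "math/bits"

    // polyUpdate absorbs one full 16-byte block m1:m0 into the 130-bit
    // accumulator h2:h1:h0 and multiplies by the clamped key half r1:r0,
    // reducing modulo 2^130 - 5. Since 2^130 = 5 (mod 2^130 - 5), the
    // bits at and above position 130 fold back in as cc + cc>>2, which
    // is the SHRQ $0x02 / ANDQ $-4 sequence in the assembly.
    func polyUpdate(h0, h1, h2, r0, r1, m0, m1 uint64) (uint64, uint64, uint64) {
        var c uint64
        h0, c = bits.Add64(h0, m0, 0)
        h1, c = bits.Add64(h1, m1, c)
        h2 += c + 1 // the 2^128 pad bit (ADCQ $0x01)

        // 256-bit product h*r as limbs t3:t2:t1:t0; clamping r keeps
        // the h2 cross terms within single 64-bit limbs.
        hi00, lo00 := bits.Mul64(h0, r0)
        hi10, lo10 := bits.Mul64(h1, r0)
        hi01, lo01 := bits.Mul64(h0, r1)
        hi11, lo11 := bits.Mul64(h1, r1)
        t0 := lo00
        t1, c1 := bits.Add64(hi00, lo10, 0)
        t2, c2 := bits.Add64(hi10, hi01, c1)
        t3 := hi11 + c2
        t1, c1 = bits.Add64(t1, lo01, 0)
        t2, c2 = bits.Add64(t2, lo11, c1)
        t3 += c2
        t2, c2 = bits.Add64(t2, h2*r0, 0)
        t3 += c2 + h2*r1

        // Keep the low 130 bits; fold the rest back as cc + cc>>2.
        h0, h1, h2 = t0, t1, t2&3
        cc0, cc1 := t2&^uint64(3), t3
        h0, c = bits.Add64(h0, cc0, 0)
        h1, c = bits.Add64(h1, cc1, c)
        h2 += c
        cc0, cc1 = cc0>>2|cc1<<62, cc1>>2
        h0, c = bits.Add64(h0, cc0, 0)
        h1, c = bits.Add64(h1, cc1, c)
        h2 += c
        return h0, h1, h2
    }
)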
+ ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Load for decryption - VPXOR (inp), A0, T0 - VMOVDQU T0, (oup) - LEAQ (1*16)(inp), inp - LEAQ (1*16)(oup), oup - VPERM2I128 $0x11, AA0, AA0, AA0 - VMOVDQA A0, A1 + VPXOR (SI), X0, X12 + VMOVDQU X12, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + VPERM2I128 $0x11, Y0, Y0, Y0 + VMOVDQA X0, X1 openAVX2ShortDone: VZEROUPPER JMP openSSETail16 -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 321 bytes openAVX2320: - // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks - VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2 - VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y14, Y7 + VMOVDQA Y12, Y11 + VMOVDQA Y4, Y15 + MOVQ $0x0000000a, R9 openAVX2320InnerCipherLoop: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 - DECQ itr2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, 
Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + DECQ R9 JNE openAVX2320InnerCipherLoop - - VMOVDQA ·chacha20Constants<>(SB), TT0 - VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 - VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 - VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 - VMOVDQA ·avx2IncMask<>(SB), TT0 - VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 - VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 - VPADDD TT3, DD2, DD2 + VMOVDQA ·chacha20Constants<>+0(SB), Y3 + VPADDD Y3, Y0, Y0 + VPADDD Y3, Y5, Y5 + VPADDD Y3, Y6, Y6 + VPADDD Y7, Y14, Y14 + VPADDD Y7, Y9, Y9 + VPADDD Y7, Y10, Y10 + VPADDD Y11, Y12, Y12 + VPADDD Y11, Y13, Y13 + VPADDD Y11, Y8, Y8 + VMOVDQA ·avx2IncMask<>+0(SB), Y3 + VPADDD Y15, Y4, Y4 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y1, Y1 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y2, Y2 // Clamp and store poly key - VPERM2I128 $0x02, AA0, BB0, TT0 - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for up to 320 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - VPERM2I128 $0x13, CC0, DD0, BB0 - VPERM2I128 $0x02, AA1, BB1, CC0 - VPERM2I128 $0x02, CC1, DD1, DD0 - VPERM2I128 $0x13, AA1, BB1, AA1 - VPERM2I128 $0x13, CC1, DD1, BB1 - VPERM2I128 $0x02, AA2, BB2, CC1 - VPERM2I128 $0x02, CC2, DD2, DD1 - VPERM2I128 $0x13, AA2, BB2, AA2 - VPERM2I128 $0x13, CC2, DD2, BB2 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 + VPERM2I128 $0x02, Y6, Y10, Y13 + VPERM2I128 $0x02, Y8, Y2, Y1 + VPERM2I128 $0x13, Y6, Y10, Y6 + VPERM2I128 $0x13, Y8, Y2, Y10 JMP openAVX2ShortOpen -// ---------------------------------------------------------------------------- -// Special optimization for the last 128 bytes of ciphertext openAVX2Tail128: // Need to decrypt up to 128 bytes - prepare two blocks - VMOVDQA ·chacha20Constants<>(SB), AA1 - VMOVDQA state1StoreAVX2, BB1 - VMOVDQA state2StoreAVX2, CC1 - VMOVDQA ctr3StoreAVX2, DD1 - VPADDD ·avx2IncMask<>(SB), DD1, DD1 - VMOVDQA DD1, DD0 - - XORQ itr2, itr2 - MOVQ inl, itr1 - ANDQ $-16, itr1 - TESTQ itr1, itr1 - JE openAVX2Tail128LoopB + VMOVDQA ·chacha20Constants<>+0(SB), Y5 + VMOVDQA 32(BP), Y9 + VMOVDQA 64(BP), Y13 + VMOVDQA 192(BP), Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y1 + 
VMOVDQA Y1, Y4 + XORQ R9, R9 + MOVQ BX, CX + ANDQ $-16, CX + TESTQ CX, CX + JE openAVX2Tail128LoopB openAVX2Tail128LoopA: - // Perform ChaCha rounds, while hashing the remaining input - polyAdd(0(inp)(itr2*1)) - polyMulAVX2 + ADDQ (SI)(R9*1), R10 + ADCQ 8(SI)(R9*1), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 openAVX2Tail128LoopB: - ADDQ $16, itr2 - chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD1, DD1, DD1 - chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD1, DD1, DD1 - CMPQ itr2, itr1 - JB openAVX2Tail128LoopA - CMPQ itr2, $160 - JNE openAVX2Tail128LoopB - - VPADDD ·chacha20Constants<>(SB), AA1, AA1 - VPADDD state1StoreAVX2, BB1, BB1 - VPADDD state2StoreAVX2, CC1, CC1 - VPADDD DD0, DD1, DD1 - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 + ADDQ $0x10, R9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y1, Y1, Y1 + CMPQ R9, CX + JB openAVX2Tail128LoopA + CMPQ R9, $0xa0 + JNE openAVX2Tail128LoopB + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD 32(BP), Y9, Y9 + VPADDD 64(BP), Y13, Y13 + VPADDD Y4, Y1, Y1 + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 openAVX2TailLoop: - CMPQ inl, $32 + CMPQ BX, $0x20 JB openAVX2Tail - SUBQ $32, inl + SUBQ $0x20, BX // Load for decryption - VPXOR (inp), AA0, AA0 - VMOVDQU AA0, (oup) - LEAQ (1*32)(inp), inp - LEAQ (1*32)(oup), oup - VMOVDQA BB0, AA0 - VMOVDQA CC0, BB0 - VMOVDQA DD0, CC0 + VPXOR (SI), Y0, Y0 + VMOVDQU Y0, (DI) + LEAQ 32(SI), SI + LEAQ 32(DI), DI + VMOVDQA Y14, Y0 + VMOVDQA Y12, Y14 + VMOVDQA Y4, Y12 JMP openAVX2TailLoop openAVX2Tail: - CMPQ inl, $16 - VMOVDQA A0, A1 + CMPQ BX, $0x10 + VMOVDQA X0, X1 JB openAVX2TailDone - SUBQ $16, inl + SUBQ $0x10, BX // Load for decryption - VPXOR (inp), A0, T0 - VMOVDQU T0, (oup) - LEAQ (1*16)(inp), inp - LEAQ (1*16)(oup), oup - VPERM2I128 $0x11, AA0, AA0, AA0 - VMOVDQA A0, A1 + VPXOR (SI), X0, X12 + VMOVDQU X12, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI + VPERM2I128 $0x11, Y0, Y0, Y0 + VMOVDQA X0, 
X1 openAVX2TailDone: VZEROUPPER JMP openSSETail16 -// ---------------------------------------------------------------------------- -// Special optimization for the last 256 bytes of ciphertext openAVX2Tail256: - // Need to decrypt up to 256 bytes - prepare four blocks - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA DD0, TT1 - VMOVDQA DD1, TT2 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y4, Y7 + VMOVDQA Y1, Y11 // Compute the number of iterations that will hash data - MOVQ inl, tmpStoreAVX2 - MOVQ inl, itr1 - SUBQ $128, itr1 - SHRQ $4, itr1 - MOVQ $10, itr2 - CMPQ itr1, $10 - CMOVQGT itr2, itr1 - MOVQ inp, inl - XORQ itr2, itr2 + MOVQ BX, 224(BP) + MOVQ BX, CX + SUBQ $0x80, CX + SHRQ $0x04, CX + MOVQ $0x0000000a, R9 + CMPQ CX, $0x0a + CMOVQGT R9, CX + MOVQ SI, BX + XORQ R9, R9 openAVX2Tail256LoopA: - polyAdd(0(inl)) - polyMulAVX2 - LEAQ 16(inl), inl + ADDQ (BX), R10 + ADCQ 8(BX), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(BX), BX - // Perform ChaCha rounds, while hashing the remaining input openAVX2Tail256LoopB: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 - INCQ itr2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 - CMPQ itr2, itr1 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + INCQ R9 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD 
$0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + CMPQ R9, CX JB openAVX2Tail256LoopA + CMPQ R9, $0x0a + JNE openAVX2Tail256LoopB + MOVQ BX, R9 + SUBQ SI, BX + MOVQ BX, CX + MOVQ 224(BP), BX - CMPQ itr2, $10 - JNE openAVX2Tail256LoopB - - MOVQ inl, itr2 - SUBQ inp, inl - MOVQ inl, itr1 - MOVQ tmpStoreAVX2, inl - - // Hash the remainder of data (if any) openAVX2Tail256Hash: - ADDQ $16, itr1 - CMPQ itr1, inl - JGT openAVX2Tail256HashEnd - polyAdd (0(itr2)) - polyMulAVX2 - LEAQ 16(itr2), itr2 - JMP openAVX2Tail256Hash - -// Store 128 bytes safely, then go to store loop + ADDQ $0x10, CX + CMPQ CX, BX + JGT openAVX2Tail256HashEnd + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 + JMP openAVX2Tail256Hash + openAVX2Tail256HashEnd: - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1 - VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1 - VPERM2I128 $0x02, AA0, BB0, AA2; VPERM2I128 $0x02, CC0, DD0, BB2; VPERM2I128 $0x13, AA0, BB0, CC2; VPERM2I128 $0x13, CC0, DD0, DD2 - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - - VPXOR (0*32)(inp), AA2, AA2; VPXOR (1*32)(inp), BB2, BB2; VPXOR (2*32)(inp), CC2, CC2; VPXOR (3*32)(inp), DD2, DD2 - VMOVDQU AA2, (0*32)(oup); VMOVDQU BB2, (1*32)(oup); VMOVDQU CC2, (2*32)(oup); VMOVDQU DD2, (3*32)(oup) - LEAQ (4*32)(inp), inp - LEAQ (4*32)(oup), oup - SUBQ $4*32, inl - - JMP openAVX2TailLoop - -// ---------------------------------------------------------------------------- -// Special optimization for the last 384 bytes of ciphertext + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD Y7, Y4, Y4 + VPADDD Y11, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y6 + VPERM2I128 $0x02, Y12, Y4, Y10 + VPERM2I128 $0x13, Y0, Y14, Y8 + VPERM2I128 $0x13, Y12, Y4, Y2 + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR (SI), Y6, Y6 + VPXOR 32(SI), Y10, Y10 + 
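The long scalar runs above — ADDQ/ADCQ folding in 16 ciphertext bytes, then MULXQ/IMULQ and the ANDQ $-4 / SHRQ $0x02 pair — are the old polyAdd/polyMulAVX2 macros expanded inline by the code generator. A rough Go sketch of the arithmetic they implement follows; the variable names are ours (h0/h1/h2 live in R10/R11/R12, the clamped r0/r1 at 0(BP)/8(BP)), and it illustrates the math rather than reproducing code from the patch:

package main

import (
	"fmt"
	"math/bits"
)

// poly1305Update absorbs one 16-byte block (m0, m1) into the
// accumulator h = h2*2^128 + h1*2^64 + h0 and multiplies by the
// clamped key r = r1*2^64 + r0, reducing modulo 2^130 - 5.
func poly1305Update(h0, h1, h2, r0, r1, m0, m1 uint64) (uint64, uint64, uint64) {
	// h += m, plus the 2^128 pad bit (the ADCQ $0x01, R12 above).
	var c uint64
	h0, c = bits.Add64(h0, m0, 0)
	h1, c = bits.Add64(h1, m1, c)
	h2 += c + 1
	// Schoolbook multiply h*r (the MULXQ/IMULQ block). Clamping of r
	// keeps h2*r0 and h2*r1 small enough to fit in 64 bits each.
	h0r0hi, h0r0lo := bits.Mul64(h0, r0)
	h1r0hi, h1r0lo := bits.Mul64(h1, r0)
	h0r1hi, h0r1lo := bits.Mul64(h0, r1)
	h1r1hi, h1r1lo := bits.Mul64(h1, r1)
	h2r0 := h2 * r0
	h2r1 := h2 * r1
	t0 := h0r0lo
	t1, c := bits.Add64(h0r0hi, h1r0lo, 0)
	t2, c2 := bits.Add64(h1r0hi, h1r1lo, c)
	t3 := h1r1hi + c2
	t1, c = bits.Add64(t1, h0r1lo, 0)
	t2, c = bits.Add64(t2, h0r1hi, c)
	t3 += c
	t2, c = bits.Add64(t2, h2r0, 0)
	t3 += h2r1 + c
	// Reduce modulo 2^130 - 5: everything above bit 130 re-enters at
	// five times its value, folded back in as 4*cc + cc — exactly the
	// ANDQ $-4 / SHRQ $0x02, R8, R15 sequence in the assembly.
	h0, h1, h2 = t0, t1, t2&3
	cclo, cchi := t2&^uint64(3), t3
	h0, c = bits.Add64(h0, cclo, 0)
	h1, c = bits.Add64(h1, cchi, c)
	h2 += c
	cclo = cclo>>2 | cchi<<62
	cchi >>= 2
	h0, c = bits.Add64(h0, cclo, 0)
	h1, c = bits.Add64(h1, cchi, c)
	h2 += c
	return h0, h1, h2
}

func main() {
	// One zero block under a maximally clamped r, purely to show usage.
	h0, h1, h2 := poly1305Update(0, 0, 0, 0x0ffffffc0fffffff, 0x0ffffffc0ffffffc, 0, 0)
	fmt.Printf("h2:h1:h0 = %#x:%#x:%#x\n", h2, h1, h0)
}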
VPXOR 64(SI), Y8, Y8 + VPXOR 96(SI), Y2, Y2 + VMOVDQU Y6, (DI) + VMOVDQU Y10, 32(DI) + VMOVDQU Y8, 64(DI) + VMOVDQU Y2, 96(DI) + LEAQ 128(SI), SI + LEAQ 128(DI), DI + SUBQ $0x80, BX + JMP openAVX2TailLoop + openAVX2Tail384: // Need to decrypt up to 384 bytes - prepare six blocks - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VPADDD ·avx2IncMask<>(SB), DD1, DD2 - VMOVDQA DD0, ctr0StoreAVX2 - VMOVDQA DD1, ctr1StoreAVX2 - VMOVDQA DD2, ctr2StoreAVX2 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) // Compute the number of iterations that will hash two blocks of data - MOVQ inl, tmpStoreAVX2 - MOVQ inl, itr1 - SUBQ $256, itr1 - SHRQ $4, itr1 - ADDQ $6, itr1 - MOVQ $10, itr2 - CMPQ itr1, $10 - CMOVQGT itr2, itr1 - MOVQ inp, inl - XORQ itr2, itr2 - - // Perform ChaCha rounds, while hashing the remaining input + MOVQ BX, 224(BP) + MOVQ BX, CX + SUBQ $0x00000100, CX + SHRQ $0x04, CX + ADDQ $0x06, CX + MOVQ $0x0000000a, R9 + CMPQ CX, $0x0a + CMOVQGT R9, CX + MOVQ SI, BX + XORQ R9, R9 + openAVX2Tail384LoopB: - polyAdd(0(inl)) - polyMulAVX2 - LEAQ 16(inl), inl + ADDQ (BX), R10 + ADCQ 8(BX), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(BX), BX openAVX2Tail384LoopA: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 - polyAdd(0(inl)) - polyMulAVX2 - LEAQ 16(inl), inl - INCQ itr2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 - - CMPQ itr2, itr1 - JB openAVX2Tail384LoopB - - CMPQ itr2, $10 - JNE openAVX2Tail384LoopA - - MOVQ inl, itr2 - SUBQ inp, inl - MOVQ inl, itr1 - MOVQ tmpStoreAVX2, inl + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + 
VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + ADDQ (BX), R10 + ADCQ 8(BX), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(BX), BX + INCQ R9 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + CMPQ R9, CX + JB openAVX2Tail384LoopB + CMPQ R9, $0x0a + JNE openAVX2Tail384LoopA + MOVQ BX, R9 + SUBQ SI, BX + MOVQ BX, CX + MOVQ 224(BP), BX openAVX2Tail384Hash: - ADDQ $16, itr1 - CMPQ itr1, inl - JGT openAVX2Tail384HashEnd - polyAdd(0(itr2)) - polyMulAVX2 - LEAQ 16(itr2), itr2 - JMP openAVX2Tail384Hash - -// Store 256 bytes safely, then go to store loop + ADDQ $0x10, CX + CMPQ CX, BX + JGT openAVX2Tail384HashEnd + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, 
R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 + JMP openAVX2Tail384Hash + openAVX2Tail384HashEnd: - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2 - VPERM2I128 $0x02, AA0, BB0, TT0; VPERM2I128 $0x02, CC0, DD0, TT1; VPERM2I128 $0x13, AA0, BB0, TT2; VPERM2I128 $0x13, CC0, DD0, TT3 - VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 - VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, TT0; VPERM2I128 $0x02, CC1, DD1, TT1; VPERM2I128 $0x13, AA1, BB1, TT2; VPERM2I128 $0x13, CC1, DD1, TT3 - VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3 - VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup) - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - LEAQ (8*32)(inp), inp - LEAQ (8*32)(oup), oup - SUBQ $8*32, inl + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPERM2I128 $0x02, Y12, Y4, Y7 + VPERM2I128 $0x13, Y0, Y14, Y11 + VPERM2I128 $0x13, Y12, Y4, Y15 + VPXOR (SI), Y3, Y3 + VPXOR 32(SI), Y7, Y7 + VPXOR 64(SI), Y11, Y11 + VPXOR 96(SI), Y15, Y15 + VMOVDQU Y3, (DI) + VMOVDQU Y7, 32(DI) + VMOVDQU Y11, 64(DI) + VMOVDQU Y15, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y3 + VPERM2I128 $0x02, Y13, Y1, Y7 + VPERM2I128 $0x13, Y5, Y9, Y11 + VPERM2I128 $0x13, Y13, Y1, Y15 + VPXOR 128(SI), Y3, Y3 + VPXOR 160(SI), Y7, Y7 + VPXOR 192(SI), Y11, Y11 + VPXOR 224(SI), Y15, Y15 + VMOVDQU Y3, 128(DI) + VMOVDQU Y7, 160(DI) + VMOVDQU Y11, 192(DI) + VMOVDQU Y15, 224(DI) + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + LEAQ 256(SI), SI + LEAQ 256(DI), DI + SUBQ $0x00000100, BX JMP openAVX2TailLoop -// ---------------------------------------------------------------------------- -// Special optimization for the last 512 bytes of ciphertext openAVX2Tail512: - VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, 
CC3 - VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 - XORQ itr1, itr1 - MOVQ inp, itr2 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + XORQ CX, CX + MOVQ SI, R9 openAVX2Tail512LoopB: - polyAdd(0(itr2)) - polyMulAVX2 - LEAQ (2*8)(itr2), itr2 + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 openAVX2Tail512LoopA: - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyAdd(0*8(itr2)) - polyMulAVX2 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; 
VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyAdd(2*8(itr2)) - polyMulAVX2 - LEAQ (4*8)(itr2), itr2 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 - INCQ itr1 - CMPQ itr1, $4 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + 
VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 16(R9), R10 + ADCQ 24(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(R9), R9 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + INCQ CX + CMPQ CX, $0x04 JLT openAVX2Tail512LoopB 
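Each VPADDD/VPXOR/VPSHUFB/VPSLLD/VPSRLD cluster above is one ChaCha quarter-round applied to a whole row of 32-bit lanes at once: the 16- and 8-bit rotations are byte-aligned, so a single VPSHUFB through the rol16/rol8 tables (defined below) suffices, while the 12- and 7-bit rotations need a shift pair plus VPXOR. For orientation, a scalar Go sketch of the standard quarter-round (not code from this patch):

package main

import (
	"fmt"
	"math/bits"
)

// quarterRound is the scalar form of the vectorized groups above.
func quarterRound(a, b, c, d uint32) (uint32, uint32, uint32, uint32) {
	a += b
	d ^= a
	d = bits.RotateLeft32(d, 16) // VPSHUFB ·rol16<>(SB)
	c += d
	b ^= c
	b = bits.RotateLeft32(b, 12) // VPSLLD $0x0c / VPSRLD $0x14 / VPXOR
	a += b
	d ^= a
	d = bits.RotateLeft32(d, 8) // VPSHUFB ·rol8<>(SB)
	c += d
	b ^= c
	b = bits.RotateLeft32(b, 7) // VPSLLD $0x07 / VPSRLD $0x19 / VPXOR
	return a, b, c, d
}

func main() {
	// Test vector from RFC 8439, section 2.1.1.
	a, b, c, d := quarterRound(0x11111111, 0x01020304, 0x9b8d6f43, 0x01234567)
	fmt.Printf("%08x %08x %08x %08x\n", a, b, c, d)
	// Expected: ea2a92f4 cb1cf8ce 4581472e 5881c4bb
}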
- - CMPQ itr1, $10 - JNE openAVX2Tail512LoopA - - MOVQ inl, itr1 - SUBQ $384, itr1 - ANDQ $-16, itr1 + CMPQ CX, $0x0a + JNE openAVX2Tail512LoopA + MOVQ BX, CX + SUBQ $0x00000180, CX + ANDQ $-16, CX openAVX2Tail512HashLoop: - TESTQ itr1, itr1 + TESTQ CX, CX JE openAVX2Tail512HashEnd - polyAdd(0(itr2)) - polyMulAVX2 - LEAQ 16(itr2), itr2 - SUBQ $16, itr1 + ADDQ (R9), R10 + ADCQ 8(R9), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(R9), R9 + SUBQ $0x10, CX JMP openAVX2Tail512HashLoop openAVX2Tail512HashEnd: - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - VMOVDQA CC3, tmpStoreAVX2 - VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 - VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 - VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 - VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 - VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) - VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 - - LEAQ (12*32)(inp), inp - LEAQ (12*32)(oup), oup - SUBQ $12*32, inl - - JMP openAVX2TailLoop - -// ---------------------------------------------------------------------------- -// ---------------------------------------------------------------------------- -// func chacha20Poly1305Seal(dst, key, src, ad []byte) -TEXT ·chacha20Poly1305Seal(SB), 0, $288-96 - // For aligned stack access + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), 
Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VMOVDQA Y15, 224(BP) + VPERM2I128 $0x02, Y0, Y14, Y15 + VPERM2I128 $0x13, Y0, Y14, Y14 + VPERM2I128 $0x02, Y12, Y4, Y0 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPXOR (SI), Y15, Y15 + VPXOR 32(SI), Y0, Y0 + VPXOR 64(SI), Y14, Y14 + VPXOR 96(SI), Y12, Y12 + VMOVDQU Y15, (DI) + VMOVDQU Y0, 32(DI) + VMOVDQU Y14, 64(DI) + VMOVDQU Y12, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 128(SI), Y0, Y0 + VPXOR 160(SI), Y14, Y14 + VPXOR 192(SI), Y12, Y12 + VPXOR 224(SI), Y4, Y4 + VMOVDQU Y0, 128(DI) + VMOVDQU Y14, 160(DI) + VMOVDQU Y12, 192(DI) + VMOVDQU Y4, 224(DI) + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 256(SI), Y0, Y0 + VPXOR 288(SI), Y14, Y14 + VPXOR 320(SI), Y12, Y12 + VPXOR 352(SI), Y4, Y4 + VMOVDQU Y0, 256(DI) + VMOVDQU Y14, 288(DI) + VMOVDQU Y12, 320(DI) + VMOVDQU Y4, 352(DI) + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, 224(BP), Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, 224(BP), Y3, Y4 + LEAQ 384(SI), SI + LEAQ 384(DI), DI + SUBQ $0x00000180, BX + JMP openAVX2TailLoop + +DATA ·chacha20Constants<>+0(SB)/4, $0x61707865 +DATA ·chacha20Constants<>+4(SB)/4, $0x3320646e +DATA ·chacha20Constants<>+8(SB)/4, $0x79622d32 +DATA ·chacha20Constants<>+12(SB)/4, $0x6b206574 +DATA ·chacha20Constants<>+16(SB)/4, $0x61707865 +DATA ·chacha20Constants<>+20(SB)/4, $0x3320646e +DATA ·chacha20Constants<>+24(SB)/4, $0x79622d32 +DATA ·chacha20Constants<>+28(SB)/4, $0x6b206574 +GLOBL ·chacha20Constants<>(SB), RODATA|NOPTR, $32 + +DATA ·polyClampMask<>+0(SB)/8, $0x0ffffffc0fffffff +DATA ·polyClampMask<>+8(SB)/8, $0x0ffffffc0ffffffc +DATA ·polyClampMask<>+16(SB)/8, $0xffffffffffffffff +DATA ·polyClampMask<>+24(SB)/8, $0xffffffffffffffff +GLOBL ·polyClampMask<>(SB), RODATA|NOPTR, $32 + +DATA ·sseIncMask<>+0(SB)/8, $0x0000000000000001 +DATA ·sseIncMask<>+8(SB)/8, $0x0000000000000000 +GLOBL ·sseIncMask<>(SB), RODATA|NOPTR, $16 + +DATA ·andMask<>+0(SB)/8, $0x00000000000000ff +DATA ·andMask<>+8(SB)/8, $0x0000000000000000 +DATA ·andMask<>+16(SB)/8, $0x000000000000ffff +DATA ·andMask<>+24(SB)/8, $0x0000000000000000 +DATA ·andMask<>+32(SB)/8, $0x0000000000ffffff +DATA ·andMask<>+40(SB)/8, $0x0000000000000000 +DATA ·andMask<>+48(SB)/8, $0x00000000ffffffff +DATA ·andMask<>+56(SB)/8, $0x0000000000000000 +DATA ·andMask<>+64(SB)/8, $0x000000ffffffffff +DATA ·andMask<>+72(SB)/8, $0x0000000000000000 +DATA ·andMask<>+80(SB)/8, $0x0000ffffffffffff +DATA ·andMask<>+88(SB)/8, $0x0000000000000000 +DATA ·andMask<>+96(SB)/8, $0x00ffffffffffffff +DATA ·andMask<>+104(SB)/8, $0x0000000000000000 +DATA ·andMask<>+112(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+120(SB)/8, $0x0000000000000000 +DATA ·andMask<>+128(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+136(SB)/8, $0x00000000000000ff +DATA ·andMask<>+144(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+152(SB)/8, $0x000000000000ffff +DATA ·andMask<>+160(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+168(SB)/8, $0x0000000000ffffff +DATA ·andMask<>+176(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+184(SB)/8, $0x00000000ffffffff +DATA ·andMask<>+192(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+200(SB)/8, $0x000000ffffffffff +DATA ·andMask<>+208(SB)/8, $0xffffffffffffffff +DATA ·andMask<>+216(SB)/8, $0x0000ffffffffffff +DATA ·andMask<>+224(SB)/8, 
$0xffffffffffffffff +DATA ·andMask<>+232(SB)/8, $0x00ffffffffffffff +GLOBL ·andMask<>(SB), RODATA|NOPTR, $240 + +DATA ·avx2InitMask<>+0(SB)/8, $0x0000000000000000 +DATA ·avx2InitMask<>+8(SB)/8, $0x0000000000000000 +DATA ·avx2InitMask<>+16(SB)/8, $0x0000000000000001 +DATA ·avx2InitMask<>+24(SB)/8, $0x0000000000000000 +GLOBL ·avx2InitMask<>(SB), RODATA|NOPTR, $32 + +DATA ·rol16<>+0(SB)/8, $0x0504070601000302 +DATA ·rol16<>+8(SB)/8, $0x0d0c0f0e09080b0a +DATA ·rol16<>+16(SB)/8, $0x0504070601000302 +DATA ·rol16<>+24(SB)/8, $0x0d0c0f0e09080b0a +GLOBL ·rol16<>(SB), RODATA|NOPTR, $32 + +DATA ·rol8<>+0(SB)/8, $0x0605040702010003 +DATA ·rol8<>+8(SB)/8, $0x0e0d0c0f0a09080b +DATA ·rol8<>+16(SB)/8, $0x0605040702010003 +DATA ·rol8<>+24(SB)/8, $0x0e0d0c0f0a09080b +GLOBL ·rol8<>(SB), RODATA|NOPTR, $32 + +DATA ·avx2IncMask<>+0(SB)/8, $0x0000000000000002 +DATA ·avx2IncMask<>+8(SB)/8, $0x0000000000000000 +DATA ·avx2IncMask<>+16(SB)/8, $0x0000000000000002 +DATA ·avx2IncMask<>+24(SB)/8, $0x0000000000000000 +GLOBL ·avx2IncMask<>(SB), RODATA|NOPTR, $32 + +// func chacha20Poly1305Seal(dst []byte, key []uint32, src []byte, ad []byte) +// Requires: AVX, AVX2, BMI2, CMOV, SSE2 +TEXT ·chacha20Poly1305Seal(SB), $288-96 MOVQ SP, BP - ADDQ $32, BP + ADDQ $0x20, BP ANDQ $-32, BP - MOVQ dst+0(FP), oup - MOVQ key+24(FP), keyp - MOVQ src+48(FP), inp - MOVQ src_len+56(FP), inl - MOVQ ad+72(FP), adp - - CMPB ·useAVX2(SB), $1 + MOVQ dst_base+0(FP), DI + MOVQ key_base+24(FP), R8 + MOVQ src_base+48(FP), SI + MOVQ src_len+56(FP), BX + MOVQ ad_base+72(FP), CX + CMPB ·useAVX2+0(SB), $0x01 JE chacha20Poly1305Seal_AVX2 // Special optimization, for very short buffers - CMPQ inl, $128 - JBE sealSSE128 // About 15% faster + CMPQ BX, $0x80 + JBE sealSSE128 // In the seal case - prepare the poly key + 3 blocks of stream in the first iteration - MOVOU ·chacha20Constants<>(SB), A0 - MOVOU (1*16)(keyp), B0 - MOVOU (2*16)(keyp), C0 - MOVOU (3*16)(keyp), D0 + MOVOU ·chacha20Constants<>+0(SB), X0 + MOVOU 16(R8), X3 + MOVOU 32(R8), X6 + MOVOU 48(R8), X9 // Store state on stack for future use - MOVO B0, state1Store - MOVO C0, state2Store + MOVO X3, 32(BP) + MOVO X6, 48(BP) // Load state, increment counter blocks - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X2, X12 + MOVO X5, X13 + MOVO X8, X14 + MOVO X11, X15 + PADDL ·sseIncMask<>+0(SB), X15 // Store counters - MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store - MOVQ $10, itr2 + MOVO X9, 80(BP) + MOVO X10, 96(BP) + MOVO X11, 112(BP) + MOVO X15, 128(BP) + MOVQ $0x0000000a, R9 sealSSEIntroLoop: - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left - shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left - shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left - - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - shiftB0Right; shiftB1Right; 
shiftB2Right; shiftB3Right - shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right - shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right - DECQ itr2 - JNE sealSSEIntroLoop + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x0c + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + 
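The raw BYTE runs above are hand-encoded PALIGNR instructions: 0x66 is the mandatory prefix, 0x0f 0x3a 0x0f the opcode, 0x45 a REX prefix selecting the high XMM registers, and the final byte the byte-shift immediate ($0x04/$0x08/$0x0c). The old source spelled these as the shiftB/shiftC/shiftD macros being removed here. They rotate the B, C and D rows of the 4x4 ChaCha state by one, two and three lanes so the column quarter-rounds act on diagonals, then rotate them back. A minimal Go sketch of the lane rotation (our own helper, for illustration only):

package main

import "fmt"

// rotLanes rotates the four 32-bit lanes of one state row left by n
// positions; PALIGNR $4/$8/$12 with equal source and destination does
// the same to an XMM register.
func rotLanes(row [4]uint32, n int) [4]uint32 {
	var out [4]uint32
	for i := range out {
		out[i] = row[(i+n)%4]
	}
	return out
}

func main() {
	b := [4]uint32{10, 11, 12, 13}
	fmt.Println(rotLanes(b, 1)) // [11 12 13 10]; rows B/C/D use n = 1, 2, 3
}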
PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x04 + DECQ R9 + JNE sealSSEIntroLoop // Add in the state - PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 - PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 - PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 - PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 + PADDD ·chacha20Constants<>+0(SB), X0 + PADDD ·chacha20Constants<>+0(SB), X1 + PADDD ·chacha20Constants<>+0(SB), X2 + PADDD ·chacha20Constants<>+0(SB), X12 + PADDD 32(BP), X3 + PADDD 32(BP), X4 + PADDD 32(BP), X5 + PADDD 32(BP), X13 + PADDD 48(BP), X7 + PADDD 48(BP), X8 + PADDD 48(BP), X14 + PADDD 96(BP), X10 + PADDD 112(BP), X11 + PADDD 128(BP), X15 // Clamp and store the key - PAND ·polyClampMask<>(SB), A0 - MOVO A0, rStore - MOVO B0, sStore + PAND ·polyClampMask<>+0(SB), X0 + MOVO X0, (BP) + MOVO X3, 16(BP) // Hash AAD - MOVQ ad_len+80(FP), itr2 - CALL polyHashADInternal<>(SB) - - MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 - PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 - MOVOU A1, (0*16)(oup); MOVOU B1, (1*16)(oup); MOVOU C1, (2*16)(oup); MOVOU D1, (3*16)(oup) - MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 - PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 - MOVOU A2, (4*16)(oup); MOVOU B2, (5*16)(oup); MOVOU C2, (6*16)(oup); MOVOU D2, (7*16)(oup) - - MOVQ $128, itr1 - SUBQ $128, inl - LEAQ 128(inp), inp - - MOVO A3, A1; MOVO B3, B1; MOVO C3, C1; MOVO D3, D1 - - CMPQ inl, $64 - JBE sealSSE128SealHash - - MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 - PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3 - MOVOU A3, (8*16)(oup); MOVOU B3, (9*16)(oup); MOVOU C3, (10*16)(oup); MOVOU D3, (11*16)(oup) - - ADDQ $64, itr1 - SUBQ $64, inl - LEAQ 64(inp), inp - - MOVQ $2, itr1 - MOVQ $8, itr2 - - CMPQ inl, $64 - JBE sealSSETail64 - CMPQ inl, $128 - JBE sealSSETail128 - CMPQ inl, $192 - JBE sealSSETail192 + MOVQ ad_len+80(FP), R9 + CALL polyHashADInternal<>(SB) + MOVOU (SI), X0 + MOVOU 16(SI), X3 + MOVOU 32(SI), X6 + MOVOU 48(SI), X9 + PXOR X0, X1 + PXOR X3, X4 + PXOR X6, X7 + PXOR X9, X10 + MOVOU X1, (DI) + MOVOU X4, 16(DI) + MOVOU X7, 32(DI) + MOVOU X10, 48(DI) + MOVOU 64(SI), X0 + MOVOU 80(SI), X3 + MOVOU 96(SI), 
X6 + MOVOU 112(SI), X9 + PXOR X0, X2 + PXOR X3, X5 + PXOR X6, X8 + PXOR X9, X11 + MOVOU X2, 64(DI) + MOVOU X5, 80(DI) + MOVOU X8, 96(DI) + MOVOU X11, 112(DI) + MOVQ $0x00000080, CX + SUBQ $0x80, BX + LEAQ 128(SI), SI + MOVO X12, X1 + MOVO X13, X4 + MOVO X14, X7 + MOVO X15, X10 + CMPQ BX, $0x40 + JBE sealSSE128SealHash + MOVOU (SI), X0 + MOVOU 16(SI), X3 + MOVOU 32(SI), X6 + MOVOU 48(SI), X9 + PXOR X0, X12 + PXOR X3, X13 + PXOR X6, X14 + PXOR X9, X15 + MOVOU X12, 128(DI) + MOVOU X13, 144(DI) + MOVOU X14, 160(DI) + MOVOU X15, 176(DI) + ADDQ $0x40, CX + SUBQ $0x40, BX + LEAQ 64(SI), SI + MOVQ $0x00000002, CX + MOVQ $0x00000008, R9 + CMPQ BX, $0x40 + JBE sealSSETail64 + CMPQ BX, $0x80 + JBE sealSSETail128 + CMPQ BX, $0xc0 + JBE sealSSETail192 sealSSEMainLoop: // Load state, increment counter blocks - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO A2, A3; MOVO B2, B3; MOVO C2, C3; MOVO D2, D3; PADDL ·sseIncMask<>(SB), D3 + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X2, X12 + MOVO X5, X13 + MOVO X8, X14 + MOVO X11, X15 + PADDL ·sseIncMask<>+0(SB), X15 // Store counters - MOVO D0, ctr0Store; MOVO D1, ctr1Store; MOVO D2, ctr2Store; MOVO D3, ctr3Store + MOVO X9, 80(BP) + MOVO X10, 96(BP) + MOVO X11, 112(BP) + MOVO X15, 128(BP) sealSSEInnerLoop: - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyAdd(0(oup)) - shiftB0Left; shiftB1Left; shiftB2Left; shiftB3Left - shiftC0Left; shiftC1Left; shiftC2Left; shiftC3Left - shiftD0Left; shiftD1Left; shiftD2Left; shiftD3Left - polyMulStage1 - polyMulStage2 - LEAQ (2*8)(oup), oup - MOVO C3, tmpStore - chachaQR(A0, B0, C0, D0, C3); chachaQR(A1, B1, C1, D1, C3); chachaQR(A2, B2, C2, D2, C3) - MOVO tmpStore, C3 - MOVO C1, tmpStore - polyMulStage3 - chachaQR(A3, B3, C3, D3, C1) - MOVO tmpStore, C1 - polyMulReduceStage - shiftB0Right; shiftB1Right; shiftB2Right; shiftB3Right - shiftC0Right; shiftC1Right; shiftC2Right; shiftC3Right - shiftD0Right; shiftD1Right; shiftD2Right; shiftD3Right - DECQ itr2 - JGE sealSSEInnerLoop - polyAdd(0(oup)) - polyMul - LEAQ (2*8)(oup), oup - DECQ itr1 - JG sealSSEInnerLoop + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + 
MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x0c + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + LEAQ 16(DI), DI + MOVO X14, 64(BP) + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X3 + PXOR X14, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X14) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X3 + PXOR X14, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X4 + PXOR X14, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X14) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X4 + PXOR X14, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x0c, X14 + PSRLL $0x14, X5 + PXOR X14, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X14) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X14 + PSLLL $0x07, X14 + PSRLL $0x19, X5 + PXOR X14, X5 + MOVO 64(BP), X14 + MOVO X7, 64(BP) + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + PADDD X13, X12 + PXOR X12, X15 + ROL16(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x0c, X7 + PSRLL $0x14, X13 + PXOR X7, X13 + PADDD X13, X12 + PXOR X12, X15 + ROL8(X15, X7) + PADDD X15, X14 + PXOR X14, X13 + MOVO X13, X7 + PSLLL $0x07, X7 + PSRLL $0x19, X13 + PXOR X7, X13 + MOVO 64(BP), X7 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + 
BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x04 + DECQ R9 + JGE sealSSEInnerLoop + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + DECQ CX + JG sealSSEInnerLoop // Add in the state - PADDD ·chacha20Constants<>(SB), A0; PADDD ·chacha20Constants<>(SB), A1; PADDD ·chacha20Constants<>(SB), A2; PADDD ·chacha20Constants<>(SB), A3 - PADDD state1Store, B0; PADDD state1Store, B1; PADDD state1Store, B2; PADDD state1Store, B3 - PADDD state2Store, C0; PADDD state2Store, C1; PADDD state2Store, C2; PADDD state2Store, C3 - PADDD ctr0Store, D0; PADDD ctr1Store, D1; PADDD ctr2Store, D2; PADDD ctr3Store, D3 - MOVO D3, tmpStore + PADDD ·chacha20Constants<>+0(SB), X0 + PADDD ·chacha20Constants<>+0(SB), X1 + PADDD ·chacha20Constants<>+0(SB), X2 + PADDD ·chacha20Constants<>+0(SB), X12 + PADDD 32(BP), X3 + PADDD 32(BP), X4 + PADDD 32(BP), X5 + PADDD 32(BP), X13 + PADDD 48(BP), X6 + PADDD 48(BP), X7 + PADDD 48(BP), X8 + PADDD 48(BP), X14 + PADDD 80(BP), X9 + PADDD 96(BP), X10 + PADDD 112(BP), X11 + PADDD 128(BP), X15 + MOVO X15, 64(BP) // Load - xor - store - MOVOU (0*16)(inp), D3; PXOR D3, A0 - MOVOU (1*16)(inp), D3; PXOR D3, B0 - MOVOU (2*16)(inp), D3; PXOR D3, C0 - MOVOU (3*16)(inp), D3; PXOR D3, D0 - MOVOU A0, (0*16)(oup) - MOVOU B0, (1*16)(oup) - MOVOU C0, (2*16)(oup) - MOVOU D0, (3*16)(oup) - MOVO tmpStore, D3 - - MOVOU (4*16)(inp), A0; MOVOU (5*16)(inp), B0; MOVOU (6*16)(inp), C0; MOVOU (7*16)(inp), D0 - PXOR A0, A1; PXOR B0, B1; PXOR C0, C1; PXOR D0, D1 - MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) - MOVOU (8*16)(inp), A0; MOVOU (9*16)(inp), B0; MOVOU (10*16)(inp), C0; MOVOU (11*16)(inp), D0 - PXOR A0, A2; PXOR B0, B2; PXOR C0, C2; PXOR D0, D2 - MOVOU A2, (8*16)(oup); MOVOU B2, (9*16)(oup); MOVOU C2, (10*16)(oup); MOVOU D2, (11*16)(oup) - ADDQ $192, inp - MOVQ $192, itr1 - SUBQ $192, inl - MOVO A3, A1 - MOVO B3, B1 - MOVO C3, C1 - MOVO D3, D1 - CMPQ inl, $64 + MOVOU (SI), X15 + PXOR X15, X0 + MOVOU 16(SI), X15 + PXOR X15, X3 + MOVOU 32(SI), X15 + PXOR X15, X6 + MOVOU 48(SI), X15 + PXOR X15, X9 + MOVOU X0, (DI) + MOVOU X3, 16(DI) + MOVOU X6, 32(DI) + MOVOU X9, 48(DI) + MOVO 64(BP), X15 + MOVOU 64(SI), 
X0 + MOVOU 80(SI), X3 + MOVOU 96(SI), X6 + MOVOU 112(SI), X9 + PXOR X0, X1 + PXOR X3, X4 + PXOR X6, X7 + PXOR X9, X10 + MOVOU X1, 64(DI) + MOVOU X4, 80(DI) + MOVOU X7, 96(DI) + MOVOU X10, 112(DI) + MOVOU 128(SI), X0 + MOVOU 144(SI), X3 + MOVOU 160(SI), X6 + MOVOU 176(SI), X9 + PXOR X0, X2 + PXOR X3, X5 + PXOR X6, X8 + PXOR X9, X11 + MOVOU X2, 128(DI) + MOVOU X5, 144(DI) + MOVOU X8, 160(DI) + MOVOU X11, 176(DI) + ADDQ $0xc0, SI + MOVQ $0x000000c0, CX + SUBQ $0xc0, BX + MOVO X12, X1 + MOVO X13, X4 + MOVO X14, X7 + MOVO X15, X10 + CMPQ BX, $0x40 JBE sealSSE128SealHash - MOVOU (0*16)(inp), A0; MOVOU (1*16)(inp), B0; MOVOU (2*16)(inp), C0; MOVOU (3*16)(inp), D0 - PXOR A0, A3; PXOR B0, B3; PXOR C0, C3; PXOR D0, D3 - MOVOU A3, (12*16)(oup); MOVOU B3, (13*16)(oup); MOVOU C3, (14*16)(oup); MOVOU D3, (15*16)(oup) - LEAQ 64(inp), inp - SUBQ $64, inl - MOVQ $6, itr1 - MOVQ $4, itr2 - CMPQ inl, $192 + MOVOU (SI), X0 + MOVOU 16(SI), X3 + MOVOU 32(SI), X6 + MOVOU 48(SI), X9 + PXOR X0, X12 + PXOR X3, X13 + PXOR X6, X14 + PXOR X9, X15 + MOVOU X12, 192(DI) + MOVOU X13, 208(DI) + MOVOU X14, 224(DI) + MOVOU X15, 240(DI) + LEAQ 64(SI), SI + SUBQ $0x40, BX + MOVQ $0x00000006, CX + MOVQ $0x00000004, R9 + CMPQ BX, $0xc0 JG sealSSEMainLoop - - MOVQ inl, itr1 - TESTQ inl, inl + MOVQ BX, CX + TESTQ BX, BX JE sealSSE128SealHash - MOVQ $6, itr1 - CMPQ inl, $64 + MOVQ $0x00000006, CX + CMPQ BX, $0x40 JBE sealSSETail64 - CMPQ inl, $128 + CMPQ BX, $0x80 JBE sealSSETail128 JMP sealSSETail192 -// ---------------------------------------------------------------------------- -// Special optimization for the last 64 bytes of plaintext sealSSETail64: - // Need to encrypt up to 64 bytes - prepare single block, hash 192 or 256 bytes - MOVO ·chacha20Constants<>(SB), A1 - MOVO state1Store, B1 - MOVO state2Store, C1 - MOVO ctr3Store, D1 - PADDL ·sseIncMask<>(SB), D1 - MOVO D1, ctr0Store + MOVO ·chacha20Constants<>+0(SB), X1 + MOVO 32(BP), X4 + MOVO 48(BP), X7 + MOVO 128(BP), X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 80(BP) sealSSETail64LoopA: - // Perform ChaCha rounds, while hashing the previously encrypted ciphertext - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealSSETail64LoopB: - chachaQR(A1, B1, C1, D1, T1) - shiftB1Left; shiftC1Left; shiftD1Left - chachaQR(A1, B1, C1, D1, T1) - shiftB1Right; shiftC1Right; shiftD1Right - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup - - DECQ itr1 - JG sealSSETail64LoopA - - DECQ itr2 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X13) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X13 + PSLLL $0x0c, X13 + PSRLL $0x14, X4 + PXOR X13, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X13) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X13 + PSLLL $0x07, X13 + PSRLL $0x19, X4 + PXOR X13, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 
+ BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X13) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X13 + PSLLL $0x0c, X13 + PSRLL $0x14, X4 + PXOR X13, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X13) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X13 + PSLLL $0x07, X13 + PSRLL $0x19, X4 + PXOR X13, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + DECQ CX + JG sealSSETail64LoopA + DECQ R9 JGE sealSSETail64LoopB - PADDL ·chacha20Constants<>(SB), A1 - PADDL state1Store, B1 - PADDL state2Store, C1 - PADDL ctr0Store, D1 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL 32(BP), X4 + PADDL 48(BP), X7 + PADDL 80(BP), X10 + JMP sealSSE128Seal - JMP sealSSE128Seal - -// ---------------------------------------------------------------------------- -// Special optimization for the last 128 bytes of plaintext sealSSETail128: - // Need to encrypt up to 128 bytes - prepare two blocks, hash 192 or 256 bytes - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 80(BP) + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 96(BP) sealSSETail128LoopA: - // Perform ChaCha rounds, while hashing the previously encrypted ciphertext - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealSSETail128LoopB: - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) - shiftB0Left; shiftC0Left; shiftD0Left - shiftB1Left; shiftC1Left; shiftD1Left - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0) - shiftB0Right; shiftC0Right; shiftD0Right - shiftB1Right; shiftC1Right; shiftD1Right - - DECQ itr1 - JG sealSSETail128LoopA - - DECQ itr2 
- JGE sealSSETail128LoopB - - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1 - PADDL state1Store, B0; PADDL state1Store, B1 - PADDL state2Store, C0; PADDL state2Store, C1 - PADDL ctr0Store, D0; PADDL ctr1Store, D1 - - MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 - PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0 - MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup) - - MOVQ $64, itr1 - LEAQ 64(inp), inp - SUBQ $64, inl - - JMP sealSSE128SealHash - -// ---------------------------------------------------------------------------- -// Special optimization for the last 192 bytes of plaintext + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + DECQ CX + JG sealSSETail128LoopA + DECQ R9 + JGE 
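
Each ADDQ/ADCQ/ADCQ $0x01 triple followed by a MULQ chain and the ANDQ/SHRQ $0x02 tail is one inlined Poly1305 block step (the old polyAdd/polyMul macros): absorb 16 bytes of ciphertext plus the pad bit, multiply the 130-bit accumulator by the clamped key r, and reduce modulo 2^130-5, folding the overflow back in as carry*5 = carry*4 + carry. A Go sketch under those register roles (R10..R12 = h, 0(BP)/8(BP) = r; names are illustrative, not identifiers from this file):

	package polysketch

	import "math/bits"

	// polyStep computes h = (h + m + 2^128) * r mod 2^130 - 5, with the
	// accumulator h in three 64-bit limbs (h[2] holds only a few bits)
	// and the clamped key r in two limbs, mirroring the scalar sequence.
	func polyStep(h [3]uint64, m [2]uint64, r [2]uint64) [3]uint64 {
		// polyAdd: h += message block, plus the implicit high pad bit.
		var c uint64
		h[0], c = bits.Add64(h[0], m[0], 0)
		h[1], c = bits.Add64(h[1], m[1], c)
		h[2] += c + 1

		// polyMul: schoolbook multiply into limbs t0..t3 (the MULQ chain).
		t1, t0 := bits.Mul64(r[0], h[0])
		h1r0Hi, h1r0Lo := bits.Mul64(r[0], h[1])
		t1, c = bits.Add64(t1, h1r0Lo, 0)
		t2, _ := bits.Add64(h1r0Hi, r[0]*h[2], c) // h[2] is tiny, so r0*h2 fits one limb
		h0r1Hi, h0r1Lo := bits.Mul64(r[1], h[0])
		t1, c = bits.Add64(t1, h0r1Lo, 0)
		mid := h0r1Hi + c
		h1r1Hi, h1r1Lo := bits.Mul64(r[1], h[1])
		t2, c = bits.Add64(t2, h1r1Lo, 0)
		t3 := r[1]*h[2] + h1r1Hi + c
		t2, c = bits.Add64(t2, mid, 0)
		t3 += c

		// Reduce: keep the low 130 bits, then add the carry and carry>>2
		// (the ANDQ $-4 / SHRQ $0x02 pair), i.e. carry times 5.
		h[0], h[1], h[2] = t0, t1, t2&3
		cLo, cHi := t2&^uint64(3), t3
		h[0], c = bits.Add64(h[0], cLo, 0)
		h[1], c = bits.Add64(h[1], cHi, c)
		h[2] += c
		h[0], c = bits.Add64(h[0], cLo>>2|cHi<<62, 0)
		h[1], c = bits.Add64(h[1], cHi>>2, c)
		h[2] += c
		return h
	}
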
sealSSETail128LoopB + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL 32(BP), X3 + PADDL 32(BP), X4 + PADDL 48(BP), X6 + PADDL 48(BP), X7 + PADDL 80(BP), X9 + PADDL 96(BP), X10 + MOVOU (SI), X12 + MOVOU 16(SI), X13 + MOVOU 32(SI), X14 + MOVOU 48(SI), X15 + PXOR X12, X0 + PXOR X13, X3 + PXOR X14, X6 + PXOR X15, X9 + MOVOU X0, (DI) + MOVOU X3, 16(DI) + MOVOU X6, 32(DI) + MOVOU X9, 48(DI) + MOVQ $0x00000040, CX + LEAQ 64(SI), SI + SUBQ $0x40, BX + JMP sealSSE128SealHash + sealSSETail192: - // Need to encrypt up to 192 bytes - prepare three blocks, hash 192 or 256 bytes - MOVO ·chacha20Constants<>(SB), A0; MOVO state1Store, B0; MOVO state2Store, C0; MOVO ctr3Store, D0; PADDL ·sseIncMask<>(SB), D0; MOVO D0, ctr0Store - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1; MOVO D1, ctr1Store - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2; MOVO D2, ctr2Store + MOVO ·chacha20Constants<>+0(SB), X0 + MOVO 32(BP), X3 + MOVO 48(BP), X6 + MOVO 128(BP), X9 + PADDL ·sseIncMask<>+0(SB), X9 + MOVO X9, 80(BP) + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X10, 96(BP) + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X11, 112(BP) sealSSETail192LoopA: - // Perform ChaCha rounds, while hashing the previously encrypted ciphertext - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealSSETail192LoopB: - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Left; shiftC0Left; shiftD0Left - shiftB1Left; shiftC1Left; shiftD1Left - shiftB2Left; shiftC2Left; shiftD2Left - - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup - - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Right; shiftC0Right; shiftD0Right - shiftB1Right; shiftC1Right; shiftD1Right - shiftB2Right; shiftC2Right; shiftD2Right - - DECQ itr1 - JG sealSSETail192LoopA - - DECQ itr2 - JGE sealSSETail192LoopB - - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 - PADDL state1Store, B0; PADDL state1Store, B1; PADDL state1Store, B2 - PADDL state2Store, C0; PADDL state2Store, C1; PADDL state2Store, C2 - PADDL ctr0Store, D0; PADDL ctr1Store, D1; PADDL ctr2Store, D2 - - MOVOU (0*16)(inp), T0; MOVOU (1*16)(inp), T1; MOVOU (2*16)(inp), T2; MOVOU (3*16)(inp), T3 - PXOR T0, A0; PXOR T1, B0; PXOR T2, C0; PXOR T3, D0 - MOVOU A0, (0*16)(oup); MOVOU B0, (1*16)(oup); MOVOU C0, (2*16)(oup); MOVOU D0, (3*16)(oup) - MOVOU (4*16)(inp), T0; MOVOU (5*16)(inp), T1; MOVOU (6*16)(inp), T2; MOVOU (7*16)(inp), T3 - PXOR T0, A1; PXOR T1, B1; PXOR T2, C1; PXOR T3, D1 - MOVOU A1, (4*16)(oup); MOVOU B1, (5*16)(oup); MOVOU C1, (6*16)(oup); MOVOU D1, (7*16)(oup) - - MOVO A2, A1 - MOVO B2, B1 - MOVO C2, 
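
The raw BYTE $0x66, $0x0f, $0x3a, $0x0f, ... runs are hand-encoded PALIGNR instructions (opcode 66 0F 3A 0F, final byte the lane immediate $0x04/$0x08/$0x0c), apparently kept in raw form by the generator; they rotate the B, C and D rows by one, two and three 32-bit lanes so the quarter-round alternates between column and diagonal orientation. Roughly, per 4-word row (a sketch, not tied to any identifier here):

	package chachasketch

	// rotateRow rotates a 4-lane row left by n 32-bit words, the effect
	// of PALIGNR $4n applied to a row register against itself.
	func rotateRow(row [4]uint32, n int) [4]uint32 {
		var out [4]uint32
		for i := range out {
			out[i] = row[(i+n)%4]
		}
		return out
	}
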
C1 - MOVO D2, D1 - MOVQ $128, itr1 - LEAQ 128(inp), inp - SUBQ $128, inl - - JMP sealSSE128SealHash - -// ---------------------------------------------------------------------------- -// Special seal optimization for buffers smaller than 129 bytes + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE 
$0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + DECQ CX + JG sealSSETail192LoopA + DECQ R9 + JGE sealSSETail192LoopB + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL ·chacha20Constants<>+0(SB), X2 + PADDL 32(BP), X3 + PADDL 32(BP), X4 + PADDL 32(BP), X5 + PADDL 48(BP), X6 + PADDL 48(BP), X7 + PADDL 48(BP), X8 + PADDL 80(BP), X9 + PADDL 96(BP), X10 + PADDL 112(BP), X11 + MOVOU (SI), X12 + MOVOU 16(SI), X13 + MOVOU 32(SI), X14 + MOVOU 48(SI), X15 + PXOR X12, X0 + PXOR X13, X3 + PXOR X14, X6 + PXOR X15, X9 + MOVOU X0, (DI) + MOVOU X3, 16(DI) + MOVOU X6, 32(DI) + MOVOU X9, 48(DI) + MOVOU 64(SI), X12 + MOVOU 80(SI), X13 + MOVOU 96(SI), X14 + MOVOU 112(SI), X15 + PXOR X12, X1 + PXOR X13, X4 + PXOR X14, X7 + PXOR X15, X10 + MOVOU X1, 64(DI) + MOVOU X4, 80(DI) + MOVOU X7, 96(DI) + MOVOU X10, 112(DI) + MOVO X2, X1 + MOVO X5, X4 + MOVO X8, X7 + MOVO X11, X10 + MOVQ $0x00000080, CX + LEAQ 128(SI), SI + SUBQ $0x80, BX + JMP sealSSE128SealHash + sealSSE128: - // For up to 128 bytes of ciphertext and 64 bytes for the poly key, we require to process three blocks - MOVOU ·chacha20Constants<>(SB), A0; MOVOU (1*16)(keyp), B0; MOVOU (2*16)(keyp), C0; MOVOU (3*16)(keyp), D0 - MOVO A0, A1; MOVO B0, B1; MOVO C0, C1; MOVO D0, D1; PADDL ·sseIncMask<>(SB), D1 - MOVO A1, A2; MOVO B1, B2; MOVO C1, C2; MOVO D1, D2; PADDL ·sseIncMask<>(SB), D2 - MOVO B0, T1; MOVO C0, T2; MOVO D1, T3 - MOVQ $10, itr2 + MOVOU ·chacha20Constants<>+0(SB), X0 + MOVOU 16(R8), X3 + MOVOU 32(R8), X6 + MOVOU 48(R8), X9 + MOVO X0, X1 + MOVO X3, X4 + MOVO X6, X7 + MOVO X9, X10 + PADDL ·sseIncMask<>+0(SB), X10 + MOVO X1, X2 + MOVO X4, X5 + MOVO X7, X8 + MOVO X10, X11 + PADDL ·sseIncMask<>+0(SB), X11 + MOVO X3, X13 + MOVO X6, X14 + MOVO X10, X15 + MOVQ $0x0000000a, R9 sealSSE128InnerCipherLoop: - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Left; shiftB1Left; shiftB2Left - shiftC0Left; shiftC1Left; shiftC2Left - shiftD0Left; shiftD1Left; shiftD2Left - chachaQR(A0, B0, C0, D0, T0); chachaQR(A1, B1, C1, D1, T0); chachaQR(A2, B2, C2, D2, T0) - shiftB0Right; shiftB1Right; shiftB2Right - shiftC0Right; shiftC1Right; shiftC2Right - shiftD0Right; shiftD1Right; shiftD2Right - DECQ itr2 - JNE sealSSE128InnerCipherLoop + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, 
X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x04 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x0c + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + PADDD X3, X0 + PXOR X0, X9 + ROL16(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X3 + PXOR X12, X3 + PADDD X3, X0 + PXOR X0, X9 + ROL8(X9, X12) + PADDD X9, X6 + PXOR X6, X3 + MOVO X3, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X3 + PXOR X12, X3 + PADDD X4, X1 + PXOR X1, X10 + ROL16(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X4 + PXOR X12, X4 + PADDD X4, X1 + PXOR X1, X10 + ROL8(X10, X12) + PADDD X10, X7 + PXOR X7, X4 + MOVO X4, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X4 + PXOR X12, X4 + PADDD X5, X2 + PXOR X2, X11 + ROL16(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x0c, X12 + PSRLL $0x14, X5 + PXOR X12, X5 + PADDD X5, X2 + PXOR X2, X11 + ROL8(X11, X12) + PADDD X11, X8 + PXOR X8, X5 + MOVO X5, X12 + PSLLL $0x07, X12 + PSRLL $0x19, X5 + PXOR X12, X5 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xe4 + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xed + BYTE $0x0c + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xf6 + BYTE $0x08 + BYTE $0x66 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xff + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc0 + BYTE $0x08 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xc9 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xd2 + BYTE $0x04 + BYTE $0x66 + BYTE $0x45 + BYTE $0x0f + BYTE $0x3a + BYTE $0x0f + BYTE $0xdb + BYTE $0x04 + DECQ R9 + JNE sealSSE128InnerCipherLoop // A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded - PADDL ·chacha20Constants<>(SB), A0; PADDL ·chacha20Constants<>(SB), A1; PADDL ·chacha20Constants<>(SB), A2 - PADDL T1, B0; PADDL T1, B1; PADDL T1, B2 - PADDL T2, C1; PADDL T2, C2 - PADDL T3, D1; PADDL ·sseIncMask<>(SB), T3; PADDL T3, D2 - PAND ·polyClampMask<>(SB), A0 - MOVOU A0, rStore - MOVOU B0, sStore + PADDL ·chacha20Constants<>+0(SB), X0 + PADDL ·chacha20Constants<>+0(SB), X1 + PADDL ·chacha20Constants<>+0(SB), X2 + PADDL X13, X3 + PADDL X13, X4 + PADDL X13, X5 + PADDL X14, X7 + PADDL X14, X8 + PADDL X15, X10 + PADDL ·sseIncMask<>+0(SB), X15 + PADDL X15, X11 + PAND ·polyClampMask<>+0(SB), X0 + MOVOU X0, (BP) + MOVOU X3, 16(BP) // Hash - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) - XORQ itr1, itr1 + XORQ CX, CX sealSSE128SealHash: - // itr1 holds the number of bytes encrypted but not yet hashed - CMPQ itr1, $16 - JB sealSSE128Seal - polyAdd(0(oup)) - polyMul - - SUBQ $16, itr1 - ADDQ $16, 
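
At this point the first 32 bytes of keystream block 0 become the one-time Poly1305 key: the PAND with ·polyClampMask zeroes the bits the multiplier r must not have (RFC 8439 clamping), and r and s are parked at 0(BP) and 16(BP). In Go terms (a sketch; names are illustrative):

	package polysketch

	import "encoding/binary"

	// deriveMACKey splits the first 32 keystream bytes into the clamped
	// multiplier r and the final addend s (the PAND ·polyClampMask step).
	func deriveMACKey(stream []byte) (r, s [2]uint64) {
		r[0] = binary.LittleEndian.Uint64(stream[0:8]) & 0x0FFFFFFC0FFFFFFF
		r[1] = binary.LittleEndian.Uint64(stream[8:16]) & 0x0FFFFFFC0FFFFFFC
		s[0] = binary.LittleEndian.Uint64(stream[16:24])
		s[1] = binary.LittleEndian.Uint64(stream[24:32])
		return
	}
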
oup - - JMP sealSSE128SealHash + CMPQ CX, $0x10 + JB sealSSE128Seal + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + SUBQ $0x10, CX + ADDQ $0x10, DI + JMP sealSSE128SealHash sealSSE128Seal: - CMPQ inl, $16 + CMPQ BX, $0x10 JB sealSSETail - SUBQ $16, inl + SUBQ $0x10, BX // Load for decryption - MOVOU (inp), T0 - PXOR T0, A1 - MOVOU A1, (oup) - LEAQ (1*16)(inp), inp - LEAQ (1*16)(oup), oup + MOVOU (SI), X12 + PXOR X12, X1 + MOVOU X1, (DI) + LEAQ 16(SI), SI + LEAQ 16(DI), DI // Extract for hashing - MOVQ A1, t0 - PSRLDQ $8, A1 - MOVQ A1, t1 - ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 - polyMul + MOVQ X1, R13 + PSRLDQ $0x08, X1 + MOVQ X1, R14 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Shift the stream "left" - MOVO B1, A1 - MOVO C1, B1 - MOVO D1, C1 - MOVO A2, D1 - MOVO B2, A2 - MOVO C2, B2 - MOVO D2, C2 + MOVO X4, X1 + MOVO X7, X4 + MOVO X10, X7 + MOVO X2, X10 + MOVO X5, X2 + MOVO X8, X5 + MOVO X11, X8 JMP sealSSE128Seal sealSSETail: - TESTQ inl, inl + TESTQ BX, BX JE sealSSEFinalize // We can only load the PT one byte at a time to avoid read after end of buffer - MOVQ inl, itr2 - SHLQ $4, itr2 - LEAQ ·andMask<>(SB), t0 - MOVQ inl, itr1 - LEAQ -1(inp)(inl*1), inp - XORQ t2, t2 - XORQ t3, t3 + MOVQ BX, R9 + SHLQ $0x04, R9 + LEAQ ·andMask<>+0(SB), R13 + MOVQ BX, CX + LEAQ -1(SI)(BX*1), SI + XORQ R15, R15 + XORQ R8, R8 XORQ AX, AX sealSSETailLoadLoop: - SHLQ $8, t2, t3 - SHLQ $8, t2 - MOVB (inp), AX - XORQ AX, t2 - LEAQ -1(inp), inp - DECQ itr1 + SHLQ $0x08, R15, R8 + SHLQ $0x08, R15 + MOVB (SI), AX + XORQ AX, R15 + LEAQ -1(SI), SI + DECQ CX JNE sealSSETailLoadLoop - MOVQ t2, 0+tmpStore - MOVQ t3, 8+tmpStore - PXOR 0+tmpStore, A1 - MOVOU A1, (oup) - MOVOU -16(t0)(itr2*1), T0 - PAND T0, A1 - MOVQ A1, t0 - PSRLDQ $8, A1 - MOVQ A1, t1 - ADDQ t0, acc0; ADCQ t1, acc1; ADCQ $1, acc2 - polyMul - - ADDQ inl, oup + MOVQ R15, 64(BP) + MOVQ R8, 72(BP) + PXOR 64(BP), X1 + MOVOU X1, (DI) + MOVOU -16(R13)(R9*1), X12 + PAND X12, X1 + MOVQ X1, R13 + PSRLDQ $0x08, X1 + MOVQ X1, R14 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, 
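
sealSSETailLoadLoop above walks the last (fewer than 16) plaintext bytes backwards, shifting them into a 128-bit little-endian value so nothing past the end of the buffer is ever read; after encryption, the ·andMask entry zeroes the unused bytes before the final hash. The load, as a Go sketch:

	package polysketch

	// loadTail packs the trailing len(p) < 16 bytes into a little-endian
	// 128-bit (lo, hi) pair, reading strictly backwards from the end,
	// matching the SHLQ $0x08 / MOVB loop.
	func loadTail(p []byte) (lo, hi uint64) {
		for i := len(p) - 1; i >= 0; i-- {
			hi = hi<<8 | lo>>56
			lo = lo<<8 | uint64(p[i])
		}
		return
	}
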
R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ BX, DI sealSSEFinalize: // Hash in the buffer lengths - ADDQ ad_len+80(FP), acc0 - ADCQ src_len+56(FP), acc1 - ADCQ $1, acc2 - polyMul + ADDQ ad_len+80(FP), R10 + ADCQ src_len+56(FP), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 // Final reduce - MOVQ acc0, t0 - MOVQ acc1, t1 - MOVQ acc2, t2 - SUBQ $-5, acc0 - SBBQ $-1, acc1 - SBBQ $3, acc2 - CMOVQCS t0, acc0 - CMOVQCS t1, acc1 - CMOVQCS t2, acc2 + MOVQ R10, R13 + MOVQ R11, R14 + MOVQ R12, R15 + SUBQ $-5, R10 + SBBQ $-1, R11 + SBBQ $0x03, R12 + CMOVQCS R13, R10 + CMOVQCS R14, R11 + CMOVQCS R15, R12 // Add in the "s" part of the key - ADDQ 0+sStore, acc0 - ADCQ 8+sStore, acc1 + ADDQ 16(BP), R10 + ADCQ 24(BP), R11 // Finally store the tag at the end of the message - MOVQ acc0, (0*8)(oup) - MOVQ acc1, (1*8)(oup) + MOVQ R10, (DI) + MOVQ R11, 8(DI) RET -// ---------------------------------------------------------------------------- -// ------------------------- AVX2 Code ---------------------------------------- chacha20Poly1305Seal_AVX2: VZEROUPPER - VMOVDQU ·chacha20Constants<>(SB), AA0 - BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x70; BYTE $0x10 // broadcasti128 16(r8), ymm14 - BYTE $0xc4; BYTE $0x42; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x20 // broadcasti128 32(r8), ymm12 - BYTE $0xc4; BYTE $0xc2; BYTE $0x7d; BYTE $0x5a; BYTE $0x60; BYTE $0x30 // broadcasti128 48(r8), ymm4 - VPADDD ·avx2InitMask<>(SB), DD0, DD0 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + BYTE $0xc4 + BYTE $0x42 + BYTE $0x7d + BYTE $0x5a + BYTE $0x70 + BYTE $0x10 + BYTE $0xc4 + BYTE $0x42 + BYTE $0x7d + BYTE $0x5a + BYTE $0x60 + BYTE $0x20 + BYTE $0xc4 + BYTE $0xc2 + BYTE $0x7d + BYTE $0x5a + BYTE $0x60 + BYTE $0x30 + VPADDD ·avx2InitMask<>+0(SB), Y4, Y4 // Special optimizations, for very short buffers - CMPQ inl, $192 - JBE seal192AVX2 // 33% faster - CMPQ inl, $320 - JBE seal320AVX2 // 17% faster + CMPQ BX, $0x000000c0 + JBE seal192AVX2 + CMPQ BX, $0x00000140 + JBE seal320AVX2 // For the general key prepare the key first - as a byproduct we have 64 bytes of cipher stream - VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3; VMOVDQA BB0, state1StoreAVX2 - VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3; VMOVDQA CC0, state2StoreAVX2 - VPADDD ·avx2IncMask<>(SB), DD0, DD1; VMOVDQA DD0, ctr0StoreAVX2 - VPADDD ·avx2IncMask<>(SB), DD1, DD2; VMOVDQA DD1, ctr1StoreAVX2 - VPADDD ·avx2IncMask<>(SB), DD2, DD3; VMOVDQA DD2, ctr2StoreAVX2 - VMOVDQA DD3, ctr3StoreAVX2 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA Y14, 32(BP) + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + 
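
The sealSSEFinalize sequence above absorbs one last block encoding the AD and plaintext lengths, then reduces the accumulator fully: SUBQ $-5 / SBBQ $-1 / SBBQ $0x03 compute h - (2^130 - 5) across the three limbs, CMOVQCS keeps the unreduced h if that subtraction borrowed, and the second half s of the one-time key is added to form the 16-byte tag. A sketch (the branch stands in for the constant-time CMOV; names are illustrative):

	package polysketch

	import "math/bits"

	// finalize reduces h modulo 2^130-5 and adds s, returning the two
	// tag words that are stored at the end of the ciphertext.
	func finalize(h [3]uint64, s [2]uint64) (tag [2]uint64) {
		// t = h - (2^130 - 5); a borrow means h was already reduced.
		t0, b := bits.Sub64(h[0], 0xFFFFFFFFFFFFFFFB, 0)
		t1, b2 := bits.Sub64(h[1], 0xFFFFFFFFFFFFFFFF, b)
		_, b3 := bits.Sub64(h[2], 3, b2)
		if b3 == 0 { // no borrow: keep the reduced value
			h[0], h[1] = t0, t1
		}
		// tag = (h + s) mod 2^128.
		var c uint64
		tag[0], c = bits.Add64(h[0], s[0], 0)
		tag[1], _ = bits.Add64(h[1], s[1], c)
		return
	}
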
VMOVDQA Y12, 64(BP) + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y4, 96(BP) + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y1, 128(BP) + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + MOVQ $0x0000000a, R9 sealAVX2IntroLoop: - VMOVDQA CC3, tmpStoreAVX2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) - VMOVDQA tmpStoreAVX2, CC3 - VMOVDQA CC1, tmpStoreAVX2 - chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) - VMOVDQA tmpStoreAVX2, CC1 - - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 - VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1 - VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2 - VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3 - - VMOVDQA CC3, tmpStoreAVX2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) - VMOVDQA tmpStoreAVX2, CC3 - VMOVDQA CC1, tmpStoreAVX2 - chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) - VMOVDQA tmpStoreAVX2, CC1 - - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 - VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1 - VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2 - VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 - DECQ itr2 - JNE sealAVX2IntroLoop - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - - VPERM2I128 $0x13, CC0, DD0, CC0 // Stream bytes 96 - 127 - VPERM2I128 $0x02, AA0, BB0, DD0 // The Poly1305 key - VPERM2I128 $0x13, AA0, BB0, AA0 // Stream bytes 64 - 95 + VMOVDQA Y15, 224(BP) + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VMOVDQA 224(BP), Y15 + VMOVDQA Y13, 224(BP) + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD 
Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x0c, Y11, Y13 + VPSRLD $0x14, Y11, Y11 + VPXOR Y13, Y11, Y11 + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x07, Y11, Y13 + VPSRLD $0x19, Y11, Y11 + VPXOR Y13, Y11, Y11 + VMOVDQA 224(BP), Y13 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y3, Y3, Y3 + VMOVDQA Y15, 224(BP) + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VMOVDQA 224(BP), Y15 + VMOVDQA Y13, 224(BP) + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x0c, Y11, Y13 + VPSRLD $0x14, Y11, Y11 + VPXOR Y13, Y11, Y11 + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x07, Y11, Y13 + VPSRLD $0x19, Y11, Y11 + VPXOR Y13, Y11, Y11 + VMOVDQA 224(BP), Y13 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y3, Y3, Y3 + DECQ R9 + JNE sealAVX2IntroLoop + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPERM2I128 $0x02, Y0, Y14, Y4 + VPERM2I128 $0x13, Y0, Y14, Y0 // Clamp and store poly key - VPAND ·polyClampMask<>(SB), DD0, DD0 - VMOVDQA DD0, rsStoreAVX2 + VPAND ·polyClampMask<>+0(SB), Y4, Y4 + VMOVDQA Y4, (BP) // Hash AD - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) // Can store at least 320 bytes - VPXOR (0*32)(inp), AA0, AA0 - VPXOR 
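
In the AVX2 paths each 256-bit register carries the same row of two consecutive blocks (block n in the low 128-bit lane, block n+1 in the high lane), so serializing keystream takes the VPERM2I128 shuffles seen here: immediate $0x02 gathers the two low lanes and $0x13 the two high lanes, which is also how the first block's a/b rows are peeled off as the 32-byte Poly1305 key before clamping. Conceptually:

	package chachasketch

	// deinterleave reassembles two 64-byte blocks from the AVX2 row
	// layout, where rows[r][0] is row r of block 0 and rows[r][1] is
	// row r of block 1 (the lanes VPERM2I128 selects between).
	func deinterleave(rows [4][2][16]byte) (b0, b1 [64]byte) {
		for r := 0; r < 4; r++ {
			copy(b0[r*16:], rows[r][0][:])
			copy(b1[r*16:], rows[r][1][:])
		}
		return
	}
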
(1*32)(inp), CC0, CC0 - VMOVDQU AA0, (0*32)(oup) - VMOVDQU CC0, (1*32)(oup) - - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (2*32)(inp), AA0, AA0; VPXOR (3*32)(inp), BB0, BB0; VPXOR (4*32)(inp), CC0, CC0; VPXOR (5*32)(inp), DD0, DD0 - VMOVDQU AA0, (2*32)(oup); VMOVDQU BB0, (3*32)(oup); VMOVDQU CC0, (4*32)(oup); VMOVDQU DD0, (5*32)(oup) - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (6*32)(inp), AA0, AA0; VPXOR (7*32)(inp), BB0, BB0; VPXOR (8*32)(inp), CC0, CC0; VPXOR (9*32)(inp), DD0, DD0 - VMOVDQU AA0, (6*32)(oup); VMOVDQU BB0, (7*32)(oup); VMOVDQU CC0, (8*32)(oup); VMOVDQU DD0, (9*32)(oup) - - MOVQ $320, itr1 - SUBQ $320, inl - LEAQ 320(inp), inp - - VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, CC3, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, CC3, DD3, DD0 - CMPQ inl, $128 + VPXOR (SI), Y0, Y0 + VPXOR 32(SI), Y12, Y12 + VMOVDQU Y0, (DI) + VMOVDQU Y12, 32(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 64(SI), Y0, Y0 + VPXOR 96(SI), Y14, Y14 + VPXOR 128(SI), Y12, Y12 + VPXOR 160(SI), Y4, Y4 + VMOVDQU Y0, 64(DI) + VMOVDQU Y14, 96(DI) + VMOVDQU Y12, 128(DI) + VMOVDQU Y4, 160(DI) + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 192(SI), Y0, Y0 + VPXOR 224(SI), Y14, Y14 + VPXOR 256(SI), Y12, Y12 + VPXOR 288(SI), Y4, Y4 + VMOVDQU Y0, 192(DI) + VMOVDQU Y14, 224(DI) + VMOVDQU Y12, 256(DI) + VMOVDQU Y4, 288(DI) + MOVQ $0x00000140, CX + SUBQ $0x00000140, BX + LEAQ 320(SI), SI + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, Y15, Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, Y15, Y3, Y4 + CMPQ BX, $0x80 JBE sealAVX2SealHash - - VPXOR (0*32)(inp), AA0, AA0; VPXOR (1*32)(inp), BB0, BB0; VPXOR (2*32)(inp), CC0, CC0; VPXOR (3*32)(inp), DD0, DD0 - VMOVDQU AA0, (10*32)(oup); VMOVDQU BB0, (11*32)(oup); VMOVDQU CC0, (12*32)(oup); VMOVDQU DD0, (13*32)(oup) - SUBQ $128, inl - LEAQ 128(inp), inp - - MOVQ $8, itr1 - MOVQ $2, itr2 - - CMPQ inl, $128 - JBE sealAVX2Tail128 - CMPQ inl, $256 - JBE sealAVX2Tail256 - CMPQ inl, $384 - JBE sealAVX2Tail384 - CMPQ inl, $512 - JBE sealAVX2Tail512 + VPXOR (SI), Y0, Y0 + VPXOR 32(SI), Y14, Y14 + VPXOR 64(SI), Y12, Y12 + VPXOR 96(SI), Y4, Y4 + VMOVDQU Y0, 320(DI) + VMOVDQU Y14, 352(DI) + VMOVDQU Y12, 384(DI) + VMOVDQU Y4, 416(DI) + SUBQ $0x80, BX + LEAQ 128(SI), SI + MOVQ $0x00000008, CX + MOVQ $0x00000002, R9 + CMPQ BX, $0x80 + JBE sealAVX2Tail128 + CMPQ BX, $0x00000100 + JBE sealAVX2Tail256 + CMPQ BX, $0x00000180 + JBE sealAVX2Tail384 + CMPQ BX, $0x00000200 + JBE sealAVX2Tail512 // We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 - - VMOVDQA CC3, tmpStoreAVX2 - 
chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) - VMOVDQA tmpStoreAVX2, CC3 - VMOVDQA CC1, tmpStoreAVX2 - chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) - VMOVDQA tmpStoreAVX2, CC1 - - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $12, DD0, DD0, DD0 - VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $12, DD1, DD1, DD1 - VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $12, DD2, DD2, DD2 - VPALIGNR $4, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $12, DD3, DD3, DD3 - - VMOVDQA CC3, tmpStoreAVX2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, CC3); chachaQR_AVX2(AA1, BB1, CC1, DD1, CC3); chachaQR_AVX2(AA2, BB2, CC2, DD2, CC3) - VMOVDQA tmpStoreAVX2, CC3 - VMOVDQA CC1, tmpStoreAVX2 - chachaQR_AVX2(AA3, BB3, CC3, DD3, CC1) - VMOVDQA tmpStoreAVX2, CC1 - - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $4, DD0, DD0, DD0 - VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $4, DD1, DD1, DD1 - VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $4, DD2, DD2, DD2 - VPALIGNR $12, BB3, BB3, BB3; VPALIGNR $8, CC3, CC3, CC3; VPALIGNR $4, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - - SUBQ $16, oup // Adjust the pointer - MOVQ $9, itr1 - JMP sealAVX2InternalLoopStart + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + VMOVDQA Y15, 224(BP) + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + 
VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VMOVDQA 224(BP), Y15 + VMOVDQA Y13, 224(BP) + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x0c, Y11, Y13 + VPSRLD $0x14, Y11, Y11 + VPXOR Y13, Y11, Y11 + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x07, Y11, Y13 + VPSRLD $0x19, Y11, Y11 + VPXOR Y13, Y11, Y11 + VMOVDQA 224(BP), Y13 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y3, Y3, Y3 + VMOVDQA Y15, 224(BP) + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VMOVDQA 224(BP), Y15 + VMOVDQA Y13, 224(BP) + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x0c, Y11, Y13 + VPSRLD $0x14, Y11, Y11 + VPXOR Y13, Y11, Y11 + VPADDD Y11, Y7, Y7 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y3, Y15, Y15 + VPXOR Y15, Y11, Y11 + VPSLLD $0x07, Y11, Y13 + VPSRLD $0x19, Y11, Y11 + VPXOR Y13, Y11, Y11 + VMOVDQA 224(BP), Y13 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR 
Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + SUBQ $0x10, DI + MOVQ $0x00000009, CX + JMP sealAVX2InternalLoopStart sealAVX2MainLoop: - // Load state, increment counter blocks, store the incremented counters - VMOVDQU ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 - MOVQ $10, itr1 + VMOVDQU ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) + MOVQ $0x0000000a, CX sealAVX2InternalLoop: - polyAdd(0*8(oup)) - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - polyMulStage1_AVX2 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - polyMulStage2_AVX2 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyMulStage3_AVX2 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulReduceStage + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + 
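
sealAVX2MainLoop reloads the saved state and steps the counters: with two blocks per register, each VPADDD of ·avx2IncMask advances a D row's counters (both lanes) by 2, and chaining it four times yields the counters for all 8 blocks (512 bytes) of the iteration before they are spilled to 96..192(BP). As arithmetic (a sketch, assuming the +2-per-lane increment mask; c is the last counter of the previous iteration):

	package chachasketch

	// counterRows returns the (low lane, high lane) block counters
	// carried by the four D rows after the chained increments.
	func counterRows(c uint32) [4][2]uint32 {
		var rows [4][2]uint32
		for i := range rows {
			rows[i] = [2]uint32{c + 1 + 2*uint32(i), c + 2 + 2*uint32(i)}
		}
		return rows
	}
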
MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 sealAVX2InternalLoopStart: - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - polyAdd(2*8(oup)) - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - polyMulStage1_AVX2 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulStage2_AVX2 - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - polyMulStage3_AVX2 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - polyMulReduceStage - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyAdd(4*8(oup)) - LEAQ (6*8)(oup), oup - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulStage1_AVX2 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - polyMulStage2_AVX2 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - polyMulStage3_AVX2 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyMulReduceStage - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 - DECQ itr1 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 
+ VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 32(DI), R10 + ADCQ 40(DI), R11 + ADCQ $0x01, R12 + LEAQ 48(DI), DI + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + 
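
The old polyMulStage*/polyMulReduceStage macros are visible in this loop as MULXQ/ADCQ sequences threaded between the vector ops: MULX (BMI2) writes a full 64x64->128 product without touching the flags, so the Poly1305 carry chains can be interleaved with the ChaCha rounds to hide latency. One MULXQ in Go terms (an illustrative wrapper, not an identifier from this file):

	package polysketch

	import "math/bits"

	// mulx mirrors MULXQ src, lo, hi: a widening multiply by the
	// implicit RDX operand that leaves the carry flags alone, unlike MULQ.
	func mulx(rdx, src uint64) (hi, lo uint64) {
		return bits.Mul64(rdx, src)
	}
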
VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + DECQ CX JNE sealAVX2InternalLoop - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - VMOVDQA CC3, tmpStoreAVX2 + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD ·chacha20Constants<>+0(SB), Y7, Y7 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 32(BP), Y11, Y11 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD 64(BP), Y15, Y15 + VPADDD 96(BP), Y4, Y4 + VPADDD 128(BP), Y1, Y1 + VPADDD 160(BP), Y2, Y2 + VPADDD 192(BP), Y3, Y3 + VMOVDQA Y15, 224(BP) // We only hashed 480 of the 512 bytes available - hash the remaining 32 here - polyAdd(0*8(oup)) - polyMulAVX2 - LEAQ (4*8)(oup), oup - VPERM2I128 $0x02, AA0, BB0, CC3; VPERM2I128 $0x13, AA0, BB0, BB0; VPERM2I128 $0x02, CC0, DD0, AA0; VPERM2I128 $0x13, CC0, DD0, CC0 - VPXOR (0*32)(inp), CC3, CC3; VPXOR (1*32)(inp), AA0, AA0; VPXOR (2*32)(inp), BB0, BB0; VPXOR (3*32)(inp), CC0, CC0 - VMOVDQU CC3, (0*32)(oup); VMOVDQU AA0, (1*32)(oup); VMOVDQU BB0, (2*32)(oup); VMOVDQU CC0, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, AA0; VPERM2I128 $0x02, CC1, DD1, BB0; VPERM2I128 $0x13, AA1, BB1, CC0; VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 - VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VPERM2I128 $0x02, Y0, Y14, Y15 + VPERM2I128 $0x13, Y0, Y14, Y14 + VPERM2I128 $0x02, Y12, Y4, Y0 + VPERM2I128 $0x13, Y12, Y4, Y12 + VPXOR (SI), Y15, Y15 + VPXOR 32(SI), Y0, Y0 + VPXOR 64(SI), Y14, Y14 + VPXOR 96(SI), Y12, Y12 + VMOVDQU Y15, (DI) + VMOVDQU Y0, 32(DI) + VMOVDQU Y14, 
64(DI) + VMOVDQU Y12, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + VPXOR 128(SI), Y0, Y0 + VPXOR 160(SI), Y14, Y14 + VPXOR 192(SI), Y12, Y12 + VPXOR 224(SI), Y4, Y4 + VMOVDQU Y0, 128(DI) + VMOVDQU Y14, 160(DI) + VMOVDQU Y12, 192(DI) + VMOVDQU Y4, 224(DI) // and here - polyAdd(-2*8(oup)) - polyMulAVX2 - VPERM2I128 $0x02, AA2, BB2, AA0; VPERM2I128 $0x02, CC2, DD2, BB0; VPERM2I128 $0x13, AA2, BB2, CC0; VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 - VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) - VPERM2I128 $0x02, AA3, BB3, AA0; VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0; VPERM2I128 $0x13, AA3, BB3, CC0; VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 - VPXOR (12*32)(inp), AA0, AA0; VPXOR (13*32)(inp), BB0, BB0; VPXOR (14*32)(inp), CC0, CC0; VPXOR (15*32)(inp), DD0, DD0 - VMOVDQU AA0, (12*32)(oup); VMOVDQU BB0, (13*32)(oup); VMOVDQU CC0, (14*32)(oup); VMOVDQU DD0, (15*32)(oup) - LEAQ (32*16)(inp), inp - SUBQ $(32*16), inl - CMPQ inl, $512 + ADDQ -16(DI), R10 + ADCQ -8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + VPXOR 256(SI), Y0, Y0 + VPXOR 288(SI), Y14, Y14 + VPXOR 320(SI), Y12, Y12 + VPXOR 352(SI), Y4, Y4 + VMOVDQU Y0, 256(DI) + VMOVDQU Y14, 288(DI) + VMOVDQU Y12, 320(DI) + VMOVDQU Y4, 352(DI) + VPERM2I128 $0x02, Y7, Y11, Y0 + VPERM2I128 $0x02, 224(BP), Y3, Y14 + VPERM2I128 $0x13, Y7, Y11, Y12 + VPERM2I128 $0x13, 224(BP), Y3, Y4 + VPXOR 384(SI), Y0, Y0 + VPXOR 416(SI), Y14, Y14 + VPXOR 448(SI), Y12, Y12 + VPXOR 480(SI), Y4, Y4 + VMOVDQU Y0, 384(DI) + VMOVDQU Y14, 416(DI) + VMOVDQU Y12, 448(DI) + VMOVDQU Y4, 480(DI) + LEAQ 512(SI), SI + SUBQ $0x00000200, BX + CMPQ BX, $0x00000200 JG sealAVX2MainLoop // Tail can only hash 480 bytes - polyAdd(0*8(oup)) - polyMulAVX2 - polyAdd(2*8(oup)) - polyMulAVX2 - LEAQ 32(oup), oup - - MOVQ $10, itr1 - MOVQ $0, itr2 - CMPQ inl, $128 - JBE sealAVX2Tail128 - CMPQ inl, $256 - JBE sealAVX2Tail256 - CMPQ inl, $384 - JBE sealAVX2Tail384 - JMP sealAVX2Tail512 - -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 193 bytes + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ 16(DI), R10 + ADCQ 
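
The CMPQ/JBE ladders here and at entry pick a routine by remaining length: buffers of at most 192 or 320 bytes short-circuit to seal192AVX2/seal320AVX2, and after the 512-byte main loop the $0x80/$0x100/$0x180/$0x200 comparisons select a tail. As a sketch of the post-loop dispatch:

	package chachasketch

	// sealTail names the assembly tail chosen for n remaining bytes
	// (n <= 512) after the AVX2 main loop.
	func sealTail(n int) string {
		switch {
		case n <= 128:
			return "sealAVX2Tail128"
		case n <= 256:
			return "sealAVX2Tail256"
		case n <= 384:
			return "sealAVX2Tail384"
		default:
			return "sealAVX2Tail512"
		}
	}
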
24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + MOVQ $0x0000000a, CX + MOVQ $0x00000000, R9 + CMPQ BX, $0x80 + JBE sealAVX2Tail128 + CMPQ BX, $0x00000100 + JBE sealAVX2Tail256 + CMPQ BX, $0x00000180 + JBE sealAVX2Tail384 + JMP sealAVX2Tail512 + seal192AVX2: - // For up to 192 bytes of ciphertext and 64 bytes for the poly key, we process four blocks - VMOVDQA AA0, AA1 - VMOVDQA BB0, BB1 - VMOVDQA CC0, CC1 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA AA0, AA2 - VMOVDQA BB0, BB2 - VMOVDQA CC0, CC2 - VMOVDQA DD0, DD2 - VMOVDQA DD1, TT3 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VMOVDQA Y4, Y2 + VMOVDQA Y1, Y15 + MOVQ $0x0000000a, R9 sealAVX2192InnerCipherLoop: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 - DECQ itr2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR 
$0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + DECQ R9 JNE sealAVX2192InnerCipherLoop - VPADDD AA2, AA0, AA0; VPADDD AA2, AA1, AA1 - VPADDD BB2, BB0, BB0; VPADDD BB2, BB1, BB1 - VPADDD CC2, CC0, CC0; VPADDD CC2, CC1, CC1 - VPADDD DD2, DD0, DD0; VPADDD TT3, DD1, DD1 - VPERM2I128 $0x02, AA0, BB0, TT0 + VPADDD Y6, Y0, Y0 + VPADDD Y6, Y5, Y5 + VPADDD Y10, Y14, Y14 + VPADDD Y10, Y9, Y9 + VPADDD Y8, Y12, Y12 + VPADDD Y8, Y13, Y13 + VPADDD Y2, Y4, Y4 + VPADDD Y15, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y3 // Clamp and store poly key - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for up to 192 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - VPERM2I128 $0x13, CC0, DD0, BB0 - VPERM2I128 $0x02, AA1, BB1, CC0 - VPERM2I128 $0x02, CC1, DD1, DD0 - VPERM2I128 $0x13, AA1, BB1, AA1 - VPERM2I128 $0x13, CC1, DD1, BB1 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 sealAVX2ShortSeal: // Hash aad - MOVQ ad_len+80(FP), itr2 + MOVQ ad_len+80(FP), R9 CALL polyHashADInternal<>(SB) - XORQ itr1, itr1 + XORQ CX, CX sealAVX2SealHash: // itr1 holds the number of bytes encrypted but not yet hashed - CMPQ itr1, $16 - JB sealAVX2ShortSealLoop - polyAdd(0(oup)) - polyMul - SUBQ $16, itr1 - ADDQ $16, oup - JMP sealAVX2SealHash + CMPQ CX, $0x10 + JB sealAVX2ShortSealLoop + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + SUBQ $0x10, CX + ADDQ $0x10, DI + JMP sealAVX2SealHash sealAVX2ShortSealLoop: - CMPQ inl, $32 + CMPQ BX, $0x20 JB sealAVX2ShortTail32 - SUBQ $32, inl + SUBQ $0x20, BX // Load for encryption - VPXOR (inp), AA0, AA0 - VMOVDQU AA0, (oup) - LEAQ (1*32)(inp), inp + VPXOR (SI), Y0, Y0 + VMOVDQU Y0, (DI) + LEAQ 32(SI), SI // Now can hash - polyAdd(0*8(oup)) - polyMulAVX2 - polyAdd(2*8(oup)) - polyMulAVX2 - LEAQ (1*32)(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + 
MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI // Shift stream left - VMOVDQA BB0, AA0 - VMOVDQA CC0, BB0 - VMOVDQA DD0, CC0 - VMOVDQA AA1, DD0 - VMOVDQA BB1, AA1 - VMOVDQA CC1, BB1 - VMOVDQA DD1, CC1 - VMOVDQA AA2, DD1 - VMOVDQA BB2, AA2 + VMOVDQA Y14, Y0 + VMOVDQA Y12, Y14 + VMOVDQA Y4, Y12 + VMOVDQA Y5, Y4 + VMOVDQA Y9, Y5 + VMOVDQA Y13, Y9 + VMOVDQA Y1, Y13 + VMOVDQA Y6, Y1 + VMOVDQA Y10, Y6 JMP sealAVX2ShortSealLoop sealAVX2ShortTail32: - CMPQ inl, $16 - VMOVDQA A0, A1 + CMPQ BX, $0x10 + VMOVDQA X0, X1 JB sealAVX2ShortDone - - SUBQ $16, inl + SUBQ $0x10, BX // Load for encryption - VPXOR (inp), A0, T0 - VMOVDQU T0, (oup) - LEAQ (1*16)(inp), inp + VPXOR (SI), X0, X12 + VMOVDQU X12, (DI) + LEAQ 16(SI), SI // Hash - polyAdd(0*8(oup)) - polyMulAVX2 - LEAQ (1*16)(oup), oup - VPERM2I128 $0x11, AA0, AA0, AA0 - VMOVDQA A0, A1 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI + VPERM2I128 $0x11, Y0, Y0, Y0 + VMOVDQA X0, X1 sealAVX2ShortDone: VZEROUPPER JMP sealSSETail -// ---------------------------------------------------------------------------- -// Special optimization for buffers smaller than 321 bytes seal320AVX2: - // For up to 320 bytes of ciphertext and 64 bytes for the poly key, we process six blocks - VMOVDQA AA0, AA1; VMOVDQA BB0, BB1; VMOVDQA CC0, CC1; VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA AA0, AA2; VMOVDQA BB0, BB2; VMOVDQA CC0, CC2; VPADDD ·avx2IncMask<>(SB), DD1, DD2 - VMOVDQA BB0, TT1; VMOVDQA CC0, TT2; VMOVDQA DD0, TT3 - MOVQ $10, itr2 + VMOVDQA Y0, Y5 + VMOVDQA Y14, Y9 + VMOVDQA Y12, Y13 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y0, Y6 + VMOVDQA Y14, Y10 + VMOVDQA Y12, Y8 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y14, Y7 + VMOVDQA Y12, Y11 + VMOVDQA Y4, Y15 + MOVQ $0x0000000a, R9 sealAVX2320InnerCipherLoop: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 - DECQ itr2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 
+ VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + DECQ R9 JNE sealAVX2320InnerCipherLoop - - VMOVDQA ·chacha20Constants<>(SB), TT0 - VPADDD TT0, AA0, AA0; VPADDD TT0, AA1, AA1; VPADDD TT0, AA2, AA2 - VPADDD TT1, BB0, BB0; VPADDD TT1, BB1, BB1; VPADDD TT1, BB2, BB2 - VPADDD TT2, CC0, CC0; VPADDD TT2, CC1, CC1; VPADDD TT2, CC2, CC2 - VMOVDQA ·avx2IncMask<>(SB), TT0 - VPADDD TT3, DD0, DD0; VPADDD TT0, TT3, TT3 - VPADDD TT3, DD1, DD1; VPADDD TT0, TT3, TT3 - VPADDD TT3, DD2, DD2 + VMOVDQA ·chacha20Constants<>+0(SB), Y3 + VPADDD Y3, Y0, Y0 + VPADDD Y3, Y5, Y5 + VPADDD Y3, Y6, Y6 + VPADDD Y7, Y14, Y14 + VPADDD Y7, Y9, Y9 + VPADDD Y7, Y10, Y10 + VPADDD Y11, Y12, Y12 + VPADDD Y11, Y13, Y13 + VPADDD Y11, Y8, Y8 + VMOVDQA ·avx2IncMask<>+0(SB), Y3 + VPADDD Y15, Y4, Y4 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y1, Y1 + VPADDD Y3, Y15, Y15 + VPADDD Y15, Y2, Y2 // Clamp and store poly key - VPERM2I128 $0x02, AA0, BB0, TT0 - VPAND ·polyClampMask<>(SB), TT0, TT0 - VMOVDQA TT0, rsStoreAVX2 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPAND ·polyClampMask<>+0(SB), Y3, Y3 + VMOVDQA Y3, (BP) // Stream for up to 320 bytes - VPERM2I128 $0x13, AA0, BB0, AA0 - 
VPERM2I128 $0x13, CC0, DD0, BB0 - VPERM2I128 $0x02, AA1, BB1, CC0 - VPERM2I128 $0x02, CC1, DD1, DD0 - VPERM2I128 $0x13, AA1, BB1, AA1 - VPERM2I128 $0x13, CC1, DD1, BB1 - VPERM2I128 $0x02, AA2, BB2, CC1 - VPERM2I128 $0x02, CC2, DD2, DD1 - VPERM2I128 $0x13, AA2, BB2, AA2 - VPERM2I128 $0x13, CC2, DD2, BB2 + VPERM2I128 $0x13, Y0, Y14, Y0 + VPERM2I128 $0x13, Y12, Y4, Y14 + VPERM2I128 $0x02, Y5, Y9, Y12 + VPERM2I128 $0x02, Y13, Y1, Y4 + VPERM2I128 $0x13, Y5, Y9, Y5 + VPERM2I128 $0x13, Y13, Y1, Y9 + VPERM2I128 $0x02, Y6, Y10, Y13 + VPERM2I128 $0x02, Y8, Y2, Y1 + VPERM2I128 $0x13, Y6, Y10, Y6 + VPERM2I128 $0x13, Y8, Y2, Y10 JMP sealAVX2ShortSeal -// ---------------------------------------------------------------------------- -// Special optimization for the last 128 bytes of ciphertext sealAVX2Tail128: - // Need to decrypt up to 128 bytes - prepare two blocks - // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed - // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed - VMOVDQA ·chacha20Constants<>(SB), AA0 - VMOVDQA state1StoreAVX2, BB0 - VMOVDQA state2StoreAVX2, CC0 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0 - VMOVDQA DD0, DD1 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA 32(BP), Y14 + VMOVDQA 64(BP), Y12 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VMOVDQA Y4, Y1 sealAVX2Tail128LoopA: - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealAVX2Tail128LoopB: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) - polyAdd(0(oup)) - polyMul - VPALIGNR $4, BB0, BB0, BB0 - VPALIGNR $8, CC0, CC0, CC0 - VPALIGNR $12, DD0, DD0, DD0 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0) - polyAdd(16(oup)) - polyMul - LEAQ 32(oup), oup - VPALIGNR $12, BB0, BB0, BB0 - VPALIGNR $8, CC0, CC0, CC0 - VPALIGNR $4, DD0, DD0, DD0 - DECQ itr1 - JG sealAVX2Tail128LoopA - DECQ itr2 - JGE sealAVX2Tail128LoopB - - VPADDD ·chacha20Constants<>(SB), AA0, AA1 - VPADDD state1StoreAVX2, BB0, BB1 - VPADDD state2StoreAVX2, CC0, CC1 - VPADDD DD1, DD0, DD1 - - VPERM2I128 $0x02, AA1, BB1, AA0 - VPERM2I128 $0x02, CC1, DD1, BB0 - VPERM2I128 $0x13, AA1, BB1, CC0 - VPERM2I128 $0x13, CC1, DD1, DD0 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + 
ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x04, Y4, Y4, Y4 + DECQ CX + JG sealAVX2Tail128LoopA + DECQ R9 + JGE sealAVX2Tail128LoopB + VPADDD ·chacha20Constants<>+0(SB), Y0, Y5 + VPADDD 32(BP), Y14, Y9 + VPADDD 64(BP), Y12, Y13 + VPADDD Y1, Y4, Y1 + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 JMP sealAVX2ShortSealLoop -// ---------------------------------------------------------------------------- -// Special optimization for the last 256 bytes of ciphertext sealAVX2Tail256: - // Need to decrypt up to 256 bytes - prepare two blocks - // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed - // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA ·chacha20Constants<>(SB), AA1 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA state1StoreAVX2, BB1 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA state2StoreAVX2, CC1 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD1 - VMOVDQA DD0, TT1 - VMOVDQA DD1, TT2 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA ·chacha20Constants<>+0(SB), Y5 + VMOVDQA 32(BP), Y14 + VMOVDQA 32(BP), Y9 + VMOVDQA 64(BP), Y12 + VMOVDQA 64(BP), Y13 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VMOVDQA Y4, Y7 + VMOVDQA Y1, Y11 sealAVX2Tail256LoopA: - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ 
R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealAVX2Tail256LoopB: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - polyAdd(0(oup)) - polyMul - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0) - polyAdd(16(oup)) - polyMul - LEAQ 32(oup), oup - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1 - DECQ itr1 - JG sealAVX2Tail256LoopA - DECQ itr2 - JGE sealAVX2Tail256LoopB - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1 - VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1 - VPERM2I128 $0x02, AA0, BB0, TT0 - VPERM2I128 $0x02, CC0, DD0, TT1 - VPERM2I128 $0x13, AA0, BB0, TT2 - VPERM2I128 $0x13, CC0, DD0, TT3 - VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 - VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) - MOVQ $128, itr1 - LEAQ 128(inp), inp - SUBQ $128, inl - VPERM2I128 $0x02, AA1, BB1, AA0 - VPERM2I128 $0x02, CC1, DD1, BB0 - VPERM2I128 $0x13, AA1, BB1, CC0 - VPERM2I128 $0x13, CC1, DD1, DD0 - - JMP sealAVX2SealHash - -// ---------------------------------------------------------------------------- -// Special optimization for the last 384 bytes of ciphertext + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + 
VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + DECQ CX + JG sealAVX2Tail256LoopA + DECQ R9 + JGE sealAVX2Tail256LoopB + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD 32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD Y7, Y4, Y4 + VPADDD Y11, Y1, Y1 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPERM2I128 $0x02, Y12, Y4, Y7 + VPERM2I128 $0x13, Y0, Y14, Y11 + VPERM2I128 $0x13, Y12, Y4, Y15 + VPXOR (SI), Y3, Y3 + VPXOR 32(SI), Y7, Y7 + VPXOR 64(SI), Y11, Y11 + VPXOR 96(SI), Y15, Y15 + VMOVDQU Y3, (DI) + VMOVDQU Y7, 32(DI) + VMOVDQU Y11, 64(DI) + VMOVDQU Y15, 96(DI) + MOVQ $0x00000080, CX + LEAQ 128(SI), SI + SUBQ $0x80, BX + VPERM2I128 $0x02, Y5, Y9, Y0 + VPERM2I128 $0x02, Y13, Y1, Y14 + VPERM2I128 $0x13, Y5, Y9, Y12 + VPERM2I128 $0x13, Y13, Y1, Y4 + JMP sealAVX2SealHash + sealAVX2Tail384: - // Need to decrypt up to 384 bytes - prepare two blocks - // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed - // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2 - VMOVDQA DD0, TT1; VMOVDQA DD1, TT2; VMOVDQA DD2, TT3 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VMOVDQA Y4, Y7 + VMOVDQA Y1, Y11 + VMOVDQA Y2, Y15 sealAVX2Tail384LoopA: - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 
+ ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealAVX2Tail384LoopB: - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - polyAdd(0(oup)) - polyMul - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2 - chachaQR_AVX2(AA0, BB0, CC0, DD0, TT0); chachaQR_AVX2(AA1, BB1, CC1, DD1, TT0); chachaQR_AVX2(AA2, BB2, CC2, DD2, TT0) - polyAdd(16(oup)) - polyMul - LEAQ 32(oup), oup - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2 - DECQ itr1 - JG sealAVX2Tail384LoopA - DECQ itr2 - JGE sealAVX2Tail384LoopB - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2 - VPADDD TT1, DD0, DD0; VPADDD TT2, DD1, DD1; VPADDD TT3, DD2, DD2 - VPERM2I128 $0x02, AA0, BB0, TT0 - VPERM2I128 $0x02, CC0, DD0, TT1 - VPERM2I128 $0x13, AA0, BB0, TT2 - VPERM2I128 $0x13, CC0, DD0, TT3 - VPXOR (0*32)(inp), TT0, TT0; VPXOR (1*32)(inp), TT1, TT1; VPXOR (2*32)(inp), TT2, TT2; VPXOR (3*32)(inp), TT3, TT3 - VMOVDQU TT0, (0*32)(oup); VMOVDQU TT1, (1*32)(oup); VMOVDQU TT2, (2*32)(oup); VMOVDQU TT3, (3*32)(oup) - VPERM2I128 $0x02, AA1, BB1, TT0 - VPERM2I128 $0x02, CC1, DD1, TT1 - VPERM2I128 $0x13, AA1, BB1, TT2 - VPERM2I128 $0x13, CC1, DD1, TT3 - VPXOR (4*32)(inp), TT0, TT0; VPXOR (5*32)(inp), TT1, TT1; VPXOR (6*32)(inp), TT2, TT2; VPXOR (7*32)(inp), TT3, TT3 - VMOVDQU TT0, (4*32)(oup); VMOVDQU TT1, (5*32)(oup); VMOVDQU TT2, (6*32)(oup); VMOVDQU TT3, (7*32)(oup) - MOVQ $256, itr1 - LEAQ 256(inp), inp - SUBQ $256, inl - VPERM2I128 $0x02, AA2, BB2, AA0 - VPERM2I128 $0x02, CC2, DD2, BB0 - VPERM2I128 $0x13, AA2, BB2, CC0 - VPERM2I128 $0x13, CC2, DD2, DD0 - - JMP sealAVX2SealHash - -// ---------------------------------------------------------------------------- -// Special optimization for the last 512 bytes of ciphertext + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + 
VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x0c, Y14, Y3 + VPSRLD $0x14, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y14, Y0, Y0 + VPXOR Y0, Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPADDD Y4, Y12, Y12 + VPXOR Y12, Y14, Y14 + VPSLLD $0x07, Y14, Y3 + VPSRLD $0x19, Y14, Y14 + VPXOR Y3, Y14, Y14 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x0c, Y9, Y3 + VPSRLD $0x14, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y9, Y5, Y5 + VPXOR Y5, Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPADDD Y1, Y13, Y13 + VPXOR Y13, Y9, Y9 + VPSLLD $0x07, Y9, Y3 + VPSRLD $0x19, Y9, Y9 + VPXOR Y3, Y9, Y9 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x0c, Y10, Y3 + VPSRLD $0x14, Y10, Y10 + VPXOR Y3, Y10, Y10 + VPADDD Y10, Y6, Y6 + VPXOR Y6, Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPADDD Y2, Y8, Y8 + VPXOR Y8, Y10, Y10 + VPSLLD $0x07, Y10, Y3 + VPSRLD $0x19, Y10, Y10 + VPXOR Y3, Y10, Y10 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + DECQ CX + JG sealAVX2Tail384LoopA + DECQ R9 + JGE sealAVX2Tail384LoopB + VPADDD ·chacha20Constants<>+0(SB), Y0, Y0 + VPADDD ·chacha20Constants<>+0(SB), Y5, Y5 + VPADDD ·chacha20Constants<>+0(SB), Y6, Y6 + VPADDD 
32(BP), Y14, Y14 + VPADDD 32(BP), Y9, Y9 + VPADDD 32(BP), Y10, Y10 + VPADDD 64(BP), Y12, Y12 + VPADDD 64(BP), Y13, Y13 + VPADDD 64(BP), Y8, Y8 + VPADDD Y7, Y4, Y4 + VPADDD Y11, Y1, Y1 + VPADDD Y15, Y2, Y2 + VPERM2I128 $0x02, Y0, Y14, Y3 + VPERM2I128 $0x02, Y12, Y4, Y7 + VPERM2I128 $0x13, Y0, Y14, Y11 + VPERM2I128 $0x13, Y12, Y4, Y15 + VPXOR (SI), Y3, Y3 + VPXOR 32(SI), Y7, Y7 + VPXOR 64(SI), Y11, Y11 + VPXOR 96(SI), Y15, Y15 + VMOVDQU Y3, (DI) + VMOVDQU Y7, 32(DI) + VMOVDQU Y11, 64(DI) + VMOVDQU Y15, 96(DI) + VPERM2I128 $0x02, Y5, Y9, Y3 + VPERM2I128 $0x02, Y13, Y1, Y7 + VPERM2I128 $0x13, Y5, Y9, Y11 + VPERM2I128 $0x13, Y13, Y1, Y15 + VPXOR 128(SI), Y3, Y3 + VPXOR 160(SI), Y7, Y7 + VPXOR 192(SI), Y11, Y11 + VPXOR 224(SI), Y15, Y15 + VMOVDQU Y3, 128(DI) + VMOVDQU Y7, 160(DI) + VMOVDQU Y11, 192(DI) + VMOVDQU Y15, 224(DI) + MOVQ $0x00000100, CX + LEAQ 256(SI), SI + SUBQ $0x00000100, BX + VPERM2I128 $0x02, Y6, Y10, Y0 + VPERM2I128 $0x02, Y8, Y2, Y14 + VPERM2I128 $0x13, Y6, Y10, Y12 + VPERM2I128 $0x13, Y8, Y2, Y4 + JMP sealAVX2SealHash + sealAVX2Tail512: - // Need to decrypt up to 512 bytes - prepare two blocks - // If we got here after the main loop - there are 512 encrypted bytes waiting to be hashed - // If we got here before the main loop - there are 448 encrpyred bytes waiting to be hashed - VMOVDQA ·chacha20Constants<>(SB), AA0; VMOVDQA AA0, AA1; VMOVDQA AA0, AA2; VMOVDQA AA0, AA3 - VMOVDQA state1StoreAVX2, BB0; VMOVDQA BB0, BB1; VMOVDQA BB0, BB2; VMOVDQA BB0, BB3 - VMOVDQA state2StoreAVX2, CC0; VMOVDQA CC0, CC1; VMOVDQA CC0, CC2; VMOVDQA CC0, CC3 - VMOVDQA ctr3StoreAVX2, DD0 - VPADDD ·avx2IncMask<>(SB), DD0, DD0; VPADDD ·avx2IncMask<>(SB), DD0, DD1; VPADDD ·avx2IncMask<>(SB), DD1, DD2; VPADDD ·avx2IncMask<>(SB), DD2, DD3 - VMOVDQA DD0, ctr0StoreAVX2; VMOVDQA DD1, ctr1StoreAVX2; VMOVDQA DD2, ctr2StoreAVX2; VMOVDQA DD3, ctr3StoreAVX2 + VMOVDQA ·chacha20Constants<>+0(SB), Y0 + VMOVDQA Y0, Y5 + VMOVDQA Y0, Y6 + VMOVDQA Y0, Y7 + VMOVDQA 32(BP), Y14 + VMOVDQA Y14, Y9 + VMOVDQA Y14, Y10 + VMOVDQA Y14, Y11 + VMOVDQA 64(BP), Y12 + VMOVDQA Y12, Y13 + VMOVDQA Y12, Y8 + VMOVDQA Y12, Y15 + VMOVDQA 192(BP), Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y4 + VPADDD ·avx2IncMask<>+0(SB), Y4, Y1 + VPADDD ·avx2IncMask<>+0(SB), Y1, Y2 + VPADDD ·avx2IncMask<>+0(SB), Y2, Y3 + VMOVDQA Y4, 96(BP) + VMOVDQA Y1, 128(BP) + VMOVDQA Y2, 160(BP) + VMOVDQA Y3, 192(BP) sealAVX2Tail512LoopA: - polyAdd(0(oup)) - polyMul - LEAQ 16(oup), oup + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), AX + MOVQ AX, R15 + MULQ R10 + MOVQ AX, R13 + MOVQ DX, R14 + MOVQ (BP), AX + MULQ R11 + IMULQ R12, R15 + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), AX + MOVQ AX, R8 + MULQ R10 + ADDQ AX, R14 + ADCQ $0x00, DX + MOVQ DX, R10 + MOVQ 8(BP), AX + MULQ R11 + ADDQ AX, R15 + ADCQ $0x00, DX + IMULQ R12, R8 + ADDQ R10, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 16(DI), DI sealAVX2Tail512LoopB: - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR 
CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - polyAdd(0*8(oup)) - polyMulAVX2 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPALIGNR $4, BB0, BB0, BB0; VPALIGNR $4, BB1, BB1, BB1; VPALIGNR $4, BB2, BB2, BB2; VPALIGNR $4, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $12, DD0, DD0, DD0; VPALIGNR $12, DD1, DD1, DD1; VPALIGNR $12, DD2, DD2, DD2; VPALIGNR $12, DD3, DD3, DD3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol16<>(SB), DD0, DD0; VPSHUFB ·rol16<>(SB), DD1, DD1; VPSHUFB ·rol16<>(SB), DD2, DD2; VPSHUFB ·rol16<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - polyAdd(2*8(oup)) - polyMulAVX2 - LEAQ (4*8)(oup), oup - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $12, BB0, CC3; VPSRLD $20, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $12, BB1, CC3; VPSRLD $20, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $12, BB2, CC3; VPSRLD $20, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $12, BB3, CC3; VPSRLD $20, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPADDD BB0, AA0, AA0; VPADDD BB1, AA1, AA1; VPADDD BB2, AA2, AA2; VPADDD BB3, AA3, AA3 - VPXOR AA0, DD0, DD0; VPXOR AA1, DD1, DD1; VPXOR AA2, DD2, DD2; VPXOR AA3, DD3, DD3 - VPSHUFB ·rol8<>(SB), DD0, DD0; VPSHUFB ·rol8<>(SB), DD1, DD1; VPSHUFB ·rol8<>(SB), DD2, DD2; VPSHUFB ·rol8<>(SB), DD3, DD3 - VPADDD DD0, CC0, CC0; VPADDD DD1, CC1, CC1; VPADDD DD2, CC2, CC2; VPADDD DD3, CC3, CC3 - VPXOR CC0, BB0, BB0; VPXOR CC1, BB1, BB1; VPXOR CC2, BB2, BB2; VPXOR CC3, BB3, BB3 - VMOVDQA CC3, tmpStoreAVX2 - VPSLLD $7, BB0, CC3; VPSRLD $25, BB0, BB0; VPXOR CC3, BB0, BB0 - VPSLLD $7, BB1, CC3; VPSRLD $25, BB1, BB1; VPXOR CC3, BB1, BB1 - VPSLLD $7, BB2, CC3; VPSRLD $25, BB2, BB2; VPXOR CC3, BB2, BB2 - VPSLLD $7, BB3, CC3; VPSRLD $25, BB3, BB3; VPXOR CC3, BB3, BB3 - VMOVDQA tmpStoreAVX2, CC3 - VPALIGNR $12, BB0, BB0, BB0; VPALIGNR $12, BB1, BB1, BB1; VPALIGNR $12, BB2, BB2, BB2; VPALIGNR $12, BB3, BB3, BB3 - VPALIGNR $8, CC0, CC0, CC0; VPALIGNR $8, CC1, CC1, CC1; VPALIGNR $8, CC2, CC2, CC2; VPALIGNR $8, CC3, CC3, CC3 - VPALIGNR $4, DD0, DD0, DD0; VPALIGNR $4, DD1, DD1, DD1; VPALIGNR $4, DD2, DD2, DD2; VPALIGNR $4, DD3, DD3, DD3 - - DECQ itr1 - JG 
sealAVX2Tail512LoopA - DECQ itr2 - JGE sealAVX2Tail512LoopB - - VPADDD ·chacha20Constants<>(SB), AA0, AA0; VPADDD ·chacha20Constants<>(SB), AA1, AA1; VPADDD ·chacha20Constants<>(SB), AA2, AA2; VPADDD ·chacha20Constants<>(SB), AA3, AA3 - VPADDD state1StoreAVX2, BB0, BB0; VPADDD state1StoreAVX2, BB1, BB1; VPADDD state1StoreAVX2, BB2, BB2; VPADDD state1StoreAVX2, BB3, BB3 - VPADDD state2StoreAVX2, CC0, CC0; VPADDD state2StoreAVX2, CC1, CC1; VPADDD state2StoreAVX2, CC2, CC2; VPADDD state2StoreAVX2, CC3, CC3 - VPADDD ctr0StoreAVX2, DD0, DD0; VPADDD ctr1StoreAVX2, DD1, DD1; VPADDD ctr2StoreAVX2, DD2, DD2; VPADDD ctr3StoreAVX2, DD3, DD3 - VMOVDQA CC3, tmpStoreAVX2 - VPERM2I128 $0x02, AA0, BB0, CC3 - VPXOR (0*32)(inp), CC3, CC3 - VMOVDQU CC3, (0*32)(oup) - VPERM2I128 $0x02, CC0, DD0, CC3 - VPXOR (1*32)(inp), CC3, CC3 - VMOVDQU CC3, (1*32)(oup) - VPERM2I128 $0x13, AA0, BB0, CC3 - VPXOR (2*32)(inp), CC3, CC3 - VMOVDQU CC3, (2*32)(oup) - VPERM2I128 $0x13, CC0, DD0, CC3 - VPXOR (3*32)(inp), CC3, CC3 - VMOVDQU CC3, (3*32)(oup) - - VPERM2I128 $0x02, AA1, BB1, AA0 - VPERM2I128 $0x02, CC1, DD1, BB0 - VPERM2I128 $0x13, AA1, BB1, CC0 - VPERM2I128 $0x13, CC1, DD1, DD0 - VPXOR (4*32)(inp), AA0, AA0; VPXOR (5*32)(inp), BB0, BB0; VPXOR (6*32)(inp), CC0, CC0; VPXOR (7*32)(inp), DD0, DD0 - VMOVDQU AA0, (4*32)(oup); VMOVDQU BB0, (5*32)(oup); VMOVDQU CC0, (6*32)(oup); VMOVDQU DD0, (7*32)(oup) - - VPERM2I128 $0x02, AA2, BB2, AA0 - VPERM2I128 $0x02, CC2, DD2, BB0 - VPERM2I128 $0x13, AA2, BB2, CC0 - VPERM2I128 $0x13, CC2, DD2, DD0 - VPXOR (8*32)(inp), AA0, AA0; VPXOR (9*32)(inp), BB0, BB0; VPXOR (10*32)(inp), CC0, CC0; VPXOR (11*32)(inp), DD0, DD0 - VMOVDQU AA0, (8*32)(oup); VMOVDQU BB0, (9*32)(oup); VMOVDQU CC0, (10*32)(oup); VMOVDQU DD0, (11*32)(oup) - - MOVQ $384, itr1 - LEAQ 384(inp), inp - SUBQ $384, inl - VPERM2I128 $0x02, AA3, BB3, AA0 - VPERM2I128 $0x02, tmpStoreAVX2, DD3, BB0 - VPERM2I128 $0x13, AA3, BB3, CC0 - VPERM2I128 $0x13, tmpStoreAVX2, DD3, DD0 - - JMP sealAVX2SealHash + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + ADDQ (DI), R10 + ADCQ 8(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + 
VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x04, Y14, Y14, Y14 + VPALIGNR $0x04, Y9, Y9, Y9 + VPALIGNR $0x04, Y10, Y10, Y10 + VPALIGNR $0x04, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x0c, Y4, Y4, Y4 + VPALIGNR $0x0c, Y1, Y1, Y1 + VPALIGNR $0x0c, Y2, Y2, Y2 + VPALIGNR $0x0c, Y3, Y3, Y3 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol16<>+0(SB), Y4, Y4 + VPSHUFB ·rol16<>+0(SB), Y1, Y1 + VPSHUFB ·rol16<>+0(SB), Y2, Y2 + VPSHUFB ·rol16<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + ADDQ 16(DI), R10 + ADCQ 24(DI), R11 + ADCQ $0x01, R12 + MOVQ (BP), DX + MOVQ DX, R15 + MULXQ R10, R13, R14 + IMULQ R12, R15 + MULXQ R11, AX, DX + ADDQ AX, R14 + ADCQ DX, R15 + MOVQ 8(BP), DX + MULXQ R10, R10, AX + ADDQ R10, R14 + MULXQ R11, R11, R8 + ADCQ R11, R15 + ADCQ $0x00, R8 + IMULQ R12, DX + ADDQ AX, R15 + ADCQ DX, R8 + MOVQ R13, R10 + MOVQ R14, R11 + MOVQ R15, R12 + ANDQ $0x03, R12 + MOVQ R15, R13 + ANDQ $-4, R13 + MOVQ R8, R14 + SHRQ $0x02, R8, R15 + SHRQ $0x02, R8 + ADDQ R13, R10 + ADCQ R14, R11 + ADCQ $0x00, R12 + ADDQ R15, R10 + ADCQ R8, R11 + ADCQ $0x00, R12 + LEAQ 32(DI), DI + VMOVDQA Y15, 224(BP) + VPSLLD $0x0c, Y14, Y15 + VPSRLD $0x14, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x0c, Y9, Y15 + VPSRLD $0x14, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x0c, Y10, Y15 + VPSRLD $0x14, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x0c, Y11, Y15 + VPSRLD $0x14, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPADDD Y14, Y0, Y0 + VPADDD Y9, Y5, Y5 + VPADDD Y10, Y6, Y6 + VPADDD Y11, Y7, Y7 + VPXOR Y0, Y4, Y4 + VPXOR Y5, Y1, Y1 + VPXOR Y6, Y2, Y2 + VPXOR Y7, Y3, Y3 + VPSHUFB ·rol8<>+0(SB), Y4, Y4 + VPSHUFB ·rol8<>+0(SB), Y1, Y1 + VPSHUFB ·rol8<>+0(SB), Y2, Y2 + VPSHUFB ·rol8<>+0(SB), Y3, Y3 + VPADDD Y4, Y12, Y12 + VPADDD Y1, Y13, Y13 + VPADDD Y2, Y8, Y8 + VPADDD Y3, Y15, Y15 + VPXOR Y12, Y14, Y14 + VPXOR Y13, Y9, Y9 + VPXOR Y8, Y10, Y10 + VPXOR Y15, Y11, Y11 + VMOVDQA Y15, 224(BP) + VPSLLD $0x07, Y14, Y15 + VPSRLD $0x19, Y14, Y14 + VPXOR Y15, Y14, Y14 + VPSLLD $0x07, Y9, Y15 + VPSRLD $0x19, Y9, Y9 + VPXOR Y15, Y9, Y9 + VPSLLD $0x07, Y10, Y15 + VPSRLD $0x19, Y10, Y10 + VPXOR Y15, Y10, Y10 + VPSLLD $0x07, Y11, Y15 + VPSRLD $0x19, Y11, Y11 + VPXOR Y15, Y11, Y11 + VMOVDQA 224(BP), Y15 + VPALIGNR $0x0c, Y14, Y14, Y14 + VPALIGNR $0x0c, Y9, Y9, Y9 + VPALIGNR $0x0c, Y10, Y10, Y10 + VPALIGNR $0x0c, Y11, Y11, Y11 + VPALIGNR $0x08, Y12, Y12, Y12 + VPALIGNR $0x08, Y13, Y13, Y13 + VPALIGNR $0x08, Y8, Y8, Y8 + VPALIGNR $0x08, Y15, Y15, Y15 + VPALIGNR $0x04, Y4, Y4, Y4 + VPALIGNR $0x04, Y1, Y1, Y1 + VPALIGNR $0x04, Y2, Y2, Y2 + VPALIGNR $0x04, Y3, Y3, Y3 + DECQ CX + JG sealAVX2Tail512LoopA + DECQ R9 + JGE 
sealAVX2Tail512LoopB
+	VPADDD ·chacha20Constants<>+0(SB), Y0, Y0
+	VPADDD ·chacha20Constants<>+0(SB), Y5, Y5
+	VPADDD ·chacha20Constants<>+0(SB), Y6, Y6
+	VPADDD ·chacha20Constants<>+0(SB), Y7, Y7
+	VPADDD 32(BP), Y14, Y14
+	VPADDD 32(BP), Y9, Y9
+	VPADDD 32(BP), Y10, Y10
+	VPADDD 32(BP), Y11, Y11
+	VPADDD 64(BP), Y12, Y12
+	VPADDD 64(BP), Y13, Y13
+	VPADDD 64(BP), Y8, Y8
+	VPADDD 64(BP), Y15, Y15
+	VPADDD 96(BP), Y4, Y4
+	VPADDD 128(BP), Y1, Y1
+	VPADDD 160(BP), Y2, Y2
+	VPADDD 192(BP), Y3, Y3
+	VMOVDQA Y15, 224(BP)
+	VPERM2I128 $0x02, Y0, Y14, Y15
+	VPXOR (SI), Y15, Y15
+	VMOVDQU Y15, (DI)
+	VPERM2I128 $0x02, Y12, Y4, Y15
+	VPXOR 32(SI), Y15, Y15
+	VMOVDQU Y15, 32(DI)
+	VPERM2I128 $0x13, Y0, Y14, Y15
+	VPXOR 64(SI), Y15, Y15
+	VMOVDQU Y15, 64(DI)
+	VPERM2I128 $0x13, Y12, Y4, Y15
+	VPXOR 96(SI), Y15, Y15
+	VMOVDQU Y15, 96(DI)
+	VPERM2I128 $0x02, Y5, Y9, Y0
+	VPERM2I128 $0x02, Y13, Y1, Y14
+	VPERM2I128 $0x13, Y5, Y9, Y12
+	VPERM2I128 $0x13, Y13, Y1, Y4
+	VPXOR 128(SI), Y0, Y0
+	VPXOR 160(SI), Y14, Y14
+	VPXOR 192(SI), Y12, Y12
+	VPXOR 224(SI), Y4, Y4
+	VMOVDQU Y0, 128(DI)
+	VMOVDQU Y14, 160(DI)
+	VMOVDQU Y12, 192(DI)
+	VMOVDQU Y4, 224(DI)
+	VPERM2I128 $0x02, Y6, Y10, Y0
+	VPERM2I128 $0x02, Y8, Y2, Y14
+	VPERM2I128 $0x13, Y6, Y10, Y12
+	VPERM2I128 $0x13, Y8, Y2, Y4
+	VPXOR 256(SI), Y0, Y0
+	VPXOR 288(SI), Y14, Y14
+	VPXOR 320(SI), Y12, Y12
+	VPXOR 352(SI), Y4, Y4
+	VMOVDQU Y0, 256(DI)
+	VMOVDQU Y14, 288(DI)
+	VMOVDQU Y12, 320(DI)
+	VMOVDQU Y4, 352(DI)
+	MOVQ $0x00000180, CX
+	LEAQ 384(SI), SI
+	SUBQ $0x00000180, BX
+	VPERM2I128 $0x02, Y7, Y11, Y0
+	VPERM2I128 $0x02, 224(BP), Y3, Y14
+	VPERM2I128 $0x13, Y7, Y11, Y12
+	VPERM2I128 $0x13, 224(BP), Y3, Y4
+	JMP  sealAVX2SealHash
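Both the AVX2 seal path above and the regenerated sum_amd64.s in the next hunks open-code the same Poly1305 block step: each 16-byte block is folded into the accumulator as h = (h + m + 2^128) * r mod 2^130 - 5. A minimal Go sketch of that step, mirroring the shape of the package's generic fallback (polyStep, uint128, and the helper names are illustrative, not the vendored API):

package sketch

import "math/bits"

type uint128 struct{ lo, hi uint64 }

func mul64(a, b uint64) uint128 {
	hi, lo := bits.Mul64(a, b)
	return uint128{lo, hi}
}

func add128(a, b uint128) uint128 {
	lo, c := bits.Add64(a.lo, b.lo, 0)
	hi, _ := bits.Add64(a.hi, b.hi, c)
	return uint128{lo, hi}
}

func shiftRightBy2(a uint128) uint128 {
	a.lo = a.lo>>2 | (a.hi&3)<<62
	a.hi >>= 2
	return a
}

// polyStep folds one full 16-byte block (m0, m1) into the Poly1305
// accumulator (h0, h1, h2): h = (h + m + 2^128) * r mod 2^130 - 5.
// r0, r1 hold the clamped key half, so every product involving the
// few live bits of h2 fits in a single 64-bit word.
func polyStep(h0, h1, h2, r0, r1, m0, m1 uint64) (uint64, uint64, uint64) {
	// h += m, plus the 2^128 pad bit (the ADDQ/ADCQ/ADCQ $0x01 runs
	// in the generated code).
	var c uint64
	h0, c = bits.Add64(h0, m0, 0)
	h1, c = bits.Add64(h1, m1, c)
	h2 += c + 1

	// Schoolbook multiply of h by r into four 64-bit limbs t0..t3.
	h0r0, h1r0 := mul64(h0, r0), mul64(h1, r0)
	h0r1, h1r1 := mul64(h0, r1), mul64(h1, r1)
	mm1 := add128(h1r0, h0r1)
	mm2 := add128(uint128{h2 * r0, 0}, h1r1) // h2*r0 fits in 64 bits

	t0 := h0r0.lo
	t1, c := bits.Add64(mm1.lo, h0r0.hi, 0)
	t2, c := bits.Add64(mm2.lo, mm1.hi, c)
	t3, _ := bits.Add64(h2*r1, mm2.hi, c)

	// Reduce mod 2^130 - 5. cc holds 4k, where k is the overflow
	// above 2^130, so adding cc + cc>>2 adds the required 5k: this is
	// the ANDQ $0x03 / ANDQ $-4 / SHRQ $0x02 sequence in the assembly.
	h0, h1, h2 = t0, t1, t2&3
	cc := uint128{t2 &^ 3, t3}
	h0, c = bits.Add64(h0, cc.lo, 0)
	h1, c = bits.Add64(h1, cc.hi, c)
	h2 += c
	cc = shiftRightBy2(cc)
	h0, c = bits.Add64(h0, cc.lo, 0)
	h1, c = bits.Add64(h1, cc.hi, c)
	h2 += c
	return h0, h1, h2
}

The reduction is lazy: h is only brought fully below 2^130 - 5 at tag time, which is why two carry passes per block suffice here.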
diff --git a/src/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go b/src/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go
index 333da285b32a34..bd896bdc76d1ca 100644
--- a/src/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go
+++ b/src/vendor/golang.org/x/crypto/internal/poly1305/mac_noasm.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build (!amd64 && !ppc64le && !s390x) || !gc || purego
+//go:build (!amd64 && !ppc64le && !ppc64 && !s390x) || !gc || purego
 
 package poly1305
 
diff --git a/src/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s b/src/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s
index e0d3c64756692b..133757384b787f 100644
--- a/src/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s
+++ b/src/vendor/golang.org/x/crypto/internal/poly1305/sum_amd64.s
@@ -1,108 +1,93 @@
-// Copyright 2012 The Go Authors. All rights reserved.
-// Use of this source code is governed by a BSD-style
-// license that can be found in the LICENSE file.
+// Code generated by command: go run sum_amd64_asm.go -out ../sum_amd64.s -pkg poly1305. DO NOT EDIT.
 
 //go:build gc && !purego
 
-#include "textflag.h"
-
-#define POLY1305_ADD(msg, h0, h1, h2) \
-	ADDQ 0(msg), h0; \
-	ADCQ 8(msg), h1; \
-	ADCQ $1, h2; \
-	LEAQ 16(msg), msg
-
-#define POLY1305_MUL(h0, h1, h2, r0, r1, t0, t1, t2, t3) \
-	MOVQ r0, AX; \
-	MULQ h0; \
-	MOVQ AX, t0; \
-	MOVQ DX, t1; \
-	MOVQ r0, AX; \
-	MULQ h1; \
-	ADDQ AX, t1; \
-	ADCQ $0, DX; \
-	MOVQ r0, t2; \
-	IMULQ h2, t2; \
-	ADDQ DX, t2; \
-	\
-	MOVQ r1, AX; \
-	MULQ h0; \
-	ADDQ AX, t1; \
-	ADCQ $0, DX; \
-	MOVQ DX, h0; \
-	MOVQ r1, t3; \
-	IMULQ h2, t3; \
-	MOVQ r1, AX; \
-	MULQ h1; \
-	ADDQ AX, t2; \
-	ADCQ DX, t3; \
-	ADDQ h0, t2; \
-	ADCQ $0, t3; \
-	\
-	MOVQ t0, h0; \
-	MOVQ t1, h1; \
-	MOVQ t2, h2; \
-	ANDQ $3, h2; \
-	MOVQ t2, t0; \
-	ANDQ $0xFFFFFFFFFFFFFFFC, t0; \
-	ADDQ t0, h0; \
-	ADCQ t3, h1; \
-	ADCQ $0, h2; \
-	SHRQ $2, t3, t2; \
-	SHRQ $2, t3; \
-	ADDQ t2, h0; \
-	ADCQ t3, h1; \
-	ADCQ $0, h2
-
-// func update(state *[7]uint64, msg []byte)
+// func update(state *macState, msg []byte)
 TEXT ·update(SB), $0-32
 	MOVQ state+0(FP), DI
 	MOVQ msg_base+8(FP), SI
 	MOVQ msg_len+16(FP), R15
-
-	MOVQ 0(DI), R8   // h0
-	MOVQ 8(DI), R9   // h1
-	MOVQ 16(DI), R10 // h2
-	MOVQ 24(DI), R11 // r0
-	MOVQ 32(DI), R12 // r1
-
-	CMPQ R15, $16
+	MOVQ (DI), R8
+	MOVQ 8(DI), R9
+	MOVQ 16(DI), R10
+	MOVQ 24(DI), R11
+	MOVQ 32(DI), R12
+	CMPQ R15, $0x10
 	JB   bytes_between_0_and_15
 
 loop:
-	POLY1305_ADD(SI, R8, R9, R10)
+	ADDQ (SI), R8
+	ADCQ 8(SI), R9
+	ADCQ $0x01, R10
+	LEAQ 16(SI), SI
 
 multiply:
-	POLY1305_MUL(R8, R9, R10, R11, R12, BX, CX, R13, R14)
-	SUBQ $16, R15
-	CMPQ R15, $16
-	JAE  loop
+	MOVQ  R11, AX
+	MULQ  R8
+	MOVQ  AX, BX
+	MOVQ  DX, CX
+	MOVQ  R11, AX
+	MULQ  R9
+	ADDQ  AX, CX
+	ADCQ  $0x00, DX
+	MOVQ  R11, R13
+	IMULQ R10, R13
+	ADDQ  DX, R13
+	MOVQ  R12, AX
+	MULQ  R8
+	ADDQ  AX, CX
+	ADCQ  $0x00, DX
+	MOVQ  DX, R8
+	MOVQ  R12, R14
+	IMULQ R10, R14
+	MOVQ  R12, AX
+	MULQ  R9
+	ADDQ  AX, R13
+	ADCQ  DX, R14
+	ADDQ  R8, R13
+	ADCQ  $0x00, R14
+	MOVQ  BX, R8
+	MOVQ  CX, R9
+	MOVQ  R13, R10
+	ANDQ  $0x03, R10
+	MOVQ  R13, BX
+	ANDQ  $-4, BX
+	ADDQ  BX, R8
+	ADCQ  R14, R9
+	ADCQ  $0x00, R10
+	SHRQ  $0x02, R14, R13
+	SHRQ  $0x02, R14
+	ADDQ  R13, R8
+	ADCQ  R14, R9
+	ADCQ  $0x00, R10
+	SUBQ  $0x10, R15
+	CMPQ  R15, $0x10
+	JAE   loop
 
 bytes_between_0_and_15:
 	TESTQ R15, R15
 	JZ    done
-	MOVQ  $1, BX
+	MOVQ  $0x00000001, BX
 	XORQ  CX, CX
 	XORQ  R13, R13
 	ADDQ  R15, SI
 
 flush_buffer:
-	SHLQ $8, BX, CX
-	SHLQ $8, BX
+	SHLQ $0x08, BX, CX
+	SHLQ $0x08, BX
 	MOVB -1(SI), R13
 	XORQ R13, BX
 	DECQ SI
 	DECQ R15
 	JNZ  flush_buffer
-
 	ADDQ BX, R8
 	ADCQ CX, R9
-	ADCQ $0, R10
-	MOVQ $16, R15
+	ADCQ $0x00, R10
+	MOVQ $0x00000010, R15
 	JMP  multiply
 
 done:
-	MOVQ R8, 0(DI)
+	MOVQ R8, (DI)
 	MOVQ R9, 8(DI)
 	MOVQ R10, 16(DI)
 	RET
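The ppc64 port in the next hunks hinges on one idea: Poly1305 consumes its message as little-endian 64-bit limbs regardless of host byte order, so big-endian ppc64 needs byte-reversing loads (MOVDBR) where ppc64le can use plain MOVD. That is all the new LE_MOVD family of macros selects. In Go terms, a sketch of what those loads guarantee (loadLimbs is an illustrative name, not the vendored API):

package sketch

import "encoding/binary"

// loadLimbs does in Go what the LE_MOVD loads in POLY1305_ADD do in
// assembly: read the two 64-bit message limbs little-endian on any
// host, whether via a plain load (ppc64le) or a byte-reversing one
// (big-endian ppc64).
func loadLimbs(msg []byte) (t0, t1 uint64) {
	t0 = binary.LittleEndian.Uint64(msg[0:8])
	t1 = binary.LittleEndian.Uint64(msg[8:16])
	return
}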
diff --git a/src/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.go b/src/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64x.go
similarity index 95%
rename from src/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.go
rename to src/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64x.go
index 4aec4874b507e8..1a1679aaad9c47 100644
--- a/src/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.go
+++ b/src/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64x.go
@@ -2,7 +2,7 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build gc && !purego
+//go:build gc && !purego && (ppc64 || ppc64le)
 
 package poly1305
 
diff --git a/src/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.s b/src/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64x.s
similarity index 89%
rename from src/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.s
rename to src/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64x.s
index b3c1699bff51a2..6899a1dabc0b3e 100644
--- a/src/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64le.s
+++ b/src/vendor/golang.org/x/crypto/internal/poly1305/sum_ppc64x.s
@@ -2,15 +2,25 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-//go:build gc && !purego
+//go:build gc && !purego && (ppc64 || ppc64le)
 
 #include "textflag.h"
 
 // This was ported from the amd64 implementation.
 
+#ifdef GOARCH_ppc64le
+#define LE_MOVD MOVD
+#define LE_MOVWZ MOVWZ
+#define LE_MOVHZ MOVHZ
+#else
+#define LE_MOVD MOVDBR
+#define LE_MOVWZ MOVWBR
+#define LE_MOVHZ MOVHBR
+#endif
+
 #define POLY1305_ADD(msg, h0, h1, h2, t0, t1, t2) \
-	MOVD (msg), t0; \
-	MOVD 8(msg), t1; \
+	LE_MOVD (msg)( R0), t0; \
+	LE_MOVD (msg)(R24), t1; \
 	MOVD $1, t2; \
 	ADDC t0, h0, h0; \
 	ADDE t1, h1, h1; \
@@ -50,10 +60,6 @@
 	ADDE t3, h1, h1; \
 	ADDZE h2
 
-DATA ·poly1305Mask<>+0x00(SB)/8, $0x0FFFFFFC0FFFFFFF
-DATA ·poly1305Mask<>+0x08(SB)/8, $0x0FFFFFFC0FFFFFFC
-GLOBL ·poly1305Mask<>(SB), RODATA, $16
-
 // func update(state *[7]uint64, msg []byte)
 TEXT ·update(SB), $0-32
 	MOVD state+0(FP), R3
@@ -66,6 +72,8 @@ TEXT ·update(SB), $0-32
 	MOVD 24(R3), R11 // r0
 	MOVD 32(R3), R12 // r1
 
+	MOVD $8, R24
+
 	CMP R5, $16
 	BLT bytes_between_0_and_15
 
@@ -94,7 +102,7 @@ flush_buffer:
 
 	// Greater than 8 -- load the rightmost remaining bytes in msg
 	// and put into R17 (h1)
-	MOVD (R4)(R21), R17
+	LE_MOVD (R4)(R21), R17
 	MOVD $16, R22
 
 	// Find the offset to those bytes
@@ -118,7 +126,7 @@ just1:
 	BLT less8
 
 	// Exactly 8
-	MOVD (R4), R16
+	LE_MOVD (R4), R16
 
 	CMP R17, $0
 
@@ -133,7 +141,7 @@ less8:
 	MOVD $0, R22 // shift count
 	CMP R5, $4
 	BLT less4
-	MOVWZ (R4), R16
+	LE_MOVWZ (R4), R16
 	ADD $4, R4
 	ADD $-4, R5
 	MOVD $32, R22
@@ -141,7 +149,7 @@ less8:
 less4:
 	CMP R5, $2
 	BLT less2
-	MOVHZ (R4), R21
+	LE_MOVHZ (R4), R21
 	SLD R22, R21, R21
 	OR R16, R21, R16
 	ADD $16, R22
diff --git a/src/vendor/golang.org/x/net/route/address.go b/src/vendor/golang.org/x/net/route/address.go
index 5443d672236c2a..bae63003f04da3 100644
--- a/src/vendor/golang.org/x/net/route/address.go
+++ b/src/vendor/golang.org/x/net/route/address.go
@@ -170,20 +170,37 @@ func (a *Inet6Addr) marshal(b []byte) (int, error) {
 
 // parseInetAddr parses b as an internet address for IPv4 or IPv6.
 func parseInetAddr(af int, b []byte) (Addr, error) {
+	const (
+		off4 = 4 // offset of in_addr
+		off6 = 8 // offset of in6_addr
+	)
 	switch af {
 	case syscall.AF_INET:
-		if len(b) < sizeofSockaddrInet {
+		if len(b) < (off4+1) || len(b) < int(b[0]) {
 			return nil, errInvalidAddr
 		}
+		sockAddrLen := int(b[0])
 		a := &Inet4Addr{}
-		copy(a.IP[:], b[4:8])
+		n := off4 + 4
+		if sockAddrLen < n {
+			n = sockAddrLen
+		}
+		copy(a.IP[:], b[off4:n])
 		return a, nil
 	case syscall.AF_INET6:
-		if len(b) < sizeofSockaddrInet6 {
+		if len(b) < (off6+1) || len(b) < int(b[0]) {
 			return nil, errInvalidAddr
 		}
-		a := &Inet6Addr{ZoneID: int(nativeEndian.Uint32(b[24:28]))}
-		copy(a.IP[:], b[8:24])
+		sockAddrLen := int(b[0])
+		n := off6 + 16
+		if sockAddrLen < n {
+			n = sockAddrLen
+		}
+		a := &Inet6Addr{}
+		if sockAddrLen == sizeofSockaddrInet6 {
+			a.ZoneID = int(nativeEndian.Uint32(b[24:28]))
+		}
+		copy(a.IP[:], b[off6:n])
 		if a.IP[0] == 0xfe && a.IP[1]&0xc0 == 0x80 || a.IP[0] == 0xff && (a.IP[1]&0x0f == 0x01 || a.IP[1]&0x0f == 0x02) {
 			// KAME based IPv6 protocol stack usually
 			// embeds the interface index in the
diff --git a/src/vendor/modules.txt b/src/vendor/modules.txt
index 2ef127ad5a4428..164cdd09c95f72 100644
--- a/src/vendor/modules.txt
+++ b/src/vendor/modules.txt
@@ -1,4 +1,4 @@
-# golang.org/x/crypto v0.25.1-0.20240722173533-bb80217080b0
+# golang.org/x/crypto v0.29.0
 ## explicit; go 1.20
 golang.org/x/crypto/chacha20
 golang.org/x/crypto/chacha20poly1305
@@ -6,7 +6,7 @@ golang.org/x/crypto/cryptobyte
 golang.org/x/crypto/cryptobyte/asn1
 golang.org/x/crypto/internal/alias
 golang.org/x/crypto/internal/poly1305
-# golang.org/x/net v0.27.1-0.20240722181819-765c7e89b3bd
+# golang.org/x/net v0.31.0
 ## explicit; go 1.18
 golang.org/x/net/dns/dnsmessage
 golang.org/x/net/http/httpguts
@@ -19,7 +19,7 @@ golang.org/x/net/route
 # golang.org/x/sys v0.27.0
 ## explicit; go 1.18
 golang.org/x/sys/cpu
-# golang.org/x/text v0.19.0
+# golang.org/x/text v0.20.0
 ## explicit; go 1.18
 golang.org/x/text/secure/bidirule
 golang.org/x/text/transform
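The route/address.go hunk above stops requiring a full-size sockaddr before parsing: it now trusts the structure's own leading length byte (sa_len), presumably to accept the truncated sockaddrs that BSD routing messages can carry with trailing zero bytes trimmed, and it reads the IPv6 zone ID only when the sockaddr is full length, since a shorter sockaddr_in6 has no scope field to read. A compact restatement of the clamping logic (clampAddr is a hypothetical helper, not part of the vendored package; it assumes the caller has already validated len(b) against sa_len, as parseInetAddr does):

package sketch

// clampAddr returns the address bytes of a BSD sockaddr whose first
// byte is its own length (sa_len). off is the offset of the address
// field within the sockaddr and width its full size (4 for in_addr,
// 16 for in6_addr). For a truncated sockaddr only the bytes actually
// present are returned; copying them into a zeroed array leaves the
// remaining address bytes as zeros.
func clampAddr(b []byte, off, width int) []byte {
	n := off + width
	if l := int(b[0]); l < n {
		n = l
	}
	return b[off:n]
}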