// Copyright 2015, Joe Tsai. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE.md file. package xflate import ( "bytes" "encoding/binary" "hash/crc32" "io" "io/ioutil" "math" "github.com/dsnet/compress/internal/errors" "github.com/dsnet/compress/xflate/internal/meta" ) // chunk is a tuple of raw (uncompressed) size and compressed size for a chunk. type chunk struct { csize, rsize int64 typ int } // chunkReader wraps an io.Reader by appending an endBlock marker at the end. // // Attempting to decompress each chunk ad-verbatim with a DEFLATE decompressor // will lead to an io.ErrUnexpectedEOF error since the chunk does not terminate // with a DEFLATE block with the final bit set. The chunkReader appends the // endBlock to the stream so that the decompressor knows when to terminate. // It also tracks the last 4 bytes of the chunk for sync marker verification. type chunkReader struct { rd io.LimitedReader sync uint32 // Last 4 bytes from underlying stream end []byte // Bytes of the endBlock marker } func (cr *chunkReader) Reset(rd io.Reader, n int64) { *cr = chunkReader{rd: io.LimitedReader{R: rd, N: n}} } func (cr *chunkReader) Read(buf []byte) (int, error) { max := func(a, b int) int { if a < b { return b } return a } // Append endBlock marker. if cr.end != nil { n := copy(buf, cr.end) cr.end = cr.end[n:] if len(cr.end) == 0 { return n, io.EOF } return n, nil } // Read from underlying data stream. n, err := cr.rd.Read(buf) for i := max(n-4, 0); i < n; i++ { cr.sync = (cr.sync << 8) | uint32(buf[i]) } if err == io.EOF { cr.end = endBlock[:] err = nil } return n, err } // A Reader is an io.ReadSeeker that can read the XFLATE format. Only the stream // produced by Writer (or some other valid XFLATE stream) can be read by Reader. // Regular DEFLATE streams produced by flate.Writer cannot be read by Reader. type Reader struct { // TODO(dsnet): Export index information somehow. rd io.ReadSeeker mr meta.Reader // Meta decoder used to read the index and footer cr chunkReader // Wraps rd before being passed into zr zr *flateReader // DEFLATE decompressor ri int // Current record number offset int64 // Current raw offset discard int64 // Number of bytes to discard to reach offset idx index // Index table of seekable offsets chk chunk // Information about the current chunk err error // Persistent error // The following fields are embedded here to reduce memory allocations. lr io.LimitedReader br, bw bytes.Buffer idxs []index chunks []chunk } // ReaderConfig configures the Reader. // There are currently no configuration options for Reader. type ReaderConfig struct { _ struct{} // Blank field to prevent unkeyed struct literals } // NewReader creates a new Reader reading the given reader rs. This reader can // only decompress files in the XFLATE format. If the underlying stream is // regular DEFLATE and not XFLATE, then this returns error. // // Regardless of the current offset in rs, this function Seeks to the end of rs // in order to determine the total compressed size. The Reader returned has its // offset set to the start of the stream. // // If conf is nil, then default configuration values are used. Reader copies // all configuration values as necessary and does not store conf. func NewReader(rs io.ReadSeeker, conf *ReaderConfig) (*Reader, error) { xr := new(Reader) err := xr.Reset(rs) return xr, err } // Reset discards the Reader's state and makes it equivalent to the result // of a call to NewReader, but reading from rd instead. This method may return // an error if it is unable to parse the index. // // This is used to reduce memory allocations. func (xr *Reader) Reset(rs io.ReadSeeker) error { *xr = Reader{ rd: rs, mr: xr.mr, zr: xr.zr, idx: xr.idx, br: xr.br, bw: xr.bw, idxs: xr.idxs, chunks: xr.chunks, } if xr.zr == nil { xr.zr, _ = newFlateReader(nil) } xr.idx.Reset() // Read entire index. var backSize, footSize int64 if backSize, footSize, xr.err = xr.decodeFooter(); xr.err != nil { return xr.err } if xr.err = xr.decodeIndexes(backSize); xr.err != nil { return xr.err } if !xr.idx.AppendRecord(footSize, 0, footerType) { xr.err = errCorrupted return xr.err } // Setup initial chunk reader. _, xr.err = xr.Seek(0, io.SeekStart) return xr.err } // Read reads decompressed data from the underlying io.Reader. // This method automatically proceeds to the next chunk when the current one // has been fully read. func (xr *Reader) Read(buf []byte) (int, error) { if xr.err != nil { return 0, xr.err } // Discard some data to reach the expected raw offset. if xr.discard > 0 { var n int64 xr.lr = io.LimitedReader{R: xr.zr, N: xr.discard} n, xr.err = io.Copy(ioutil.Discard, &xr.lr) if xr.err != nil { return 0, xr.err } if n != xr.discard { xr.err = errCorrupted return 0, xr.err } xr.discard = 0 } var cnt int for cnt == 0 && xr.err == nil { cnt, xr.err = xr.zr.Read(buf) xr.offset += int64(cnt) if xr.err == io.EOF { xr.err = nil // Clear io.EOF temporarily // Verify that the compressed section ends with an empty raw block. if xr.chk.typ == deflateType && xr.cr.sync != 0x0000ffff { xr.err = errCorrupted break } // Verify that the compressed and raw sizes match. if xr.chk.typ != footerType { xr.chk.csize += int64(len(endBlock)) // Side of effect of using chunkReader } if xr.chk.csize != xr.zr.InputOffset || xr.chk.rsize != xr.zr.OutputOffset { xr.err = errCorrupted break } // Seek to next chunk. if _, xr.err = xr.Seek(xr.offset, io.SeekStart); xr.err != nil { break } if xr.chk.typ == unknownType { xr.err = io.EOF } } } return cnt, xr.err } // Seek sets the offset for the next Read operation, interpreted according to // the whence value provided. It is permitted to seek to offsets in the middle // of a compressed chunk. The next call to Read will automatically discard some // number of bytes before returning the requested data. func (xr *Reader) Seek(offset int64, whence int) (int64, error) { if xr.err != nil && xr.err != io.EOF { return 0, xr.err } // Determine which position to seek to. var pos int64 end := xr.idx.LastRecord().RawOffset switch whence { case io.SeekStart: pos = offset case io.SeekCurrent: pos = xr.offset + offset case io.SeekEnd: pos = end + offset default: return 0, errorf(errors.Invalid, "invalid whence: %d", whence) } if pos < 0 { return 0, errorf(errors.Invalid, "negative position: %d", pos) } // As an optimization if the new position is within the current chunk, // then just adjust the discard value. discard := pos - xr.offset remain := xr.chk.rsize - xr.zr.OutputOffset if discard > 0 && remain > 0 && discard < remain { xr.offset, xr.discard = pos, discard return pos, nil } // Query the index for the chunk to start decoding from. // Attempt to use the subsequent record before resorting to binary search. prev, curr := xr.idx.GetRecords(xr.ri) if !(prev.RawOffset <= pos && pos <= curr.RawOffset) { xr.ri = xr.idx.Search(pos) prev, curr = xr.idx.GetRecords(xr.ri) } xr.ri++ if xr.ri > len(xr.idx.Records) { xr.ri = len(xr.idx.Records) } // Setup a chunk reader at the given position. xr.chk = chunk{ csize: curr.CompOffset - prev.CompOffset, rsize: curr.RawOffset - prev.RawOffset, typ: curr.Type, } xr.offset, xr.discard = pos, pos-prev.RawOffset if pos > end { // In case pos is really large, only discard data that actually exists. xr.discard = end - prev.RawOffset } _, xr.err = xr.rd.Seek(prev.CompOffset, io.SeekStart) xr.cr.Reset(xr.rd, xr.chk.csize) xr.zr.Reset(&xr.cr) return pos, xr.err } // Close ends the XFLATE stream. func (xr *Reader) Close() error { if xr.err == errClosed { return nil } if xr.err != nil && xr.err != io.EOF { return xr.err } xr.err = errClosed return nil } // decodeIndexes iteratively decodes all of the indexes in the XFLATE stream. // Even if the index is fragmented in the source stream, this method will merge // all of the index fragments into a single index table. func (xr *Reader) decodeIndexes(backSize int64) error { pos, err := xr.rd.Seek(0, io.SeekCurrent) if err != nil { return err } // Read all indexes. var compSize int64 xr.idxs = xr.idxs[:0] for { // Seek backwards past index and compressed blocks. newPos := pos - (backSize + compSize) if newPos < 0 || newPos > pos { return errCorrupted // Integer overflow for new seek position } if pos, err = xr.rd.Seek(newPos, io.SeekStart); err != nil { return err } if backSize == 0 { break } // Read the index. if cap(xr.idxs) > len(xr.idxs) { xr.idxs = xr.idxs[:len(xr.idxs)+1] } else { xr.idxs = append(xr.idxs, index{}) } idx := &xr.idxs[len(xr.idxs)-1] idx.Reset() idx.IndexSize = backSize if err = xr.decodeIndex(idx); err != nil { return err } backSize, compSize = idx.BackSize, idx.LastRecord().CompOffset } if pos != 0 { return errCorrupted } // Compact all indexes into one. for i := len(xr.idxs) - 1; i >= 0; i-- { idx := xr.idxs[i] if !xr.idx.AppendIndex(&idx) { return errCorrupted } if !xr.idx.AppendRecord(idx.IndexSize, 0, indexType) { return errCorrupted } } return nil } // decodeIndex decodes the index from a meta encoded stream. // The current offset must be set to the start of the encoded index and // index.IndexSize must be populated. If successful, the index.Records and // index.BackSize fields will be populated. This method will attempt to reset // the read offset to the start of the index. func (xr *Reader) decodeIndex(idx *index) error { // Helper function to read VLIs. var errVLI error readVLI := func() int64 { x, n := binary.Uvarint(xr.bw.Bytes()) if n <= 0 || x > math.MaxInt64 { errVLI = errCorrupted return 0 } xr.bw.Next(n) return int64(x) } // Read the index and restore the underlying reader offset. xr.br.Reset() xr.lr = io.LimitedReader{R: xr.rd, N: idx.IndexSize} n, err := io.Copy(&xr.br, &xr.lr) if err != nil { return err } if _, err := xr.rd.Seek(-n, io.SeekCurrent); err != nil { return err } // Parse the index. var crc uint32 xr.chunks = xr.chunks[:0] xr.bw.Reset() xr.mr.Reset(&xr.br) if _, err := io.Copy(&xr.bw, &xr.mr); err != nil { return errWrap(err) } if xr.bw.Len() > 4 { crc = crc32.ChecksumIEEE(xr.bw.Bytes()[:xr.bw.Len()-4]) } idx.BackSize = readVLI() numRecs := readVLI() totalCompSize := readVLI() totalRawSize := readVLI() if errVLI != nil { return errVLI } for i := int64(0); i < numRecs; i++ { xr.chunks = append(xr.chunks, chunk{readVLI(), readVLI(), 0}) } if xr.bw.Len() != 4 || binary.LittleEndian.Uint32(xr.bw.Bytes()) != crc { return errCorrupted } if xr.mr.FinalMode != meta.FinalMeta { return errCorrupted } if xr.mr.InputOffset != idx.IndexSize { return errCorrupted } // Convert individual index sizes to be absolute offsets. for _, chk := range xr.chunks { if chk.csize <= 4 { return errCorrupted // Every chunk has a sync marker } if !idx.AppendRecord(chk.csize, chk.rsize, deflateType) { return errCorrupted } } lastRec := idx.LastRecord() if lastRec.CompOffset != totalCompSize || lastRec.RawOffset != totalRawSize { return errCorrupted } return nil } // decodeFooter seeks to the end of the stream, searches for the footer // and decodes it. If successful, it will return the backSize for the preceding // index and the size of the footer itself. This method will attempt to reset // the read offset to the start of the footer. func (xr *Reader) decodeFooter() (backSize, footSize int64, err error) { // Read the last few bytes of the stream. end, err := xr.rd.Seek(0, io.SeekEnd) if err != nil { return 0, 0, err } if end > meta.MaxEncBytes { end = meta.MaxEncBytes } if _, err := xr.rd.Seek(-end, io.SeekEnd); err != nil { return 0, 0, err } xr.br.Reset() if _, err := io.Copy(&xr.br, xr.rd); err != nil { return 0, 0, err } // Search for and read the meta block. idx := meta.ReverseSearch(xr.br.Bytes()) if idx < 0 { return 0, 0, errCorrupted } xr.br.Next(idx) // Skip data until magic marker xr.bw.Reset() xr.mr.Reset(&xr.br) if _, err := io.Copy(&xr.bw, &xr.mr); err != nil { return 0, 0, errWrap(err) } if xr.br.Len() != 0 || xr.mr.NumBlocks != 1 { return 0, 0, errCorrupted } if xr.mr.FinalMode != meta.FinalStream { return 0, 0, errCorrupted } if _, err := xr.rd.Seek(-xr.mr.InputOffset, io.SeekCurrent); err != nil { return 0, 0, err } // Parse the footer. bufRaw := xr.bw.Bytes() if len(bufRaw) < 3 || !bytes.Equal(bufRaw[:3], magic[:]) { return 0, 0, errCorrupted // Magic value mismatch } backSizeU64, cnt := binary.Uvarint(bufRaw[3:]) if cnt <= 0 { return 0, 0, errCorrupted // Integer overflow for VLI } if len(bufRaw[3+cnt:]) > 0 { return 0, 0, errCorrupted // Trailing unread bytes } return int64(backSizeU64), xr.mr.InputOffset, nil }