// Copyright 2015, Joe Tsai. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE.md file. // +build ignore // xflate_stats is used to investigate trade-offs in XFLATE design. // // The XFLATE format extends the DEFLATE format in order to allow for limited // random access reading. This is achieved by individually compressing the // input data in chunks (so that each chunk can be decompressed by itself), // and by storing an index recording the size of every chunk (so that each // chunk can be located). // // However, this adds overhead and diminishes the compression ratio slightly. // In order to investigate the cost of chunking, this program computes the total // compressed size when the input is compressed as a single stream or when // compressed in chunks of some fixed size. // // Chunking diminishes compression ratio because the LZ77 dictionary is reset // for each chunk, reducing the amount of savings that could have been gained // by a backwards match to previous data. Secondly, the XFLATE format requires // that each chunk terminates with a SYNC_FLUSH marker of ~5 bytes. // // In addition to the costs of chunking itself, there is also the cost of // storing the index. The index is a list of (rawSize, compSize) tuples where // each record contains information about the uncompressed input size (rawSize) // and compressed output size (compSize) of each chunk. // // In order to reduce the index size, multiple techniques were explored: // * Compressing the index itself using DEFLATE // * Row-oriented vs. column-oriented layout of records; that is, row-oriented // layout has each (rawSize, compSize) tuple laid out one after the other, // while column-oriented has all rawSizes laid out as an array followed by all // compSizes laid out as an array // * Fixed-length vs. variable-length integers; that is, to store size values // as uint64s or to use some variable-length encoding // * Regular encoding vs. Delta encoding; that is, delta encoding encodes each // value as the difference between that value and the previous value // // However, some of these techniques do not make sense unless done in // conjunction with another technique. For example: // * Row-oriented vs. column-oriented is useless without compression // * Delta encoding does not make sense without variable-length integers // // For this reason, the following table shows what techniques were used for the // given format names listed below: // // +-------------- Raw: The index is uncompressed // | +---------- Row: The index is row-oriented // | | +------ Fix: Index records use fixed-length uint64s // | | | +-- Reg: Index records are not delta encoded // | | | | // Raw Row Fix Reg // RawFix [X] [?] [X] [X] // RawVar [X] [?] [ ] [X] // RawDlt [X] [?] [ ] [ ] // RowFix [ ] [X] [X] [X] // RowVar [ ] [X] [ ] [X] // RowDlt [ ] [X] [ ] [ ] // ColFix [ ] [ ] [X] [X] // ColVar [ ] [ ] [ ] [X] // ColDlt [ ] [ ] [ ] [ ] // // As seen, the Raw* formats do not care about the orientation of the index // since it is not compressed. Also, all *Dlt formats uses delta encoding // in conjunction with variable-length encoding. // // In summary, the greatest factor for overhead cost is the chunk size. // Smaller chunks reduce efficiency due to more frequent LZ77 dictionary resets // and also increases the total number of chunks needed, thus also increasing // the index size. On the other hand, larger chunks also diminishes the // effectiveness of random access reading since more data must be discarded to // start reading from some given offset. Thus, the choice of chunk size is not // hard-coded by the XFLATE format and this tool can help identify the // appropriate chunk size. package main import ( "bytes" "compress/flate" "encoding/binary" "flag" "fmt" "io" "log" "os" "github.com/dsnet/compress/xflate/internal/meta" ) func init() { log.SetFlags(log.Lshortfile) } func main() { inputFile := flag.String("f", "-", "path to input file") chunkSize := flag.Int("n", 64*1024, "compress the input in n-sized chunks") compLevel := flag.Int("l", flate.DefaultCompression, "compression level") metaEncode := flag.Bool("m", false, "encode index using XFLATE meta encoding") flag.Parse() // Open the input file. var r io.Reader if *inputFile != "-" { f, err := os.Open(*inputFile) if err != nil { log.Fatal(err) } defer f.Close() r = f } else { r = os.Stdin } // Compute the streamed and chunked record values. var chnkRec indexRecord strmRec, chnkRecs := computeRecords(r, *compLevel, *chunkSize) for _, r := range chnkRecs { chnkRec.rawSize += r.rawSize chnkRec.compSize += r.compSize } var n int var pr indexRecord // Previous record var brf, brv, brd, bcf, bcv, bcd bytes.Buffer buf := make([]byte, binary.MaxVarintLen64) // Row-based index format. pr = indexRecord{} for _, r := range chnkRecs { binary.LittleEndian.PutUint64(buf, uint64(r.rawSize)) brf.Write(buf[:8]) binary.LittleEndian.PutUint64(buf, uint64(r.compSize)) brf.Write(buf[:8]) n = binary.PutUvarint(buf, uint64(r.rawSize)) brv.Write(buf[:n]) n = binary.PutUvarint(buf, uint64(r.compSize)) brv.Write(buf[:n]) n = binary.PutVarint(buf, r.rawSize-pr.rawSize) brd.Write(buf[:n]) n = binary.PutVarint(buf, r.compSize-pr.compSize) brd.Write(buf[:n]) pr = r } // Column-based index format. pr = indexRecord{} for _, r := range chnkRecs { binary.LittleEndian.PutUint64(buf, uint64(r.rawSize)) bcf.Write(buf[:8]) n = binary.PutUvarint(buf, uint64(r.rawSize)) bcv.Write(buf[:n]) n = binary.PutVarint(buf, r.rawSize-pr.rawSize) bcd.Write(buf[:n]) pr.rawSize = r.rawSize } for _, r := range chnkRecs { binary.LittleEndian.PutUint64(buf, uint64(r.compSize)) bcf.Write(buf[:8]) n = binary.PutUvarint(buf, uint64(r.compSize)) bcv.Write(buf[:n]) n = binary.PutVarint(buf, r.compSize-pr.compSize) bcd.Write(buf[:n]) pr.compSize = r.compSize } pf := func(a, b int64) float64 { return 100.0 * (float64(a) / float64(b)) } me := func(b []byte) []byte { return b } if *metaEncode { me = encode } // Print basic statistics about the input and output. ns, nc := strmRec.compSize, chnkRec.compSize ps, pc := pf(ns-ns, ns), pf(nc-ns, ns) fmt.Printf("rawSize: %d\n", strmRec.rawSize) // Uncompressed input size fmt.Printf("strmRec.compSize: %d (%+0.2f%%)\n", ns, ps) // Total compressed size as a single stream fmt.Printf("chnkRec.compSize: %d (%+0.2f%%)\n", nc, pc) // Total compressed size as individual chunks fmt.Printf("chunkSize: %d\n", *chunkSize) // Individual chunk size fmt.Printf("numChunks: %d\n", len(chnkRecs)) // Total number of chunks fmt.Println() // Uncompressed index formats. nf := int64(len(me(brf.Bytes()))) nv := int64(len(me(brv.Bytes()))) nd := int64(len(me(brd.Bytes()))) fmt.Printf("RawFix: %d (%+0.2f%%)\n", nf, pf(nf, ns)) // Fixed-length uint64s fmt.Printf("RawVar: %d (%+0.2f%%)\n", nv, pf(nv, ns)) // Variable-length integers (VLI) fmt.Printf("RawDlt: %d (%+0.2f%%)\n", nd, pf(nd, ns)) // VLI with delta encoding fmt.Println() // Compressed row-oriented index formats. nrf := int64(len(me(compress(brf.Bytes(), *compLevel)))) nrv := int64(len(me(compress(brv.Bytes(), *compLevel)))) nrd := int64(len(me(compress(brd.Bytes(), *compLevel)))) fmt.Printf("RowFix: %d (%+0.2f%%)\n", nrf, pf(nrf, ns)) // Fixed-length uint64s fmt.Printf("RowVar: %d (%+0.2f%%)\n", nrv, pf(nrv, ns)) // Variable-length integers (VLI) fmt.Printf("RowDlt: %d (%+0.2f%%)\n", nrd, pf(nrd, ns)) // VLI with delta encoding fmt.Println() // Compressed column-oriented index formats. ncf := int64(len(me(compress(bcf.Bytes(), *compLevel)))) ncv := int64(len(me(compress(bcv.Bytes(), *compLevel)))) ncd := int64(len(me(compress(bcd.Bytes(), *compLevel)))) fmt.Printf("ColFix: %d (%+0.2f%%)\n", ncf, pf(ncf, ns)) // Fixed-length uint64s fmt.Printf("ColVar: %d (%+0.2f%%)\n", ncv, pf(ncv, ns)) // Variable-length integers (VLI) fmt.Printf("ColDlt: %d (%+0.2f%%)\n", ncd, pf(ncd, ns)) // VLI with delta encoding fmt.Println() } // countWriter counts and discards all bytes written to it. type countWriter int64 func (cw *countWriter) Write(b []byte) (int, error) { *(*int64)(cw) += int64(len(b)) return len(b), nil } type indexRecord struct { rawSize int64 // Size when decompressed compSize int64 // Size when compressed } // computeRecords computes the records the raw input size and the compressed // output size. strmRec is a single record for when the input is compressed as // a single stream. chnkRecs is a list of records for when the input is // compressed individually as chunks. func computeRecords(r io.Reader, lvl, chnkSize int) (strmRec indexRecord, chnkRecs []indexRecord) { var cw1, cw2 countWriter zw1, _ := flate.NewWriter(&cw1, lvl) // Streamed compressor zw2, _ := flate.NewWriter(&cw2, lvl) // Chunked compressor buf := make([]byte, chnkSize) for { // Read a full chunks worth of data. cnt, err := io.ReadFull(r, buf) strmRec.rawSize += int64(cnt) if err == io.EOF { break } // Write chunk to both compressors. if _, err := zw1.Write(buf[:cnt]); err != nil { log.Fatal(err) } if _, err := zw2.Write(buf[:cnt]); err != nil { log.Fatal(err) } // Flush the chunked compressor, append the record, and reset. if err := zw2.Flush(); err != nil { log.Fatal(err) } chnkRecs = append(chnkRecs, indexRecord{rawSize: int64(cnt), compSize: int64(cw2)}) cw2 = 0 zw2.Reset(&cw2) if err == io.ErrUnexpectedEOF { break } if err != nil { log.Fatal(err) } } // Flush the streamed compressor and record the compressed size. if err := zw1.Flush(); err != nil { log.Fatal(err) } strmRec.compSize = int64(cw1) return strmRec, chnkRecs } // compress compresses the input buffer at the given level. func compress(b []byte, lvl int) []byte { var buf bytes.Buffer w, err := flate.NewWriter(&buf, lvl) if err != nil { log.Fatal(err) } if _, err := io.Copy(w, bytes.NewReader(b)); err != nil { log.Fatal(err) } if err := w.Close(); err != nil { log.Fatal(err) } return buf.Bytes() } // encode encodes the input using XFLATE's meta encoding. func encode(b []byte) []byte { var buf bytes.Buffer mw := meta.NewWriter(&buf) mw.FinalMode = meta.FinalMeta if _, err := io.Copy(mw, bytes.NewReader(b)); err != nil { log.Fatal(err) } if err := mw.Close(); err != nil { log.Fatal(err) } return buf.Bytes() }