| // Copyright 2021 The Dawn & Tint Authors |
| // |
| // Redistribution and use in source and binary forms, with or without |
| // modification, are permitted provided that the following conditions are met: |
| // |
| // 1. Redistributions of source code must retain the above copyright notice, this |
| // list of conditions and the following disclaimer. |
| // |
| // 2. Redistributions in binary form must reproduce the above copyright notice, |
| // this list of conditions and the following disclaimer in the documentation |
| // and/or other materials provided with the distribution. |
| // |
| // 3. Neither the name of the copyright holder nor the names of its |
| // contributors may be used to endorse or promote products derived from |
| // this software without specific prior written permission. |
| // |
| // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
| // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
| // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| // DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE |
| // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
| // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR |
| // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER |
| // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
| // OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE |
| // OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| // Package lut provides a look up table, which compresses indexed data |
| package lut |
| |
| import ( |
| "sort" |
| ) |
| |
| // LUT is a look up table. |
| // The table holds a number of items that are stored in a linear list. |
| type LUT[T comparable] interface { |
| // Add adds a sequence of items to the table. |
| // The sequence of items stored at [offset, offset+N), where N is the |
| // number of items added will remain equal, even after calling Compact(). |
| // Returns a pointer to the start index in the list. |
| Add(items []T) *int |
| // Compact reorders the table items so that the table storage is compacted |
| // by shuffling data around and de-duplicating sequences of common data. |
| // Each originally added sequence is preserved in the resulting table, with |
| // the same contiguous ordering, but with a potentially different offset. |
| // Heuristics are used to shorten the table length, by exploiting common |
| // subsequences, and removing duplicate sequences. |
| // Note that shortest common superstring is NP-hard, so heuristics are used. |
| // Compact updates pointers returned by Add(). |
| Compact() []T |
| } |
| |
| // New returns a new look up table |
| func New[T comparable]() LUT[T] { |
| return &lut[T]{storage: []T{}} |
| } |
| |
| // A sequence represents a span of entries in the table |
| type sequence struct { |
| offset *int // Pointer to the start index of the sequence |
| count int // Length of the sequence |
| } |
| |
| // lut implements LUT |
| type lut[T comparable] struct { |
| storage []T // The storage |
| sequences []sequence // The entries in the LUT |
| } |
| |
| func (t *lut[T]) Add(items []T) *int { |
| if len(items) == 0 { |
| return nil |
| } |
| |
| offset := len(t.storage) |
| count := len(items) |
| t.storage = append(t.storage, items...) |
| offsetPtr := &offset |
| t.sequences = append(t.sequences, sequence{offsetPtr, count}) |
| return offsetPtr |
| } |
| |
| // match describes a sequence that can be placed. |
| type match struct { |
| dst int // destination offset |
| src sequence // source sequence |
| len int // number of items that matched |
| idx int // sequence index |
| } |
| |
| func (t lut[T]) Compact() []T { |
| // Generate int32 identifiers for each unique item in the table. |
| // We use these to compare items instead of comparing the real data as this |
| // function is comparison-heavy, and integer compares are cheap. |
| srcIDs := t.itemIDs() |
| dstIDs := make([]int32, len(srcIDs)) |
| |
| // Make a copy the data held in the table, use the copy as the source, and |
| // t.storage as the destination. |
| srcData := make([]T, len(t.storage)) |
| copy(srcData, t.storage) |
| dstData := t.storage |
| |
| // Sort all the sequences by length, with the largest first. |
| // This helps 'seed' the compacted form with the largest items first. |
| // This can improve the compaction as small sequences can pack into larger, |
| // placed items. |
| sort.SliceStable(t.sequences, func(i, j int) bool { |
| return t.sequences[i].count > t.sequences[j].count |
| }) |
| |
| // unplaced is the list of sequences that have not yet been placed. |
| // All sequences are initially unplaced. |
| unplaced := make([]sequence, len(t.sequences)) |
| copy(unplaced, t.sequences) |
| |
| // placed is the list of sequences that have been placed. |
| // Nothing is initially placed. |
| placed := make([]sequence, 0, len(t.sequences)) |
| |
| // remove removes the sequence in unplaced with the index i. |
| remove := func(i int) { |
| placed = append(placed, unplaced[i]) |
| if i > 0 { |
| if i < len(unplaced)-1 { |
| copy(unplaced[i:], unplaced[i+1:]) |
| } |
| unplaced = unplaced[:len(unplaced)-1] |
| } else { |
| unplaced = unplaced[1:] |
| } |
| } |
| |
| // cp copies data from [srcOffset:srcOffset+count] to [dstOffset:dstOffset+count]. |
| cp := func(dstOffset, srcOffset, count int) { |
| copy(dstData[dstOffset:dstOffset+count], srcData[srcOffset:srcOffset+count]) |
| copy(dstIDs[dstOffset:dstOffset+count], srcIDs[srcOffset:srcOffset+count]) |
| } |
| |
| // number of items that have been placed. |
| newSize := 0 |
| |
| // While there's sequences to place... |
| for len(unplaced) > 0 { |
| // Place the next largest, unplaced sequence at the end of the new list |
| cp(newSize, *unplaced[0].offset, unplaced[0].count) |
| *unplaced[0].offset = newSize |
| newSize += unplaced[0].count |
| remove(0) |
| |
| for { |
| // Look for the sequence with the longest match against the |
| // currently placed data. Any mismatches with currently placed data |
| // will nullify the match. The head or tail of this sequence may |
| // extend the currently placed data. |
| best := match{} |
| |
| // For each unplaced sequence... |
| for i := 0; i < len(unplaced); i++ { |
| seq := unplaced[i] |
| |
| if best.len >= seq.count { |
| // The best match is already at least as long as this |
| // sequence and sequences are sorted by size, so best cannot |
| // be beaten. Stop searching. |
| break |
| } |
| |
| // Perform a full sweep from left to right, scoring the match... |
| for shift := -seq.count + 1; shift < newSize; shift++ { |
| dstS := max(shift, 0) |
| dstE := min(shift+seq.count, newSize) |
| count := dstE - dstS |
| srcS := *seq.offset - min(shift, 0) |
| srcE := srcS + count |
| |
| if best.len < count { |
| if equal(srcIDs[srcS:srcE], dstIDs[dstS:dstE]) { |
| best = match{shift, seq, count, i} |
| } |
| } |
| } |
| } |
| |
| if best.src.offset == nil { |
| // Nothing matched. Not even one element. |
| // Resort to placing the next largest sequence at the end. |
| break |
| } |
| |
| if best.dst < 0 { |
| // Best match wants to place the sequence to the left of the |
| // current output. We have to shuffle everything... |
| n := -best.dst |
| copy(dstData[n:n+newSize], dstData) |
| copy(dstIDs[n:n+newSize], dstIDs) |
| newSize += n |
| best.dst = 0 |
| for _, p := range placed { |
| *p.offset += n |
| } |
| } |
| |
| // Place the best matching sequence. |
| cp(best.dst, *best.src.offset, best.src.count) |
| newSize = max(newSize, best.dst+best.src.count) |
| *best.src.offset = best.dst |
| remove(best.idx) |
| } |
| } |
| |
| // Shrink the output buffer to the new size. |
| dstData = dstData[:newSize] |
| |
| // All done. |
| return dstData |
| } |
| |
| // Generate a set of identifiers for all the unique items in storage |
| func (t lut[T]) itemIDs() []int32 { |
| storageSize := len(t.storage) |
| keys := make([]int32, storageSize) |
| dataToKey := map[T]int32{} |
| for i := 0; i < storageSize; i++ { |
| data := t.storage[i] |
| key, found := dataToKey[data] |
| if !found { |
| key = int32(len(dataToKey)) |
| dataToKey[data] = key |
| } |
| keys[i] = key |
| } |
| return keys |
| } |
| |
| func max(a, b int) int { |
| if a < b { |
| return b |
| } |
| return a |
| } |
| |
| func min(a, b int) int { |
| if a > b { |
| return b |
| } |
| return a |
| } |
| |
| func equal(a, b []int32) bool { |
| for i, v := range a { |
| if b[i] != v { |
| return false |
| } |
| } |
| return true |
| } |