blob: 7dac4453cc62f6c6921c56c2993aac01b03cfb2e [file] [log] [blame] [edit]
// Copyright 2021 The Tint Authors.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
// Package lut provides a look up table, which compresses indexed data
package lut
import (
"sort"
)
// LUT is a look up table.
// The table holds a number of items that are stored in a linear list.
type LUT[T comparable] interface {
// Add adds a sequence of items to the table.
// The sequence of items stored at [offset, offset+N), where N is the
// number of items added will remain equal, even after calling Compact().
// Returns a pointer to the start index in the list.
Add(items []T) *int
// Compact reorders the table items so that the table storage is compacted
// by shuffling data around and de-duplicating sequences of common data.
// Each originally added sequence is preserved in the resulting table, with
// the same contiguous ordering, but with a potentially different offset.
// Heuristics are used to shorten the table length, by exploiting common
// subsequences, and removing duplicate sequences.
// Note that shortest common superstring is NP-hard, so heuristics are used.
// Compact updates pointers returned by Add().
Compact() []T
}
// New returns a new look up table
func New[T comparable]() LUT[T] {
return &lut[T]{storage: []T{}}
}
// A sequence represents a span of entries in the table
type sequence struct {
offset *int // Pointer to the start index of the sequence
count int // Length of the sequence
}
// lut implements LUT
type lut[T comparable] struct {
storage []T // The storage
sequences []sequence // The entries in the LUT
}
func (t *lut[T]) Add(items []T) *int {
if len(items) == 0 {
return nil
}
offset := len(t.storage)
count := len(items)
t.storage = append(t.storage, items...)
offsetPtr := &offset
t.sequences = append(t.sequences, sequence{offsetPtr, count})
return offsetPtr
}
// match describes a sequence that can be placed.
type match struct {
dst int // destination offset
src sequence // source sequence
len int // number of items that matched
idx int // sequence index
}
func (t lut[T]) Compact() []T {
// Generate int32 identifiers for each unique item in the table.
// We use these to compare items instead of comparing the real data as this
// function is comparison-heavy, and integer compares are cheap.
srcIDs := t.itemIDs()
dstIDs := make([]int32, len(srcIDs))
// Make a copy the data held in the table, use the copy as the source, and
// t.storage as the destination.
srcData := make([]T, len(t.storage))
copy(srcData, t.storage)
dstData := t.storage
// Sort all the sequences by length, with the largest first.
// This helps 'seed' the compacted form with the largest items first.
// This can improve the compaction as small sequences can pack into larger,
// placed items.
sort.SliceStable(t.sequences, func(i, j int) bool {
return t.sequences[i].count > t.sequences[j].count
})
// unplaced is the list of sequences that have not yet been placed.
// All sequences are initially unplaced.
unplaced := make([]sequence, len(t.sequences))
copy(unplaced, t.sequences)
// placed is the list of sequences that have been placed.
// Nothing is initially placed.
placed := make([]sequence, 0, len(t.sequences))
// remove removes the sequence in unplaced with the index i.
remove := func(i int) {
placed = append(placed, unplaced[i])
if i > 0 {
if i < len(unplaced)-1 {
copy(unplaced[i:], unplaced[i+1:])
}
unplaced = unplaced[:len(unplaced)-1]
} else {
unplaced = unplaced[1:]
}
}
// cp copies data from [srcOffset:srcOffset+count] to [dstOffset:dstOffset+count].
cp := func(dstOffset, srcOffset, count int) {
copy(dstData[dstOffset:dstOffset+count], srcData[srcOffset:srcOffset+count])
copy(dstIDs[dstOffset:dstOffset+count], srcIDs[srcOffset:srcOffset+count])
}
// number of items that have been placed.
newSize := 0
// While there's sequences to place...
for len(unplaced) > 0 {
// Place the next largest, unplaced sequence at the end of the new list
cp(newSize, *unplaced[0].offset, unplaced[0].count)
*unplaced[0].offset = newSize
newSize += unplaced[0].count
remove(0)
for {
// Look for the sequence with the longest match against the
// currently placed data. Any mismatches with currently placed data
// will nullify the match. The head or tail of this sequence may
// extend the currently placed data.
best := match{}
// For each unplaced sequence...
for i := 0; i < len(unplaced); i++ {
seq := unplaced[i]
if best.len >= seq.count {
// The best match is already at least as long as this
// sequence and sequences are sorted by size, so best cannot
// be beaten. Stop searching.
break
}
// Perform a full sweep from left to right, scoring the match...
for shift := -seq.count + 1; shift < newSize; shift++ {
dstS := max(shift, 0)
dstE := min(shift+seq.count, newSize)
count := dstE - dstS
srcS := *seq.offset - min(shift, 0)
srcE := srcS + count
if best.len < count {
if equal(srcIDs[srcS:srcE], dstIDs[dstS:dstE]) {
best = match{shift, seq, count, i}
}
}
}
}
if best.src.offset == nil {
// Nothing matched. Not even one element.
// Resort to placing the next largest sequence at the end.
break
}
if best.dst < 0 {
// Best match wants to place the sequence to the left of the
// current output. We have to shuffle everything...
n := -best.dst
copy(dstData[n:n+newSize], dstData)
copy(dstIDs[n:n+newSize], dstIDs)
newSize += n
best.dst = 0
for _, p := range placed {
*p.offset += n
}
}
// Place the best matching sequence.
cp(best.dst, *best.src.offset, best.src.count)
newSize = max(newSize, best.dst+best.src.count)
*best.src.offset = best.dst
remove(best.idx)
}
}
// Shrink the output buffer to the new size.
dstData = dstData[:newSize]
// All done.
return dstData
}
// Generate a set of identifiers for all the unique items in storage
func (t lut[T]) itemIDs() []int32 {
storageSize := len(t.storage)
keys := make([]int32, storageSize)
dataToKey := map[T]int32{}
for i := 0; i < storageSize; i++ {
data := t.storage[i]
key, found := dataToKey[data]
if !found {
key = int32(len(dataToKey))
dataToKey[data] = key
}
keys[i] = key
}
return keys
}
func max(a, b int) int {
if a < b {
return b
}
return a
}
func min(a, b int) int {
if a > b {
return b
}
return a
}
func equal(a, b []int32) bool {
for i, v := range a {
if b[i] != v {
return false
}
}
return true
}