Add tools/src/lut: A lookup table with compaction
Used by the intrinsic definition generator.
Bug: tint:832
Change-Id: I000b1ebe067117e533b3edd8a3fac67a3cbc14c9
Reviewed-on: https://dawn-review.googlesource.com/c/tint/+/52601
Kokoro: Kokoro <noreply+kokoro@google.com>
Commit-Queue: Ben Clayton <bclayton@google.com>
Reviewed-by: David Neto <dneto@google.com>
diff --git a/tools/src/lut/lut.go b/tools/src/lut/lut.go
new file mode 100644
index 0000000..d811a1e
--- /dev/null
+++ b/tools/src/lut/lut.go
@@ -0,0 +1,245 @@
+// Copyright 2021 The Tint Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package lut provides a look up table, which compresses indexed data
+package lut
+
+import (
+ "sort"
+
+ "dawn.googlesource.com/tint/tools/src/list"
+)
+
+// LUT is a look up table.
+// The table holds a number of items that are stored in a linear list.
+type LUT interface {
+ // Add adds a sequence of items to the table.
+ // items can be a single element, a slice of element, or a List of element.
+ // Returns a pointer to the offset of the first item in the table's list.
+ // The sequence of items stored at [offset, offset+N), where N is the
+ // number of items added will remain equal, even after calling Compact().
+ Add(items interface{}) *int
+ // Compact reorders the table items so that the table storage is compacted
+ // by shuffling data around and de-duplicating sequences of common data.
+ // Each originally added sequence is preserved in the resulting table, with
+ // the same contiguous ordering, but with a potentially different offset.
+ // Heuristics are used to shorten the table length, by exploiting common
+ // subsequences, and removing duplicate sequences.
+ // Note that shortest common superstring is NP-hard, so heuristics are used.
+ // Compact updates pointers returned by Add().
+ Compact()
+}
+
+// New returns a new look up table
+func New(storage list.List) LUT {
+ return &lut{storage: storage}
+}
+
+// A sequence represents a span of entries in the table
+type sequence struct {
+ offset *int // Pointer to the start index of the sequence
+ count int // Length of the sequence
+}
+
+// lut implements LUT
+type lut struct {
+ storage list.List // The List that backs this LUT
+ sequences []sequence // The entries in the LUT
+}
+
+func (t *lut) Add(items interface{}) *int {
+ offset := t.storage.Count()
+ t.storage.Append(items)
+ count := t.storage.Count() - offset
+ offsetPtr := &offset
+ t.sequences = append(t.sequences, sequence{offsetPtr, count})
+ return offsetPtr
+}
+
+func (t lut) Compact() {
+ // Generate int32 identifiers for each unique item in the table.
+ // We use these to compare items instead of comparing the real data as this
+ // function is comparison-heavy, and integer compares are cheap.
+ srcIDs := t.itemIDs()
+ dstIDs := make([]int32, len(srcIDs))
+
+ // Make a copy the data held in the table, use the copy as the source, and
+ // t.storage as the destination.
+ srcData := list.Copy(t.storage)
+ dstData := t.storage
+
+ // Sort all the sequences by length, with the largest first.
+ // This helps 'seed' the compacted form with the largest items first.
+ // This can improve the compaction as small sequences can pack into larger,
+ // placed items.
+ sort.Slice(t.sequences, func(i, j int) bool {
+ return t.sequences[i].count > t.sequences[j].count
+ })
+
+ // unplaced is the list of sequences that have not yet been placed.
+ // All sequences are initially unplaced.
+ unplaced := make([]sequence, len(t.sequences))
+ copy(unplaced, t.sequences)
+
+ // placed is the list of sequences that have been placed.
+ // Nothing is initially placed.
+ placed := make([]sequence, 0, len(t.sequences))
+
+ // remove removes the sequence in unplaced with the index i.
+ remove := func(i int) {
+ placed = append(placed, unplaced[i])
+ if i > 0 {
+ if i < len(unplaced)-1 {
+ copy(unplaced[i:], unplaced[i+1:])
+ }
+ unplaced = unplaced[:len(unplaced)-1]
+ } else {
+ unplaced = unplaced[1:]
+ }
+ }
+
+ // cp copies data from [srcOffset:srcOffset+count] to [dstOffset:dstOffset+count].
+ cp := func(dstOffset, srcOffset, count int) {
+ dstData.CopyFrom(srcData, dstOffset, srcOffset, count)
+ copy(
+ dstIDs[dstOffset:dstOffset+count],
+ srcIDs[srcOffset:srcOffset+count],
+ )
+ }
+
+ // match describes a sequence that can be placed.
+ type match struct {
+ dst int // destination offset
+ src sequence // source sequence
+ len int // number of items that matched
+ idx int // sequence index
+ }
+
+ // number of items that have been placed.
+ newSize := 0
+
+ // While there's sequences to place...
+ for len(unplaced) > 0 {
+ // Place the next largest, unplaced sequence at the end of the new list
+ cp(newSize, *unplaced[0].offset, unplaced[0].count)
+ *unplaced[0].offset = newSize
+ newSize += unplaced[0].count
+ remove(0)
+
+ for {
+ // Look for the sequence with the longest match against the
+ // currently placed data. Any mismatches with currently placed data
+ // will nullify the match. The head or tail of this sequence may
+ // extend the currently placed data.
+ best := match{}
+
+ // For each unplaced sequence...
+ for i := 0; i < len(unplaced); i++ {
+ seq := unplaced[i]
+
+ if best.len >= seq.count {
+ // The best match is already at least as long as this
+ // sequence and sequences are sorted by size, so best cannot
+ // be beaten. Stop searching.
+ break
+ }
+
+ // Perform a full sweep from left to right, scoring the match...
+ for shift := -seq.count + 1; shift < newSize; shift++ {
+ dstS := max(shift, 0)
+ dstE := min(shift+seq.count, newSize)
+ count := dstE - dstS
+ srcS := *seq.offset - min(shift, 0)
+ srcE := srcS + count
+
+ if best.len < count {
+ if equal(srcIDs[srcS:srcE], dstIDs[dstS:dstE]) {
+ best = match{shift, seq, count, i}
+ }
+ }
+ }
+ }
+
+ if best.src.offset == nil {
+ // Nothing matched. Not even one element.
+ // Resort to placing the next largest sequence at the end.
+ break
+ }
+
+ if best.dst < 0 {
+ // Best match wants to place the sequence to the left of the
+ // current output. We have to shuffle everything...
+ n := -best.dst
+ dstData.Copy(n, 0, newSize)
+ copy(dstIDs[n:n+newSize], dstIDs)
+ newSize += n
+ best.dst = 0
+ for _, p := range placed {
+ *p.offset += n
+ }
+ }
+
+ // Place the best matching sequence.
+ cp(best.dst, *best.src.offset, best.src.count)
+ newSize = max(newSize, best.dst+best.src.count)
+ *best.src.offset = best.dst
+ remove(best.idx)
+ }
+ }
+
+ // Shrink the output buffer to the new size.
+ dstData.Resize(newSize)
+
+ // All done.
+}
+
+// Generate a set of identifiers for all the unique items in storage
+func (t lut) itemIDs() []int32 {
+ storageSize := t.storage.Count()
+ keys := make([]int32, storageSize)
+ dataToKey := map[interface{}]int32{}
+ for i := 0; i < storageSize; i++ {
+ data := t.storage.Get(i)
+ key, found := dataToKey[data]
+ if !found {
+ key = int32(len(dataToKey))
+ dataToKey[data] = key
+ }
+ keys[i] = key
+ }
+ return keys
+}
+
+func max(a, b int) int {
+ if a < b {
+ return b
+ }
+ return a
+}
+
+func min(a, b int) int {
+ if a > b {
+ return b
+ }
+ return a
+}
+
+func equal(a, b []int32) bool {
+ for i, v := range a {
+ if b[i] != v {
+ return false
+ }
+ }
+ return true
+}
diff --git a/tools/src/lut/lut_test.go b/tools/src/lut/lut_test.go
new file mode 100644
index 0000000..fa74522
--- /dev/null
+++ b/tools/src/lut/lut_test.go
@@ -0,0 +1,66 @@
+// Copyright 2021 The Tint Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package lut_test
+
+import (
+ "testing"
+
+ "dawn.googlesource.com/tint/tools/src/list"
+ "dawn.googlesource.com/tint/tools/src/lut"
+)
+
+func TestCompactWithFragments(t *testing.T) {
+ runes := []rune{}
+ lut := lut.New(list.Wrap(&runes))
+ indices := []*int{
+ lut.Add([]rune("the life in your")),
+ lut.Add([]rune("in your life that count")),
+ lut.Add([]rune("In the end,")),
+ lut.Add([]rune("the life in")),
+ lut.Add([]rune("count. It's the")),
+ lut.Add([]rune("years")),
+ lut.Add([]rune("in your years.")),
+ lut.Add([]rune("it's not the years in")),
+ lut.Add([]rune("not the years")),
+ lut.Add([]rune("not the years in your")),
+ lut.Add([]rune("end, it's")),
+ }
+
+ lut.Compact()
+
+ expect := "In the end, it's not the years in your life that count. It's the life in your years."
+ got := string(runes)
+ if got != expect {
+ t.Errorf("Compact result was not as expected\nExpected: '%v'\nGot: '%v'", expect, got)
+ }
+ expectedIndices := []int{
+ 61, // the life in your
+ 31, // in your life that count
+ 0, // In the end,
+ 61, // the life in
+ 49, // count. It's the
+ 25, // years
+ 70, // in your years.
+ 12, // it's not the years in
+ 17, // not the years
+ 17, // not the years in your
+ 7, // end, it's
+ }
+ for i, p := range indices {
+ if expected, got := expectedIndices[i], *p; expected != got {
+ t.Errorf("Index %v was not expected. Expected %v, got %v", i, expected, got)
+ }
+ }
+}