Add tools/src/lut: A lookup table with compaction

Used by the intrinsic definition generator.

Bug: tint:832
Change-Id: I000b1ebe067117e533b3edd8a3fac67a3cbc14c9
Reviewed-on: https://dawn-review.googlesource.com/c/tint/+/52601
Kokoro: Kokoro <noreply+kokoro@google.com>
Commit-Queue: Ben Clayton <bclayton@google.com>
Reviewed-by: David Neto <dneto@google.com>
diff --git a/tools/src/lut/lut.go b/tools/src/lut/lut.go
new file mode 100644
index 0000000..d811a1e
--- /dev/null
+++ b/tools/src/lut/lut.go
@@ -0,0 +1,245 @@
+// Copyright 2021 The Tint Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package lut provides a look up table, which compresses indexed data
+package lut
+
+import (
+	"sort"
+
+	"dawn.googlesource.com/tint/tools/src/list"
+)
+
+// LUT is a look up table.
+// The table holds a number of items that are stored in a linear list.
+type LUT interface {
+	// Add adds a sequence of items to the table.
+	// items can be a single element, a slice of element, or a List of element.
+	// Returns a pointer to the offset of the first item in the table's list.
+	// The sequence of items stored at [offset, offset+N), where N is the
+	// number of items added will remain equal, even after calling Compact().
+	Add(items interface{}) *int
+	// Compact reorders the table items so that the table storage is compacted
+	// by shuffling data around and de-duplicating sequences of common data.
+	// Each originally added sequence is preserved in the resulting table, with
+	// the same contiguous ordering, but with a potentially different offset.
+	// Heuristics are used to shorten the table length, by exploiting common
+	// subsequences, and removing duplicate sequences.
+	// Note that shortest common superstring is NP-hard, so heuristics are used.
+	// Compact updates pointers returned by Add().
+	Compact()
+}
+
+// New returns a new look up table
+func New(storage list.List) LUT {
+	return &lut{storage: storage}
+}
+
+// A sequence represents a span of entries in the table
+type sequence struct {
+	offset *int // Pointer to the start index of the sequence
+	count  int  // Length of the sequence
+}
+
+// lut implements LUT
+type lut struct {
+	storage   list.List  // The List that backs this LUT
+	sequences []sequence // The entries in the LUT
+}
+
+func (t *lut) Add(items interface{}) *int {
+	offset := t.storage.Count()
+	t.storage.Append(items)
+	count := t.storage.Count() - offset
+	offsetPtr := &offset
+	t.sequences = append(t.sequences, sequence{offsetPtr, count})
+	return offsetPtr
+}
+
+func (t lut) Compact() {
+	// Generate int32 identifiers for each unique item in the table.
+	// We use these to compare items instead of comparing the real data as this
+	// function is comparison-heavy, and integer compares are cheap.
+	srcIDs := t.itemIDs()
+	dstIDs := make([]int32, len(srcIDs))
+
+	// Make a copy the data held in the table, use the copy as the source, and
+	// t.storage as the destination.
+	srcData := list.Copy(t.storage)
+	dstData := t.storage
+
+	// Sort all the sequences by length, with the largest first.
+	// This helps 'seed' the compacted form with the largest items first.
+	// This can improve the compaction as small sequences can pack into larger,
+	// placed items.
+	sort.Slice(t.sequences, func(i, j int) bool {
+		return t.sequences[i].count > t.sequences[j].count
+	})
+
+	// unplaced is the list of sequences that have not yet been placed.
+	// All sequences are initially unplaced.
+	unplaced := make([]sequence, len(t.sequences))
+	copy(unplaced, t.sequences)
+
+	// placed is the list of sequences that have been placed.
+	// Nothing is initially placed.
+	placed := make([]sequence, 0, len(t.sequences))
+
+	// remove removes the sequence in unplaced with the index i.
+	remove := func(i int) {
+		placed = append(placed, unplaced[i])
+		if i > 0 {
+			if i < len(unplaced)-1 {
+				copy(unplaced[i:], unplaced[i+1:])
+			}
+			unplaced = unplaced[:len(unplaced)-1]
+		} else {
+			unplaced = unplaced[1:]
+		}
+	}
+
+	// cp copies data from [srcOffset:srcOffset+count] to [dstOffset:dstOffset+count].
+	cp := func(dstOffset, srcOffset, count int) {
+		dstData.CopyFrom(srcData, dstOffset, srcOffset, count)
+		copy(
+			dstIDs[dstOffset:dstOffset+count],
+			srcIDs[srcOffset:srcOffset+count],
+		)
+	}
+
+	// match describes a sequence that can be placed.
+	type match struct {
+		dst int      // destination offset
+		src sequence // source sequence
+		len int      // number of items that matched
+		idx int      // sequence index
+	}
+
+	// number of items that have been placed.
+	newSize := 0
+
+	// While there's sequences to place...
+	for len(unplaced) > 0 {
+		// Place the next largest, unplaced sequence at the end of the new list
+		cp(newSize, *unplaced[0].offset, unplaced[0].count)
+		*unplaced[0].offset = newSize
+		newSize += unplaced[0].count
+		remove(0)
+
+		for {
+			// Look for the sequence with the longest match against the
+			// currently placed data. Any mismatches with currently placed data
+			// will nullify the match. The head or tail of this sequence may
+			// extend the currently placed data.
+			best := match{}
+
+			// For each unplaced sequence...
+			for i := 0; i < len(unplaced); i++ {
+				seq := unplaced[i]
+
+				if best.len >= seq.count {
+					// The best match is already at least as long as this
+					// sequence and sequences are sorted by size, so best cannot
+					// be beaten. Stop searching.
+					break
+				}
+
+				// Perform a full sweep from left to right, scoring the match...
+				for shift := -seq.count + 1; shift < newSize; shift++ {
+					dstS := max(shift, 0)
+					dstE := min(shift+seq.count, newSize)
+					count := dstE - dstS
+					srcS := *seq.offset - min(shift, 0)
+					srcE := srcS + count
+
+					if best.len < count {
+						if equal(srcIDs[srcS:srcE], dstIDs[dstS:dstE]) {
+							best = match{shift, seq, count, i}
+						}
+					}
+				}
+			}
+
+			if best.src.offset == nil {
+				// Nothing matched. Not even one element.
+				// Resort to placing the next largest sequence at the end.
+				break
+			}
+
+			if best.dst < 0 {
+				// Best match wants to place the sequence to the left of the
+				// current output. We have to shuffle everything...
+				n := -best.dst
+				dstData.Copy(n, 0, newSize)
+				copy(dstIDs[n:n+newSize], dstIDs)
+				newSize += n
+				best.dst = 0
+				for _, p := range placed {
+					*p.offset += n
+				}
+			}
+
+			// Place the best matching sequence.
+			cp(best.dst, *best.src.offset, best.src.count)
+			newSize = max(newSize, best.dst+best.src.count)
+			*best.src.offset = best.dst
+			remove(best.idx)
+		}
+	}
+
+	// Shrink the output buffer to the new size.
+	dstData.Resize(newSize)
+
+	// All done.
+}
+
+// Generate a set of identifiers for all the unique items in storage
+func (t lut) itemIDs() []int32 {
+	storageSize := t.storage.Count()
+	keys := make([]int32, storageSize)
+	dataToKey := map[interface{}]int32{}
+	for i := 0; i < storageSize; i++ {
+		data := t.storage.Get(i)
+		key, found := dataToKey[data]
+		if !found {
+			key = int32(len(dataToKey))
+			dataToKey[data] = key
+		}
+		keys[i] = key
+	}
+	return keys
+}
+
+func max(a, b int) int {
+	if a < b {
+		return b
+	}
+	return a
+}
+
+func min(a, b int) int {
+	if a > b {
+		return b
+	}
+	return a
+}
+
+func equal(a, b []int32) bool {
+	for i, v := range a {
+		if b[i] != v {
+			return false
+		}
+	}
+	return true
+}
diff --git a/tools/src/lut/lut_test.go b/tools/src/lut/lut_test.go
new file mode 100644
index 0000000..fa74522
--- /dev/null
+++ b/tools/src/lut/lut_test.go
@@ -0,0 +1,66 @@
+// Copyright 2021 The Tint Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package lut_test
+
+import (
+	"testing"
+
+	"dawn.googlesource.com/tint/tools/src/list"
+	"dawn.googlesource.com/tint/tools/src/lut"
+)
+
+func TestCompactWithFragments(t *testing.T) {
+	runes := []rune{}
+	lut := lut.New(list.Wrap(&runes))
+	indices := []*int{
+		lut.Add([]rune("the life in your")),
+		lut.Add([]rune("in your life that count")),
+		lut.Add([]rune("In the end,")),
+		lut.Add([]rune("the life in")),
+		lut.Add([]rune("count. It's the")),
+		lut.Add([]rune("years")),
+		lut.Add([]rune("in your years.")),
+		lut.Add([]rune("it's not the years in")),
+		lut.Add([]rune("not the years")),
+		lut.Add([]rune("not the years in your")),
+		lut.Add([]rune("end, it's")),
+	}
+
+	lut.Compact()
+
+	expect := "In the end, it's not the years in your life that count. It's the life in your years."
+	got := string(runes)
+	if got != expect {
+		t.Errorf("Compact result was not as expected\nExpected: '%v'\nGot:      '%v'", expect, got)
+	}
+	expectedIndices := []int{
+		61, //                                                              the life in your
+		31, //                                in your life that count
+		0,  // In the end,
+		61, //                                                              the life in
+		49, //                                                  count. It's the
+		25, //                          years
+		70, //                                                                       in your years.
+		12, //             it's not the years in
+		17, //                  not the years
+		17, //                  not the years in your
+		7,  //        end, it's
+	}
+	for i, p := range indices {
+		if expected, got := expectedIndices[i], *p; expected != got {
+			t.Errorf("Index %v was not expected. Expected %v, got %v", i, expected, got)
+		}
+	}
+}