tools/src/cov/optimization.go - dawn - Git at Google

 // Copyright 2022 The Dawn Authors
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 //
 //    http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 package cov

 import (
 	"log"
 	"sort"
 	"sync"
 )

 // Optimize optimizes the Tree by de-duplicating common spans into a tree of SpanGroups.
 //
 // Breaking down tests into group hierarchies provides a natural way to structure
 // coverage data, as tests of the same suite, file or test are likely to have
 // similar coverage spans.
 //
 // For each source file in the codebase, we create a tree of SpanGroups, where the
 // leaves are the test cases.
 //
 // For example, given the following Paths:
 //
 //	a.b.d.h
 //	a.b.d.i.n
 //	a.b.d.i.o
 //	a.b.e.j
 //	a.b.e.k.p
 //	a.b.e.k.q
 //	a.c.f
 //	a.c.g.l.r
 //	a.c.g.m
 //
 // We would construct the following tree:
 //
 //	             a
 //	      ╭──────┴──────╮
 //	      b             c
 //	  ╭───┴───╮     ╭───┴───╮
 //	  d       e     f       g
 //	╭─┴─╮   ╭─┴─╮         ╭─┴─╮
 //	h   i   j   k         l   m
 //	   ╭┴╮     ╭┴╮        │
 //	   n o     p q        r
 //
 // Each leaf node in this tree (`h`, `n`, `o`, `j`, `p`, `q`, `f`, `r`, `m`)
 // represents a test case, and the non-leaf nodes (`a`, `b`, `c`, `d`, `e`, `g`,
 // `i`, `k`, `l`) are suites, files or tests.
 //
 // To begin, we create a test tree structure, and associate the full list of test
 // coverage spans with every leaf node (test case) in this tree.
 //
 // This data structure hasn't given us any compression benefits yet, but we can
 // now do a few tricks to dramatically reduce number of spans needed to describe
 // the graph:
 //
 //	~ Optimization 1: Common span promotion ~
 //
 // The first compression scheme is to promote common spans up the tree when they
 // are common for all children. This will reduce the number of spans needed to be
 // encoded in the final file.
 //
 // For example, if the test group `a` has 4 children that all share the same span
 // `X`:
 //
 //	         a
 //	   ╭───┬─┴─┬───╮
 //	   b   c   d   e
 //	[X,Y] [X] [X] [X,Z]
 //
 // Then span `X` can be promoted up to `a`:
 //
 //	      [X]
 //	       a
 //	 ╭───┬─┴─┬───╮
 //	 b   c   d   e
 //	[Y] []   [] [Z]
 //
 //	~ Optimization 2: Span XOR promotion ~
 //
 // This idea can be extended further, by not requiring all the children to share
 // the same span before promotion. If *most* child nodes share the same span, we
 // can still promote the span, but this time we *remove* the span from the
 // children *if they had it*, and *add* the span to children *if they didn't
 // have it*.
 //
 // For example, if the test group `a` has 4 children with 3 that share the span
 // `X`:
 //
 //	         a
 //	   ╭───┬─┴─┬───╮
 //	   b   c   d   e
 //	[X,Y] [X]  [] [X,Z]
 //
 // Then span `X` can be promoted up to `a` by flipping the presence of `X` on the
 // child nodes:
 //
 //	      [X]
 //	       a
 //	 ╭───┬─┴─┬───╮
 //	 b   c   d   e
 //	[Y] []  [X] [Z]
 //
 // This process repeats up the tree.
 //
 // With this optimization applied, we now need to traverse the tree from root to
 // leaf in order to know whether a given span is in use for the leaf node (test case):
 //
 // * If the span is encountered an *odd* number of times during traversal, then
 // the span is *covered*.
 // * If the span is encountered an *even* number of times during traversal, then
 // the span is *not covered*.
 //
 // See tools/src/cov/coverage_test.go for more examples of this optimization.
 //
 //	~ Optimization 3: Common span grouping ~
 //
 // With real world data, we encounter groups of spans that are commonly found
 // together. To further reduce coverage data, the whole graph is scanned for common
 // span patterns, which are then referenced by index from each tree node.
 // The XOR'ing of spans as described above is performed as if the spans were not
 // grouped.
 //
 //	~ Optimization 4: Lookup tables ~
 //
 // All spans, span-groups and strings are stored in de-duplicated tables, and are
 // indexed wherever possible.
 func (t *Tree) Optimize() {
 	log.Printf("Optimizing coverage tree...")

 	// Every file is processed on its own goroutine: spans are first promoted
 	// up the test tree, then the remaining span sets are folded into groups.
 	var pending sync.WaitGroup
 	for _, file := range t.files {
 		file := file // per-iteration copy captured by the goroutine below
 		pending.Add(1)
 		go func() {
 			defer pending.Done()
 			var opt optimizer
 			for idx, tc := range file.tcm {
 				opt.invertForCommon(tc, &t.testRoot.children[idx])
 			}
 			opt.createGroups(file)
 		}()
 	}

 	// Do not return until every file has been optimized.
 	pending.Wait()
 }

 // optimizer is a field-less receiver for the coverage-tree optimization passes
 // (invertForCommon and createGroups).
 type optimizer struct{}

 // createGroups replaces repeated SpanSets in the file's coverage tree with
 // references to shared SpanGroups.
 //
 // Every distinct SpanSet holding at least minSpansInGroup spans becomes a
 // group. Groups are ordered from the largest set to the smallest and given IDs
 // in that order. Each group is then expressed as an extension of the first
 // strictly smaller group (in that order) whose spans it fully contains, if
 // there is one. The groups are stored in f.spangroups, and every TestCoverage
 // whose SpanSet matches a group has its Spans cleared and its Group set to the
 // ID of that group.
 func (o *optimizer) createGroups(f *treeFile) {
 	const minSpansInGroup = 2

 	type spansetKey string

 	// Collect one representative of every distinct, large enough SpanSet.
 	unique := map[spansetKey]SpanSet{}
 	f.tcm.traverse(func(tc *TestCoverage) {
 		if len(tc.Spans) < minSpansInGroup {
 			return
 		}
 		k := spansetKey(tc.Spans.String())
 		if _, seen := unique[k]; seen {
 			return
 		}
 		unique[k] = tc.Spans
 	})
 	if len(unique) == 0 {
 		return // Nothing worth grouping.
 	}

 	type spansetInfo struct {
 		key spansetKey
 		set SpanSet // fully expanded set
 		grp SpanGroup
 		id  SpanGroupID
 	}
 	infos := make([]*spansetInfo, 0, len(unique))
 	for k, s := range unique {
 		infos = append(infos, &spansetInfo{key: k, set: s, grp: SpanGroup{Spans: s}})
 	}

 	// Largest sets first. Sets of equal size are ordered by their span lists,
 	// purely so that the output is stable.
 	sort.Slice(infos, func(i, j int) bool {
 		lhs, rhs := infos[i].set, infos[j].set
 		if len(lhs) != len(rhs) {
 			return len(lhs) > len(rhs)
 		}
 		return lhs.List().Compare(rhs.List()) == -1
 	})

 	// The sorted position doubles as the group ID.
 	for i, info := range infos {
 		info.id = SpanGroupID(i)
 	}

 	// For each group, look through the groups that follow it for the first
 	// strictly smaller one it fully contains, and describe the larger group as
 	// an extension of it. This is O(n^2) in the number of groups.
 	for i := 0; i < len(infos)-1; i++ {
 		outer := infos[i]
 		for _, inner := range infos[i+1:] {
 			if len(outer.set) <= len(inner.set) || !outer.set.containsAll(inner.set) {
 				continue
 			}
 			base := inner.id // Private copy; never take the address of the iterator.
 			outer.grp.Spans = outer.set.removeAll(inner.set)
 			outer.grp.Extend = &base
 			break
 		}
 	}

 	// Index the groups by key for the lookup below, and publish them on the file.
 	byKey := make(map[spansetKey]*spansetInfo, len(infos))
 	f.spangroups = make(map[SpanGroupID]SpanGroup, len(infos))
 	for _, info := range infos {
 		byKey[info.key] = info
 		f.spangroups[info.id] = info.grp
 	}

 	// Swap every matching SpanSet in the tree for a reference to its group.
 	f.tcm.traverse(func(tc *TestCoverage) {
 		info, ok := byKey[spansetKey(tc.Spans.String())]
 		if !ok {
 			return
 		}
 		tc.Spans = nil
 		tc.Group = &info.id
 	})
 }

 // invertForCommon promotes spans that are held by the majority of a node's
 // children up to the node itself.
 //
 // The children of tc are processed first, each on its own goroutine, and the
 // call waits for all of them. Afterwards, every span found on more than
 // len(t.children)/2 of the children of tc is inverted on tc and on each child
 // listed in t.indices. Children that report deletable() after the inversion
 // are removed from tc.Children.
 func (o *optimizer) invertForCommon(tc *TestCoverage, t *Test) {
 	var pending sync.WaitGroup
 	for id, child := range tc.Children {
 		id, child := id, child // per-iteration copies captured by the goroutine
 		pending.Add(1)
 		go func() {
 			defer pending.Done()
 			o.invertForCommon(child, &t.children[id])
 		}()
 	}
 	pending.Wait()

 	// Tally how many children currently carry each span.
 	tally := map[SpanID]int{}
 	for _, child := range tc.Children {
 		for span := range child.Spans {
 			tally[span]++
 		}
 	}

 	// A span is promoted only when strictly more than this many children have it.
 	threshold := len(t.children) / 2
 	for span, n := range tally {
 		if n <= threshold {
 			continue
 		}
 		tc.Spans = tc.Spans.invert(span)
 		for _, idx := range t.indices {
 			node := tc.Children.index(idx)
 			node.Spans = node.Spans.invert(span)
 			if node.deletable() {
 				delete(tc.Children, idx)
 			}
 		}
 	}
 }
	// Copyright 2022 The Dawn Authors
	//
	// Licensed under the Apache License, Version 2.0 (the "License");
	// you may not use this file except in compliance with the License.
	// You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.

	package cov

	import (
	"log"
	"sort"
	"sync"
	)

	// Optimize optimizes the Tree by de-duplicating common spans into a tree of SpanGroups.
	//
	// Breaking down tests into group hierarchies provide a natural way to structure
	// coverage data, as tests of the same suite, file or test are likely to have
	// similar coverage spans.
	//
	// For each source file in the codebase, we create a tree of SpanGroups, where the
	// leaves are the test cases.
	//
	// For example, given the following Paths:
	//
	// a.b.d.h
	// a.b.d.i.n
	// a.b.d.i.o
	// a.b.e.j
	// a.b.e.k.p
	// a.b.e.k.q
	// a.c.f
	// a.c.g.l.r
	// a.c.g.m
	//
	// We would construct the following tree:
	//
	//	             a
	//	      ╭──────┴──────╮
	//	      b             c
	//	  ╭───┴───╮     ╭───┴───╮
	//	  d       e     f       g
	//	╭─┴─╮   ╭─┴─╮         ╭─┴─╮
	//	h   i   j   k         l   m
	//	   ╭┴╮     ╭┴╮        │
	//	   n o     p q        r
	//
	// Each leaf node in this tree (`h`, `n`, `o`, `j`, `p`, `q`, `f`, `r`, `m`)
	// represent a test case, and non-leaf nodes (`a`, `b`, `c`, `d`, `e`, `g`, `i`,
	// `k`, `l`) are suite, file or tests.
	//
	// To begin, we create a test tree structure, and associate the full list of test
	// coverage spans with every leaf node (test case) in this tree.
	//
	// This data structure hasn't given us any compression benefits yet, but we can
	// now do a few tricks to dramatically reduce number of spans needed to describe
	// the graph:
	//
	// ~ Optimization 1: Common span promotion ~
	//
	// The first compression scheme is to promote common spans up the tree when they
	// are common for all children. This will reduce the number of spans needed to be
	// encoded in the final file.
	//
	// For example, if the test group `a` has 4 children that all share the same span
	// `X`:
	//
	//	         a
	//	   ╭───┬─┴─┬───╮
	//	   b   c   d   e
	//	[X,Y] [X] [X] [X,Z]
	//
	// Then span `X` can be promoted up to `a`:
	//
	//	      [X]
	//	       a
	//	 ╭───┬─┴─┬───╮
	//	 b   c   d   e
	//	[Y] []   [] [Z]
	//
	// ~ Optimization 2: Span XOR promotion ~
	//
	// This idea can be extended further, by not requiring all the children to share
	// the same span before promotion. If *most* child nodes share the same span, we
	// can still promote the span, but this time we *remove* the span from the
	// children *if they had it*, and *add* the span to children *if they didn't
	// have it*.
	//
	// For example, if the test group `a` has 4 children with 3 that share the span
	// `X`:
	//
	//	         a
	//	   ╭───┬─┴─┬───╮
	//	   b   c   d   e
	//	[X,Y] [X]  [] [X,Z]
	//
	// Then span `X` can be promoted up to `a` by flipping the presence of `X` on the
	// child nodes:
	//
	//	      [X]
	//	       a
	//	 ╭───┬─┴─┬───╮
	//	 b   c   d   e
	//	[Y] []  [X] [Z]
	//
	// This process repeats up the tree.
	//
	// With this optimization applied, we now need to traverse the tree from root to
	// leaf in order to know whether a given span is in use for the leaf node (test case):
	//
	// * If the span is encountered an odd number of times during traversal, then
	// the span is covered.
	// * If the span is encountered an even number of times during traversal, then
	// the span is not covered.
	//
	// See tools/src/cov/coverage_test.go for more examples of this optimization.
	//
	// ~ Optimization 3: Common span grouping ~
	//
	// With real world data, we encounter groups of spans that are commonly found
	// together. To further reduce coverage data, the whole graph is scanned for common
	// span patterns, and are indexed by each tree node.
	// The XOR'ing of spans as described above is performed as if the spans were not
	// grouped.
	//
	// ~ Optimization 4: Lookup tables ~
	//
	// All spans, span-groups and strings are stored in de-duplicated tables, and are
	// indexed wherever possible.
	func (t *Tree) Optimize() {
		log.Printf("Optimizing coverage tree...")

		// Every file is processed on its own goroutine: spans are first promoted
		// up the test tree, then the remaining span sets are folded into groups.
		var pending sync.WaitGroup
		for _, file := range t.files {
			file := file // per-iteration copy captured by the goroutine below
			pending.Add(1)
			go func() {
				defer pending.Done()
				var opt optimizer
				for idx, tc := range file.tcm {
					opt.invertForCommon(tc, &t.testRoot.children[idx])
				}
				opt.createGroups(file)
			}()
		}

		// Do not return until every file has been optimized.
		pending.Wait()
	}

	// optimizer is a field-less receiver for the coverage-tree optimization passes
	// (invertForCommon and createGroups).
	type optimizer struct{}

	// createGroups replaces repeated SpanSets in the file's coverage tree with
	// references to shared SpanGroups.
	//
	// Every distinct SpanSet holding at least minSpansInGroup spans becomes a
	// group. Groups are ordered from the largest set to the smallest and given IDs
	// in that order. Each group is then expressed as an extension of the first
	// strictly smaller group (in that order) whose spans it fully contains, if
	// there is one. The groups are stored in f.spangroups, and every TestCoverage
	// whose SpanSet matches a group has its Spans cleared and its Group set to the
	// ID of that group.
	//
	// f is taken by pointer because the method assigns f.spangroups; with a
	// by-value treeFile that assignment would only reach a local copy.
	func (o *optimizer) createGroups(f *treeFile) {
		const minSpansInGroup = 2

		type spansetKey string

		// Collect one representative of every distinct, large enough SpanSet.
		unique := map[spansetKey]SpanSet{}
		f.tcm.traverse(func(tc *TestCoverage) {
			if len(tc.Spans) < minSpansInGroup {
				return
			}
			k := spansetKey(tc.Spans.String())
			if _, seen := unique[k]; seen {
				return
			}
			unique[k] = tc.Spans
		})
		if len(unique) == 0 {
			return // Nothing worth grouping.
		}

		type spansetInfo struct {
			key spansetKey
			set SpanSet // fully expanded set
			grp SpanGroup
			id  SpanGroupID
		}
		infos := make([]*spansetInfo, 0, len(unique))
		for k, s := range unique {
			infos = append(infos, &spansetInfo{key: k, set: s, grp: SpanGroup{Spans: s}})
		}

		// Largest sets first. Sets of equal size are ordered by their span lists,
		// purely so that the output is stable.
		sort.Slice(infos, func(i, j int) bool {
			lhs, rhs := infos[i].set, infos[j].set
			if len(lhs) != len(rhs) {
				return len(lhs) > len(rhs)
			}
			return lhs.List().Compare(rhs.List()) == -1
		})

		// The sorted position doubles as the group ID.
		for i, info := range infos {
			info.id = SpanGroupID(i)
		}

		// For each group, look through the groups that follow it for the first
		// strictly smaller one it fully contains, and describe the larger group as
		// an extension of it. This is O(n^2) in the number of groups.
		for i := 0; i < len(infos)-1; i++ {
			outer := infos[i]
			for _, inner := range infos[i+1:] {
				if len(outer.set) <= len(inner.set) || !outer.set.containsAll(inner.set) {
					continue
				}
				base := inner.id // Private copy; never take the address of the iterator.
				outer.grp.Spans = outer.set.removeAll(inner.set)
				outer.grp.Extend = &base
				break
			}
		}

		// Index the groups by key for the lookup below, and publish them on the file.
		byKey := make(map[spansetKey]*spansetInfo, len(infos))
		f.spangroups = make(map[SpanGroupID]SpanGroup, len(infos))
		for _, info := range infos {
			byKey[info.key] = info
			f.spangroups[info.id] = info.grp
		}

		// Swap every matching SpanSet in the tree for a reference to its group.
		f.tcm.traverse(func(tc *TestCoverage) {
			info, ok := byKey[spansetKey(tc.Spans.String())]
			if !ok {
				return
			}
			tc.Spans = nil
			tc.Group = &info.id
		})
	}

	// invertForCommon promotes spans that are held by the majority of a node's
	// children up to the node itself.
	//
	// The children of tc are processed first, each on its own goroutine, and the
	// call waits for all of them. Afterwards, every span found on more than
	// len(t.children)/2 of the children of tc is inverted on tc and on each child
	// listed in t.indices. Children that report deletable() after the inversion
	// are removed from tc.Children.
	//
	// tc is taken by pointer because the method assigns tc.Spans; with a by-value
	// TestCoverage the promoted spans would only reach a local copy.
	func (o *optimizer) invertForCommon(tc *TestCoverage, t *Test) {
		var pending sync.WaitGroup
		for id, child := range tc.Children {
			id, child := id, child // per-iteration copies captured by the goroutine
			pending.Add(1)
			go func() {
				defer pending.Done()
				o.invertForCommon(child, &t.children[id])
			}()
		}
		pending.Wait()

		// Tally how many children currently carry each span.
		tally := map[SpanID]int{}
		for _, child := range tc.Children {
			for span := range child.Spans {
				tally[span]++
			}
		}

		// A span is promoted only when strictly more than this many children have it.
		threshold := len(t.children) / 2
		for span, n := range tally {
			if n <= threshold {
				continue
			}
			tc.Spans = tc.Spans.invert(span)
			for _, idx := range t.indices {
				node := tc.Children.index(idx)
				node.Spans = node.Spans.invert(span)
				if node.deletable() {
					delete(tc.Children, idx)
				}
			}
		}
	}