Regex fuzzer: identifier mutation

Mutates a WGSL-like string by replacing a randomly-selected identifier
with a different randomly-selected identifier.

Change-Id: Iecf45ad2800677cf3609b30d415520e5f2a05ba0
Reviewed-on: https://dawn-review.googlesource.com/c/tint/+/60561
Kokoro: Kokoro <noreply+kokoro@google.com>
Reviewed-by: Alastair Donaldson <afdx@google.com>
Commit-Queue: Alastair Donaldson <afdx@google.com>
diff --git a/fuzzers/tint_regex_fuzzer/regex_fuzzer_tests.cc b/fuzzers/tint_regex_fuzzer/regex_fuzzer_tests.cc
index f0f7f12..252958c 100644
--- a/fuzzers/tint_regex_fuzzer/regex_fuzzer_tests.cc
+++ b/fuzzers/tint_regex_fuzzer/regex_fuzzer_tests.cc
@@ -25,13 +25,13 @@
 
 // Swaps two non-consecutive regions in the edge
 TEST(SwapRegionsTest, SwapIntervalsEdgeNonConsecutive) {
-  std::string R1 = ";region1;", R2 = ";regionregion2",
+  std::string R1 = ";region1;", R2 = ";regionregion2;",
               R3 = ";regionregionregion3;";
   std::string all_regions = R1 + R2 + R3;
 
   // this call should swap R1 with R3.
-  SwapIntervals(0, R1.length() - 1, R1.length() + R2.length(),
-                all_regions.length() - 1, all_regions);
+  SwapIntervals(0, R1.length(), R1.length() + R2.length(), R3.length(),
+                all_regions);
 
   ASSERT_EQ(R3 + R2 + R1, all_regions);
 }
@@ -44,15 +44,15 @@
   std::string all_regions = R1 + R2 + R3 + R4 + R5;
 
   // this call should swap R2 with R4.
-  SwapIntervals(R1.length(), R1.length() + R2.length() - 1,
-                R1.length() + R2.length() + R3.length(),
-                R1.length() + R2.length() + R3.length() + R4.length() - 1,
+  SwapIntervals(R1.length(), R2.length(),
+                R1.length() + R2.length() + R3.length(), R4.length(),
                 all_regions);
 
   ASSERT_EQ(R1 + R4 + R3 + R2 + R5, all_regions);
 }
 
-// Swaps two consecutive regions not in the edge (sorrounded by other regions)
+// Swaps two consecutive regions not in the edge (surrounded by other
+// regions)
 TEST(SwapRegionsTest, SwapIntervalsConsecutiveEdge) {
   std::string R1 = ";region1;", R2 = ";regionregion2;",
               R3 = ";regionregionregion3;", R4 = ";regionregionregionregion4;",
@@ -60,9 +60,8 @@
   std::string all_regions = R1 + R2 + R3 + R4;
 
   // this call should swap R2 with R3.
-  SwapIntervals(R1.length(), R1.length() + R2.length() - 1,
-                R1.length() + R2.length(),
-                R1.length() + R2.length() + R3.length() - 1, all_regions);
+  SwapIntervals(R1.length(), R2.length(), R1.length() + R2.length(),
+                R3.length(), all_regions);
 
   ASSERT_EQ(R1 + R3 + R2 + R4, all_regions);
 }
@@ -76,12 +75,9 @@
   std::string all_regions = R1 + R2 + R3 + R4 + R5;
 
   // this call should swap R4 with R5.
-  SwapIntervals(
-      R1.length() + R2.length() + R3.length(),
-      R1.length() + R2.length() + R3.length() + R4.length() - 1,
-      R1.length() + R2.length() + R3.length() + R4.length(),
-      R1.length() + R2.length() + R3.length() + R4.length() + R5.length() - 1,
-      all_regions);
+  SwapIntervals(R1.length() + R2.length() + R3.length(), R4.length(),
+                R1.length() + R2.length() + R3.length() + R4.length(),
+                R5.length(), all_regions);
 
   ASSERT_EQ(R1 + R2 + R3 + R5 + R4, all_regions);
 }
@@ -94,7 +90,7 @@
   std::string all_regions = R1 + R2 + R3 + R4 + R5;
 
   // This call should delete R1.
-  DeleteInterval(0, R1.length() - 1, all_regions);
+  DeleteInterval(0, R1.length(), all_regions);
 
   ASSERT_EQ(";" + R2 + R3 + R4 + R5, all_regions);
 }
@@ -108,7 +104,7 @@
 
   // This call should delete R5.
   DeleteInterval(R1.length() + R2.length() + R3.length() + R4.length(),
-                 all_regions.length() - 1, all_regions);
+                 R5.length(), all_regions);
 
   ASSERT_EQ(R1 + R2 + R3 + R4 + ";", all_regions);
 }
@@ -121,8 +117,7 @@
   std::string all_regions = R1 + R2 + R3 + R4 + R5;
 
   // This call should delete R3.
-  DeleteInterval(R1.length() + R2.length(),
-                 R1.length() + R2.length() + R3.length() - 1, all_regions);
+  DeleteInterval(R1.length() + R2.length(), R3.length(), all_regions);
 
   ASSERT_EQ(R1 + R2 + ";" + R4 + R5, all_regions);
 }
@@ -134,7 +129,7 @@
   std::string all_regions = R1 + R2 + R3 + R4 + R5;
 
   // This call should insert R2 after R4.
-  DuplicateInterval(R1.length(), R1.length() + R2.length() - 1,
+  DuplicateInterval(R1.length(), R2.length(),
                     R1.length() + R2.length() + R3.length() + R4.length() - 1,
                     all_regions);
 
@@ -149,9 +144,8 @@
   std::string all_regions = R1 + R2 + R3 + R4 + R5;
 
   // This call should insert R3 after R1.
-  DuplicateInterval(R1.length() + R2.length(),
-                    R1.length() + R2.length() + R3.length() - 1,
-                    R1.length() - 1, all_regions);
+  DuplicateInterval(R1.length() + R2.length(), R3.length(), R1.length() - 1,
+                    all_regions);
 
   ASSERT_EQ(R1 + R3.substr(1, R3.length() - 1) + R2 + R3 + R4 + R5,
             all_regions);
@@ -165,13 +159,73 @@
   std::string all_regions = R1 + R2 + R3 + R4 + R5;
 
   // This call should insert R2 after R5.
-  DuplicateInterval(R1.length(), R1.length() + R2.length() - 1,
-                    all_regions.length() - 1, all_regions);
+  DuplicateInterval(R1.length(), R2.length(), all_regions.length() - 1,
+                    all_regions);
 
   ASSERT_EQ(R1 + R2 + R3 + R4 + R5 + R2.substr(1, R2.length() - 1),
             all_regions);
 }
 
+TEST(ReplaceIdentifierTest, ReplaceIdentifierTest1) {
+  std::string R1 = "|region1|", R2 = "; region2;",
+              R3 = "---------region3---------", R4 = "++region4++",
+              R5 = "***region5***";
+  std::string all_regions = R1 + R2 + R3 + R4 + R5;
+
+  // Replaces R3 with R1.
+  ReplaceRegion(0, R1.length(), R1.length() + R2.length(), R3.length(),
+                all_regions);
+
+  ASSERT_EQ(R1 + R2 + R1 + R4 + R5, all_regions);
+}
+
+TEST(ReplaceIdentifierTest, ReplaceIdentifierTest2) {
+  std::string R1 = "|region1|", R2 = "; region2;",
+              R3 = "---------region3---------", R4 = "++region4++",
+              R5 = "***region5***";
+  std::string all_regions = R1 + R2 + R3 + R4 + R5;
+
+  // Replaces R5 with R3.
+  ReplaceRegion(R1.length() + R2.length(), R3.length(),
+                R1.length() + R2.length() + R3.length() + R4.length(),
+                R5.length(), all_regions);
+
+  ASSERT_EQ(R1 + R2 + R3 + R4 + R3, all_regions);
+}
+
+TEST(GetIdentifierTest, GetIdentifierTest1) {
+  std::string wgsl_code =
+      "fn clamp_0acf8f() {"
+      "var res: vec2<f32> = clamp(vec2<f32>(), vec2<f32>(), vec2<f32>());}"
+      "[[stage(vertex)]]"
+      "fn vertex_main() -> [[builtin(position)]] vec4<f32> {"
+      "  clamp_0acf8f();"
+      "  return vec4<f32>();}"
+      "[[stage(fragment)]]"
+      "fn fragment_main() {"
+      "  clamp_0acf8f();}"
+      "[[stage(compute), workgroup_size(1)]]"
+      "fn compute_main() {"
+      "var<private> foo: f32 = 0.0;"
+      "  clamp_0acf8f();}";
+
+  std::vector<std::pair<size_t, size_t>> identifiers_pos =
+      GetIdentifiers(wgsl_code);
+
+  std::vector<std::pair<size_t, size_t>> ground_truth = {
+      std::make_pair(3, 12),   std::make_pair(19, 3),  std::make_pair(28, 4),
+      std::make_pair(40, 5),   std::make_pair(51, 3),  std::make_pair(59, 4),
+      std::make_pair(72, 4),   std::make_pair(88, 5),  std::make_pair(103, 2),
+      std::make_pair(113, 4),  std::make_pair(125, 7), std::make_pair(145, 4),
+      std::make_pair(158, 12), std::make_pair(175, 6), std::make_pair(187, 3),
+      std::make_pair(197, 5),  std::make_pair(214, 2), std::make_pair(226, 4),
+      std::make_pair(236, 12), std::make_pair(254, 5), std::make_pair(270, 14),
+      std::make_pair(289, 2),  std::make_pair(300, 4), std::make_pair(308, 3),
+      std::make_pair(321, 3),  std::make_pair(326, 3), std::make_pair(338, 12)};
+
+  ASSERT_EQ(ground_truth, identifiers_pos);
+}
+
 }  // namespace
 }  // namespace regex_fuzzer
 }  // namespace fuzzers
diff --git a/fuzzers/tint_regex_fuzzer/wgsl_mutator.cc b/fuzzers/tint_regex_fuzzer/wgsl_mutator.cc
index 580e3df..cb1f850 100644
--- a/fuzzers/tint_regex_fuzzer/wgsl_mutator.cc
+++ b/fuzzers/tint_regex_fuzzer/wgsl_mutator.cc
@@ -40,31 +40,68 @@
   return result;
 }
 
-void SwapIntervals(size_t idx1,
-                   size_t idx2,
-                   size_t idx3,
-                   size_t idx4,
-                   std::string& wgsl_code) {
-  std::string region_1 = wgsl_code.substr(idx1 + 1, idx2 - idx1);
+std::vector<std::pair<size_t, size_t>> GetIdentifiers(
+    const std::string& wgsl_code) {
+  std::vector<std::pair<size_t, size_t>> result;
 
-  std::string region_2 = wgsl_code.substr(idx3 + 1, idx4 - idx3);
+  // This regular expression works by looking for a character that
+  // is not part of an identifier followed by a WGSL identifier, followed
+  // by a character which cannot be part of a WGSL identifier. The regex
+  // for the WGSL identifier is obtained from:
+  // https://www.w3.org/TR/WGSL/#identifiers.
+  std::regex wgsl_identifier_regex(
+      "[^a-zA-Z]([a-zA-Z][0-9a-zA-Z_]*)[^0-9a-zA-Z_]");
+
+  std::smatch match;
+
+  std::string::const_iterator search_start(wgsl_code.cbegin());
+  std::string prefix;
+
+  while (regex_search(search_start, wgsl_code.cend(), match,
+                      wgsl_identifier_regex) == true) {
+    prefix += match.prefix();
+    result.push_back(std::make_pair(prefix.size() + 1, match.str(1).size()));
+    prefix += match.str(0);
+    search_start = match.suffix().first;
+  }
+  return result;
+}
+
+void SwapIntervals(size_t idx1,
+                   size_t reg1_len,
+                   size_t idx2,
+                   size_t reg2_len,
+                   std::string& wgsl_code) {
+  std::string region_1 = wgsl_code.substr(idx1 + 1, reg1_len - 1);
+
+  std::string region_2 = wgsl_code.substr(idx2 + 1, reg2_len - 1);
 
   // The second transformation is done first as it doesn't affect ind1 and ind2
-  wgsl_code.replace(idx3 + 1, region_2.size(), region_1);
+  wgsl_code.replace(idx2 + 1, region_2.size(), region_1);
 
   wgsl_code.replace(idx1 + 1, region_1.size(), region_2);
 }
 
-void DeleteInterval(size_t idx1, size_t idx2, std::string& wgsl_code) {
-  wgsl_code.erase(idx1 + 1, idx2 - idx1);
+void DeleteInterval(size_t idx1, size_t reg_len, std::string& wgsl_code) {
+  wgsl_code.erase(idx1 + 1, reg_len - 1);
 }
 
 void DuplicateInterval(size_t idx1,
+                       size_t reg1_len,
                        size_t idx2,
-                       size_t idx3,
                        std::string& wgsl_code) {
-  std::string region = wgsl_code.substr(idx1 + 1, idx2 - idx1);
-  wgsl_code.insert(idx3 + 1, region);
+  std::string region = wgsl_code.substr(idx1 + 1, reg1_len - 1);
+  wgsl_code.insert(idx2 + 1, region);
+}
+
+void ReplaceRegion(size_t idx1,
+                   size_t id1_len,
+                   size_t idx2,
+                   size_t id2_len,
+                   std::string& wgsl_code) {
+  std::string region_1 = wgsl_code.substr(idx1, id1_len);
+  std::string region_2 = wgsl_code.substr(idx2, id2_len);
+  wgsl_code.replace(idx2, region_2.size(), region_1);
 }
 
 bool SwapRandomIntervals(const std::string& delimiter,
@@ -89,8 +126,10 @@
   size_t ind4 = GetRandomIntFromRange(
       ind3 + 1U, delimiter_positions.size() - 1U, generator);
 
-  SwapIntervals(delimiter_positions[ind1], delimiter_positions[ind2],
-                delimiter_positions[ind3], delimiter_positions[ind4],
+  SwapIntervals(delimiter_positions[ind1],
+                delimiter_positions[ind2] - delimiter_positions[ind1],
+                delimiter_positions[ind3],
+                delimiter_positions[ind4] - delimiter_positions[ind3],
                 wgsl_code);
 
   return true;
@@ -112,7 +151,8 @@
   size_t ind2 = GetRandomIntFromRange(
       ind1 + 1U, delimiter_positions.size() - 1U, generator);
 
-  DeleteInterval(delimiter_positions[ind1], delimiter_positions[ind2],
+  DeleteInterval(delimiter_positions[ind1],
+                 delimiter_positions[ind2] - delimiter_positions[ind1],
                  wgsl_code);
 
   return true;
@@ -137,12 +177,40 @@
   size_t ind3 =
       GetRandomIntFromRange(0, delimiter_positions.size() - 1U, generator);
 
-  DuplicateInterval(delimiter_positions[ind1], delimiter_positions[ind2],
+  DuplicateInterval(delimiter_positions[ind1],
+                    delimiter_positions[ind2] - delimiter_positions[ind1],
                     delimiter_positions[ind3], wgsl_code);
 
   return true;
 }
 
+bool ReplaceRandomIdentifier(std::string& wgsl_code, std::mt19937& generator) {
+  std::vector<std::pair<size_t, size_t>> identifiers =
+      GetIdentifiers(wgsl_code);
+
+  // Need at least 2 identifiers
+  if (identifiers.size() < 2) {
+    return false;
+  }
+
+  size_t id1_index =
+      GetRandomIntFromRange(0, identifiers.size() - 1U, generator);
+
+  size_t id2_index =
+      GetRandomIntFromRange(0, identifiers.size() - 1U, generator);
+
+  // The two identifiers must be different
+  while (id1_index == id2_index) {
+    id2_index = GetRandomIntFromRange(0, identifiers.size() - 1U, generator);
+  }
+
+  ReplaceRegion(identifiers[id1_index].first, identifiers[id1_index].second,
+                identifiers[id2_index].first, identifiers[id2_index].second,
+                wgsl_code);
+
+  return true;
+}
+
 }  // namespace regex_fuzzer
 }  // namespace fuzzers
 }  // namespace tint
diff --git a/fuzzers/tint_regex_fuzzer/wgsl_mutator.h b/fuzzers/tint_regex_fuzzer/wgsl_mutator.h
index 7f1a468..bd7975b 100644
--- a/fuzzers/tint_regex_fuzzer/wgsl_mutator.h
+++ b/fuzzers/tint_regex_fuzzer/wgsl_mutator.h
@@ -32,38 +32,58 @@
 std::vector<size_t> FindDelimiterIndices(const std::string& delimiter,
                                          const std::string& wgsl_code);
 
+/// A function that finds all the identifiers in a WGSL-like string.
+/// @param wgsl_code - the WGSL-like string where the identifiers will be found.
+/// @return a vector with the positions and the length of all the
+/// identifiers in wgsl_code.
+std::vector<std::pair<size_t, size_t>> GetIdentifiers(
+    const std::string& wgsl_code);
+
 /// Given 4 indices, idx1, idx2, idx3 and idx4 it swaps the regions
 /// in the interval (idx1, idx2] with the region in the interval (idx3, idx4]
 /// in wgsl_text.
 /// @param idx1 - starting index of the first region.
-/// @param idx2 - terminating index of the second region.
-/// @param idx3 - starting index of the second region.
-/// @param idx4 - terminating index of the second region.
+/// @param reg1_len - length of the first region.
+/// @param idx2 - starting index of the second region.
+/// @param reg2_len - length of the second region.
 /// @param wgsl_code - the string where the swap will occur.
 void SwapIntervals(size_t idx1,
+                   size_t reg1_len,
                    size_t idx2,
-                   size_t idx3,
-                   size_t idx4,
+                   size_t reg2_len,
                    std::string& wgsl_code);
 
-/// Given 2 indices, idx1, idx2, it delets the region in the interval (idx1,
-/// idx2].
+/// Given index idx1 it deletes the region of length reg_len starting at
+/// index idx1, keeping the character at idx1 itself.
 /// @param idx1 - starting index of the first region.
-/// @param idx2 - terminating index of the second region.
+/// @param reg_len - length of the region.
 /// @param wgsl_code - the string where the swap will occur.
-void DeleteInterval(size_t idx1, size_t idx2, std::string& wgsl_code);
+void DeleteInterval(size_t idx1, size_t reg_len, std::string& wgsl_code);
 
-/// Given 3 indices, idx1, idx2, and idx3 it inserts the
-/// region in (idx1, idx2] after idx3.
+/// Given 2 indices, idx1, idx2, it inserts the region of length
+/// reg1_len starting at idx1 after idx2.
 /// @param idx1 - starting index of region.
-/// @param idx2 - terminating index of the region.
-/// @param idx3 - the position where the region will be inserted.
+/// @param reg1_len - length of the region.
+/// @param idx2 - the position where the region will be inserted.
 /// @param wgsl_code - the string where the swap will occur.
 void DuplicateInterval(size_t idx1,
+                       size_t reg1_len,
                        size_t idx2,
-                       size_t idx3,
                        std::string& wgsl_code);
 
+/// Replaces a region of a WGSL-like string of length id2_len starting
+/// at position idx2 with a region of length id1_len starting at
+/// position idx1.
+/// @param idx1    -   starting position of the first region.
+/// @param id1_len -   length of the first region.
+/// @param idx2    -   starting position of the second region.
+/// @param id2_len -   length of the second region.
+void ReplaceRegion(size_t idx1,
+                   size_t id1_len,
+                   size_t idx2,
+                   size_t id2_len,
+                   std::string& wgsl_code);
+
 /// A function that, given WGSL-like string and a delimiter,
 /// generates another WGSL-like string by picking two random regions
 /// enclosed by the delimiter and swapping them.
@@ -97,6 +117,13 @@
                              std::string& wgsl_code,
                              std::mt19937& generator);
 
+/// Replaces a random identifier in wgsl_code.
+/// @param wgsl_code - WGSL-like string where the replacement will occur.
+/// @param generator - the random number generator.
+/// @return true if a replacement happened or false otherwise.
+
+bool ReplaceRandomIdentifier(std::string& wgsl_code, std::mt19937& generator);
+
 }  // namespace regex_fuzzer
 }  // namespace fuzzers
 }  // namespace tint