Add multi-threading support to the processing of tensor core planes.

muditgokhale2 · copybara-github · commit 63964151121c · 2025-10-06T01:00:56.000-07:00
PiperOrigin-RevId: 815596292
diff --git a/xprof/utils/BUILD b/xprof/utils/BUILD
@@ -235,6 +235,7 @@ cc_test(
         "@xla//xla/tsl/profiler/utils:xplane_builder",
         "@xla//xla/tsl/profiler/utils:xplane_schema",
         "@xla//xla/tsl/profiler/utils:xplane_test_utils",
+        "@xla//xla/tsl/profiler/utils:xplane_utils",
         "@xla//xla/tsl/profiler/utils:xplane_visitor",
     ],
 )
diff --git a/xprof/utils/derived_timeline.cc b/xprof/utils/derived_timeline.cc
@@ -743,10 +743,21 @@ void GenerateDerivedTimeLines(
   if (host_plane) {
     DeriveEventsFromHostTrace(host_plane, group_metadata_map, device_planes);
   }
-  for (XPlane* plane : FindMutableTensorCorePlanes(space)) {
-    DeriveLinesFromStats(plane);
-    tsl::profiler::SortXPlane(plane);
+
+  std::vector<XPlane*> tensor_core_planes = FindMutableTensorCorePlanes(space);
+
+  int thread_pool_size = std::min(tsl::port::MaxParallelism(),
+                                  static_cast<int>(device_planes.size()));
+  auto plane_processing_executor = std::make_unique<XprofThreadPoolExecutor>(
+      "ProcessTensorCorePlanes", thread_pool_size);
+  // TODO(b/449633660) Analyze multi-threading inside DeriveLinesFromStats.
+  for (XPlane* plane : tensor_core_planes) {
+    plane_processing_executor->Execute([plane]() {
+      DeriveLinesFromStats(plane);
+      tsl::profiler::SortXPlane(plane);
+    });
   }
+  plane_processing_executor->JoinAll();
 }
 
 void DeriveLinesFromStats(XPlane* device_trace) {
diff --git a/xprof/utils/derived_timeline_test.cc b/xprof/utils/derived_timeline_test.cc
@@ -36,6 +36,7 @@ limitations under the License.
 #include "xla/tsl/profiler/utils/xplane_builder.h"
 #include "xla/tsl/profiler/utils/xplane_schema.h"
 #include "xla/tsl/profiler/utils/xplane_test_utils.h"
+#include "xla/tsl/profiler/utils/xplane_utils.h"
 #include "xla/tsl/profiler/utils/xplane_visitor.h"
 #include "tsl/profiler/protobuf/xplane.pb.h"
 
@@ -661,6 +662,78 @@ TEST(DerivedTimelineTest, EnsureAllGpuEventsAreGrouped) {
   });
 }
 
+// Verifies that GenerateDerivedTimeLines sorts the events of every TPU plane
+// and derives one merged TF Op event per plane (multi-threaded code path).
+TEST(DerivedTimelineTest, MultiThreadedTensorCorePlaneProcessing) {
+  constexpr int kNumPlanes = 4;
+  constexpr int kNumEvents = 10;
+  constexpr int kEventDurationPs = 100;
+
+  XSpace space;
+  tsl::profiler::GroupMetadataMap group_metadata_map;
+
+  // Build kNumPlanes TPU planes; each gets one line (id 0) of unsorted events.
+  for (int i = 0; i < kNumPlanes; ++i) {
+    XPlane* plane = tsl::profiler::GetOrCreateTpuXPlane(
+        &space, /*device_ordinal=*/i, "TPU V4", 0, 0);
+    XPlaneBuilder plane_builder(plane);
+    auto line_builder = plane_builder.GetOrCreateLine(0);
+    const std::string tf_op_name = absl::StrCat("MyOp:", i);
+
+    for (int j = 0; j < kNumEvents; ++j) {
+      // Offsets decrease as j grows, so events are appended in reverse order.
+      int64_t offset = (kNumEvents - 1 - j) * kEventDurationPs * 2;
+      CreateXEvent(&plane_builder, &line_builder, "kernel", offset,
+                   kEventDurationPs, {{StatType::kTfOp, tf_op_name}});
+    }
+  }
+
+  // Exercises the multi-threaded Tensor Core plane processing under test.
+  GenerateDerivedTimeLines(group_metadata_map, &space);
+
+  // Check each plane independently, looked up by its device name.
+  for (int i = 0; i < kNumPlanes; ++i) {
+    const std::string plane_name = absl::StrCat("/device:TPU:", i);
+    const XPlane* plane = tsl::profiler::FindPlaneWithName(space, plane_name);
+    ASSERT_NE(plane, nullptr);
+    XPlaneVisitor plane_visitor = tsl::profiler::CreateTfXPlaneVisitor(plane);
+
+    // 1. The original line (id 0) must now hold its events in offset order.
+    const XLine* original_line = nullptr;
+    for (const auto& line : plane->lines()) {
+      if (line.id() == 0) {
+        original_line = &line;
+        break;
+      }
+    }
+    ASSERT_NE(original_line, nullptr);
+
+    int64_t last_timestamp_ps = -1;
+    for (const auto& event : original_line->events()) {
+      ASSERT_GE(event.offset_ps(), last_timestamp_ps);
+      last_timestamp_ps = event.offset_ps();
+    }
+    EXPECT_EQ(original_line->events_size(), kNumEvents);
+
+    // 2. A derived line named kTensorFlowOpLineName must exist on the plane.
+    bool tf_op_line_found = false;
+    plane_visitor.ForEachLine([&](const XLineVisitor& line_visitor) {
+      if (line_visitor.Name() == tsl::profiler::kTensorFlowOpLineName) {
+        tf_op_line_found = true;
+        EXPECT_EQ(line_visitor.NumEvents(), 1);  // Should be merged into one.
+        line_visitor.ForEachEvent([&](const XEventVisitor& event) {
+          EXPECT_EQ(event.Name(), absl::StrCat("MyOp:", i));
+          // Spans from the earliest offset (0) to the end of the latest event.
+          int64_t expected_duration =
+              (kNumEvents - 1) * kEventDurationPs * 2 + kEventDurationPs;
+          EXPECT_EQ(event.DurationPs(), expected_duration);
+        });
+      }
+    });
+    EXPECT_TRUE(tf_op_line_found);
+  }
+}
+
 }  // namespace
 }  // namespace profiler
 }  // namespace tensorflow

Original file line number	Diff line number	Diff line change
`@@ -235,6 +235,7 @@ cc_test(`
`235`	`235`	`"@xla//xla/tsl/profiler/utils:xplane_builder",`
`236`	`236`	`"@xla//xla/tsl/profiler/utils:xplane_schema",`
`237`	`237`	`"@xla//xla/tsl/profiler/utils:xplane_test_utils",`
	`238`	`+ "@xla//xla/tsl/profiler/utils:xplane_utils",`
`238`	`239`	`"@xla//xla/tsl/profiler/utils:xplane_visitor",`
`239`	`240`	`],`
`240`	`241`	`)`