2020#include " GPUCommonAlgorithm.h"
2121
2222#ifndef GPUCA_GPUCODE
23+ #include " MCLabelAccumulator.h"
2324#include " utils/VcShim.h"
2425#endif
2526
@@ -504,7 +505,7 @@ GPUd() void GPUTPCCFHIPTailConnector::Thread<0>(int32_t nBlocks, int32_t nThread
504505// ======== HIP Clusterizer Kernel ========
505506
506507template <>
507- GPUd () void GPUTPCCFHIPClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer)
508+ GPUd () void GPUTPCCFHIPClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads, int32_t iBlock, int32_t iThread, GPUSharedMemory& smem, processorType& clusterer, uint8_t onlyMC )
508509{
509510 if (iBlock >= (int32_t )GPUTPCGeometry::NROWS ) {
510511 return ;
@@ -514,33 +515,32 @@ GPUd() void GPUTPCCFHIPClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads,
514515 uint32_t nTails = clusterer.mPnHIPTails [row];
515516 nTails = CAMath::Min (nTails, (uint32_t )MaxHIPTailsPerRow - 1 );
516517
517- HIPTailDescriptor * tails = GetHIPTails (clusterer, row);
518+ const auto * tails = GetHIPTails (clusterer, row);
518519 const auto & fragment = clusterer.mPmemory ->fragment ;
519520
520- for (uint32_t iTail = iThread + 1 ; iTail <= nTails; iTail += nThreads) {
521+ auto * clusterPosInRow = clusterer.mPhipClusterPosInRow
522+ ? clusterer.mPhipClusterPosInRow + row * MaxHIPTailsPerRow
523+ : nullptr ;
521524
522- auto * tail = &tails[ iTail];
525+ for ( uint32_t iTail = iThread + 1 ; iTail <= nTails; iTail += nThreads) {
523526
527+ const auto * tail = &tails[iTail];
524528 if (tail->iPrev != 0 ) {
525529 continue ;
526530 }
527531
528- float qTot = tail->qTot ;
529- float qMax = tail->qMax ;
530- const float firstWeight = tail->qTot ;
531- const float firstPad = tail->pad ;
532- const float firstTime = HIPTailTimeMean (*tail);
533- float padSum = firstWeight * firstPad;
534- float padSqSum = firstWeight * firstPad * firstPad;
535- float timeSum = firstWeight * firstTime;
532+ CPU_ONLY (auto labelAcc = MCLabelAccumulator{clusterer});
536533
537- uint32_t tailStart = tail->tailStart ;
538- uint32_t tailEnd = tail->tailEnd ;
539-
540- while (tail->iNext != 0 ) {
541-
542- tail = &tails[tail->iNext ];
534+ float qTot = 0 ;
535+ float qMax = 0 ;
536+ float padSum = 0 ;
537+ float padSqSum = 0 ;
538+ float timeSum = 0 ;
539+ uint32_t tailStart = (uint32_t )-1 ;
540+ uint32_t tailEnd = 0 ;
543541
542+ // The zeroth element of tails is the empty sentinel tail: reaching it (iNext == 0) ends the chain walk.
543+ for (; tail != tails; tail = &tails[tail->iNext ]) {
544544 const float tailWeight = tail->qTot ;
545545 const float tailPad = tail->pad ;
546546 const float tailTime = HIPTailTimeMean (*tail);
@@ -551,12 +551,14 @@ GPUd() void GPUTPCCFHIPClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads,
551551 timeSum += tailWeight * tailTime;
552552 tailStart = CAMath::Min<uint32_t >(tailStart, tail->tailStart );
553553 tailEnd = CAMath::Max<uint32_t >(tailEnd, tail->tailEnd );
554+
555+ CPU_ONLY (labelAcc.collectTail (row, tail->pad , tail->tailStart , tail->tailEnd ));
554556 }
555557
556558 const float weightSum = CAMath::Max (qTot, 1 .f );
557- float padMean = padSum / weightSum;
558- float timeMean = timeSum / weightSum; // TODO: Use timebin of saturated signal instead! Time mean is biased for long tails.
559- float padSigma = CAMath::Sqrt (CAMath::Max (0 .f , padSqSum / weightSum - padMean * padMean));
559+ const float padMean = padSum / weightSum;
560+ const float timeMean = timeSum / weightSum; // TODO: Use timebin of saturated signal instead! Time mean is biased for long tails.
561+ const float padSigma = CAMath::Sqrt (CAMath::Max (0 .f , padSqSum / weightSum - padMean * padMean));
560562
561563 tpc::ClusterNative cn;
562564 cn.qMax = qMax;
@@ -568,13 +570,26 @@ GPUd() void GPUTPCCFHIPClusterizer::Thread<0>(int32_t nBlocks, int32_t nThreads,
568570 cn.setSigmaPad (padSigma);
569571
570572 if (cn.qMax >= 1023 ) {
571- // Cut off clusters where the tail connection failed for some reason
572- // TODO: Deduplicate with GPUTPCCFClusterizer::sortIntoBuckets (can't call cross-kernel).
573- // TODO: Add error reporting for row cluster overflow.
574- uint32_t index = CAMath::AtomicAdd (&clusterer.mPclusterInRow [row], 1u );
575- if (index < clusterer.mNMaxClusterPerRow ) {
576- clusterer.mPclusterByRow [clusterer.mNMaxClusterPerRow * row + index] = cn;
573+
574+ uint32_t index;
575+
576+ if (!onlyMC) {
577+ // Cut off clusters where the tail connection failed for some reason
578+ // TODO: Deduplicate with GPUTPCCFClusterizer::sortIntoBuckets (can't call cross-kernel).
579+ // TODO: Add error reporting for row cluster overflow.
580+ index = CAMath::AtomicAdd (&clusterer.mPclusterInRow [row], 1u );
581+ if (index < clusterer.mNMaxClusterPerRow ) {
582+ clusterer.mPclusterByRow [clusterer.mNMaxClusterPerRow * row + index] = cn;
583+ }
584+ if (clusterPosInRow) {
585+ clusterPosInRow[iTail] = index;
586+ }
587+ } else {
588+ index = clusterPosInRow[iTail];
577589 }
590+
591+ CPU_ONLY (labelAcc.commit (row, index, clusterer.mNMaxClusterPerRow ));
578592 }
579- }
593+
594+ } // for (uint32_t iTail = iThread + 1; iTail <= nTails; iTail += nThreads)
580595}
0 commit comments