#include "perf_tools.h"

#include <algorithm>

namespace PerfTools {

namespace {
	// Records the basic block that just ended in the length histogram (growing the
	// histogram when the length has not been seen before) and starts a new block.
	template <typename Histogram_t, typename Length_t>
	void CloseBasicBlock(Histogram_t &aHistogram, Length_t &aCurrentLength) {
		if (aHistogram.size() <= aCurrentLength) aHistogram.resize(aCurrentLength + 1);
		++(aHistogram[aCurrentLength]);
		aCurrentLength = 0;
	}
}

// Returns aCount expressed as a percentage of aTotal, formatted through DoublePrinter
// and followed by a '%' sign.
// An empty population (aTotal == 0) is reported as 0% instead of dividing by zero.
string Percentage(uint32_t aCount, uint32_t aTotal) {
	const double Percent = (aTotal == 0) ? 0.0 : 100.0 * double(aCount) / double(aTotal);
	stringstream StrStrm;
	StrStrm << DoublePrinter(Percent, 5, 2, true) << "%";
	return StrStrm.str();
}

// ---------------------------------------------------------------------------
// Pipeline_c
// ---------------------------------------------------------------------------

// Advances the pipeline by aAmount ticks and forwards the same amount to every
// registered resource.
void Pipeline_c::Tick(uint32_t aAmount) {
	mTickCount += aAmount;
	for (auto &Resource : mResources) Resource->Tick(aAmount);
}

// Accounts aAmount ticks as stalls. A stall also advances time, so it is forwarded to Tick().
void Pipeline_c::Stall(uint32_t aAmount) {
	mStallCount += aAmount;
	Tick(aAmount);
}

// Writes the tick/stall counters and the fraction of non-stalled ticks to aStream.
void Pipeline_c::Print(ostream &aStream) const {
	// Before the first tick there is nothing to average over; print 0 instead of dividing by zero.
	const double AverageIpc = (mTickCount == 0) ? 0.0 : double(mTickCount-mStallCount)/double(mTickCount);
	aStream << "Pipeline summary: " << endl;
	aStream << "===========================" << endl;
	aStream << " Number of cycles: " << DecPrinter(mTickCount) << endl;
	aStream << " Number of stalls: " << DecPrinter(mStallCount) << endl;
	aStream << " Average IPC: " << DoublePrinter(AverageIpc,4,4,true) << endl;
}

// Registers a resource that gets ticked together with the pipeline.
// The pointer is stored as-is; nothing in this file releases it.
void Pipeline_c::AddResource(Resource_i *aResource) {
	mResources.push_back(aResource);
}

// ---------------------------------------------------------------------------
// InstFetch_c
// ---------------------------------------------------------------------------

// Updates the modelled prefetch amount as time passes, unless the tick is caused
// by this unit's own fetch stall (see ExtraFetch).
// NOTE(review): 'max' makes half the buffer size a lower bound rather than a cap, so the
// amount is never limited from above here - confirm that this is the intended model.
void InstFetch_c::Tick(uint32_t aAmount) {
	if (!mInternalStall) mPrefetchAmount = max(mPrefetchBufferSize/2, mPrefetchAmount + aAmount);
}

// Models the fetch of one instruction: performs the fetch and extends the current basic block.
void InstFetch_c::Fetch(bool aHidden) {
	// CRAY_ASSERT(mInstructionFetches <= 7020670); // Terminate after a set amount of instructions for easy benchmarking
	ExtraFetch(aHidden);
	++mCurrentBasicBlockLength;
	++mInstructionFetches;
}

// Consumes one entry from the prefetch buffer. When the buffer is empty, the parent is
// stalled for the memory read latency first. A non-hidden fetch additionally ticks the parent.
void InstFetch_c::ExtraFetch(bool aHidden) {
	if (mPrefetchAmount == 0) {
		uint32_t Stall = mMemoryModel.ReadLatency();
		// Tick() ignores ticks while mInternalStall is set, so the stall reported below
		// does not count as prefetch time for this unit.
		mInternalStall = true;
		mParent.Stall(Stall);
		mStallCount += Stall;
		mInternalStall = false;
		// A hidden fetch does not tick the parent below, so provide the entry that gets consumed.
		if (aHidden) mPrefetchAmount = 1;
	}
	if (!aHidden) mParent.Tick();
	CRAY_ASSERT(mPrefetchAmount > 0);
	--mPrefetchAmount;
	++mFetchCount;
	// NOTE(review): same 'max' pattern as in Tick() - confirm the intended bound.
	mPostExecAmount = max(mPrefetchAmount/2, mPostExecAmount+1);
}

// Reports a stall of aAmount ticks caused by branch target calculation.
void InstFetch_c::TargetCalcStall(uint32_t aAmount) {
	mStallCount += aAmount;
	mTargetCallStallCount += aAmount;
	mParent.Stall(aAmount);
}

// Models a short (relative) jump. The jump always ends the current basic block. The buffers
// are kept only when a forward target is within mPrefetchAmount or a backward target is
// within mPostExecAmount; any other offset (including 0) flushes both.
void InstFetch_c::ShortJump(int32_t aOffset) {
	CloseBasicBlock(mBasicBlockLengths, mCurrentBasicBlockLength);
	++mShortJumpCount;
	const bool ForwardInBuffer = aOffset > 0 && aOffset <= static_cast<int32_t>(mPrefetchAmount);
	const bool BackwardInBuffer = aOffset < 0 && -aOffset <= static_cast<int32_t>(mPostExecAmount);
	if (ForwardInBuffer || BackwardInBuffer) return;
	++mOutOfBufferShortJumpCount;
	mPrefetchAmount = 0;
	mPostExecAmount = 0;
}

// Models a long jump: always flushes both buffers and ends the current basic block.
void InstFetch_c::LongJump() {
	++mLongJumpCount;
	mPrefetchAmount = 0;
	mPostExecAmount = 0;
	CloseBasicBlock(mBasicBlockLengths, mCurrentBasicBlockLength);
}

// Models an interrupt: flushes both buffers. The basic block histogram is not updated here.
void InstFetch_c::Interrupt() {
	++mInterruptCount;
	mPrefetchAmount = 0;
	mPostExecAmount = 0;
}

// Writes the fetch statistics and the basic block length histogram to aStream.
void InstFetch_c::Print(ostream &aStream) const {
	aStream << "Instruction buffer summary: " << endl;
	aStream << "===========================" << endl;
	aStream << " Number of instruction fetches: " << DecPrinter(mInstructionFetches) << endl;
	aStream << " Number of fetches: " << DecPrinter(mFetchCount) << endl;
	aStream << " Number of stalls: " << DecPrinter(mStallCount) << endl;
	aStream << " Number of target calc stalls: " << DecPrinter(mTargetCallStallCount) << endl;
	aStream << " Number of short jumps: " << DecPrinter(mShortJumpCount) << endl;
	aStream << " Number of out of buffer short jumps: " << DecPrinter(mOutOfBufferShortJumpCount) << endl;
	aStream << " Number of long jumps: " << DecPrinter(mLongJumpCount) << endl;
	aStream << " Number of interrupts: " << DecPrinter(mInterruptCount) << endl;
	aStream << " Basic block length histogram: " << endl;
	uint32_t Sum = 0;
	// NOTE(review): the text between 'i' and '= TerminateSum' on the next line appears to have
	// been lost (the loop header, the declarations of TerminateSum, Sum2 and Terminate and the
	// start of a second loop are missing). It is kept exactly as found and must be restored
	// from the original source before this function can compile.
	for(size_t i=0;i= TerminateSum) {
			Terminate = i;
			break;
		}
	}
	Sum2 = 0;
	// Print one line per block length up to and including Terminate, with a running total.
	for(size_t i=0;i<=Terminate;++i) {
		Sum2 += mBasicBlockLengths[i];
		aStream << " " << DecPrinter(i) << " " << Percentage(mBasicBlockLengths[i],Sum) << " - cumulative: " << Percentage(Sum2,Sum) << endl;
	}
}

// ---------------------------------------------------------------------------
// ComputeResource_c
// ---------------------------------------------------------------------------

// Advances time by aAmount: the use-map is shifted towards index 0 by aAmount slots and
// the vacated slots at the end are marked free.
void ComputeResource_c::Tick(uint32_t aAmount) {
	if (aAmount == 0) return; // Nothing moves
	if (aAmount < mUseMap.size()) {
		std::copy(mUseMap.begin() + aAmount, mUseMap.end(), mUseMap.begin());
		std::fill(mUseMap.end() - aAmount, mUseMap.end(), false);
	} else {
		// Everything that was scheduled is in the past by now
		std::fill(mUseMap.begin(), mUseMap.end(), false);
	}
}

// Reserves the resource for aDuration slots starting aTimeInFuture slots from now.
// If a slot in the requested window is already taken, the reservation is moved later and
// the difference is reported to the parent as a stall.
void ComputeResource_c::Use(uint32_t aTimeInFuture, uint32_t aDuration) {
	CRAY_ASSERT(aDuration > 0);
	// First let's see if the requested block is available.
	// The window is scanned from its end towards its start using a half-open upper bound:
	// with an unsigned index, a 'Idx >= aTimeInFuture; --Idx' loop wraps around when
	// aTimeInFuture is 0 or the map is empty and reads outside of the map.
	const size_t RequestedStart = aTimeInFuture;
	const size_t ScanEnd = std::min(size_t(mUseMap.size()), RequestedStart + size_t(aDuration));
	size_t TrueStartTime = RequestedStart;
	for(size_t Idx = ScanEnd; Idx > RequestedStart; --Idx) {
		if (mUseMap[Idx-1]) {
			// NOTE(review): the new use starts ON the latest busy slot found, not after it,
			// and slots beyond the requested window are not checked - confirm that this
			// is the intended model.
			TrueStartTime = Idx-1;
			break;
		}
	}
	CRAY_ASSERT(TrueStartTime >= aTimeInFuture);
	// Record the new use-time, resizing the array if needed
	size_t TrueEndTime = TrueStartTime + aDuration - 1;
	if (TrueEndTime >= mUseMap.size()) mUseMap.resize(TrueEndTime+1);
	std::fill(mUseMap.begin() + TrueStartTime, mUseMap.begin() + TrueEndTime + 1, true);
	// Figure out if we needed to stall, if we did, report the stall to parent.
	// This has to happen after the slots are marked: Pipeline_c::Stall() ticks its registered
	// resources (presumably including this one), which shifts the map by the stall amount.
	size_t Stall = TrueStartTime-aTimeInFuture;
	if (Stall > 0) {
		mParent.Stall(static_cast<uint32_t>(Stall));
		mStallCount += Stall;
	}
	mUseCount += aDuration;
}

// Writes the stall and usage counters of this compute resource to aStream.
void ComputeResource_c::Print(ostream &aStream) const {
	aStream << "Compute resource: " << mName << endl;
	aStream << "===========================" << endl;
	aStream << " Number of stalls: " << DecPrinter(mStallCount) << endl;
	aStream << " Number of active cycles (uses): " << DecPrinter(mUseCount) << endl;
}

// ---------------------------------------------------------------------------
// StorageResource_c
// ---------------------------------------------------------------------------

// Advances time by aAmount: the pending read/write times move closer by aAmount,
// saturating at 0.
void StorageResource_c::Tick(uint32_t aAmount) {
	mNextRead = (mNextRead > aAmount) ? mNextRead - aAmount : 0;
	mNextWrite = (mNextWrite > aAmount) ? mNextWrite - aAmount : 0;
}

// Schedules a read aTimeInFuture ticks from now (must be > 0). Read-after-read overlaps
// are only counted; a read-after-write overlap stalls the parent until after the write.
void StorageResource_c::Read(uint32_t aTimeInFuture) {
	CRAY_ASSERT(aTimeInFuture > 0);
	uint32_t TrueReadTime = aTimeInFuture;
	// Look for read-after-read hazards
	if (aTimeInFuture < mNextRead) {
		++mReadAfterReadCount;
		// We don't have to order reads of the same resource, so this doesn't generate any stalls
	}
	// Look for read-after-write hazards
	if (aTimeInFuture <= mNextWrite) {
		size_t Stall = mNextWrite - aTimeInFuture+1;
		mParent.Stall(Stall);
		++mReadAfterWriteCount;
		mReadAfterWriteStallCount += Stall;
		mStallCount += Stall;
		TrueReadTime = mNextWrite + 1;
		// Holds when the stall above has ticked this resource (i.e. it is registered with
		// the parent), because Tick() has then already moved mNextWrite closer by Stall.
		CRAY_ASSERT(TrueReadTime == aTimeInFuture);
	}
	++mReadCount;
	mNextRead = max(mNextRead, TrueReadTime);
}

// Schedules a write aTimeInFuture ticks from now (must be > 0). Write-after-write and
// write-after-read overlaps each stall the parent until the earlier access is over.
void StorageResource_c::Write(uint32_t aTimeInFuture) {
	CRAY_ASSERT(aTimeInFuture > 0);
	uint32_t TrueWriteTime = aTimeInFuture;
	// Look for write-after-write hazards
	if (TrueWriteTime <= mNextWrite) {
		size_t Stall = mNextWrite - TrueWriteTime+1;
		mParent.Stall(Stall);
		++mWriteAfterWriteCount;
		mWriteAfterWriteStallCount += Stall;
		mStallCount += Stall;
		TrueWriteTime = mNextWrite + 1;
		// See Read(): relies on the stall having ticked this resource
		CRAY_ASSERT(TrueWriteTime == aTimeInFuture);
	}
	// Look for write-after-read hazards
	if (TrueWriteTime <= mNextRead) {
		size_t Stall = mNextRead - TrueWriteTime+1;
		mParent.Stall(Stall);
		++mWriteAfterReadCount;
		mWriteAfterReadStallCount += Stall;
		mStallCount += Stall;
		TrueWriteTime = mNextRead + 1;
		CRAY_ASSERT(TrueWriteTime == aTimeInFuture);
	}
	++mWriteCount;
	mNextWrite = max(mNextWrite, TrueWriteTime);
}

// Writes the access, hazard and stall counters of this storage resource to aStream.
void StorageResource_c::Print(ostream &aStream) const {
	aStream << "Storage resource: " << mName << endl;
	aStream << "===========================" << endl;
	aStream << " Number of stalls: " << DecPrinter(mStallCount) << endl;
	aStream << " Number of reads: " << DecPrinter(mReadCount) << endl;
	aStream << " Number of writes: " << DecPrinter(mWriteCount) << endl;
	aStream << " Number of read-after-read hazards: " << DecPrinter(mReadAfterReadCount) << endl;
	aStream << " Number of read-after-write hazards: " << DecPrinter(mReadAfterWriteCount) << endl;
	aStream << " Number of write-after-read hazards: " << DecPrinter(mWriteAfterReadCount) << endl;
	aStream << " Number of write-after-write hazards: " << DecPrinter(mWriteAfterWriteCount) << endl;
	aStream << " Number of stalls due to read-after-read hazards: " << DecPrinter(mReadAfterReadStallCount) << endl;
	aStream << " Number of stalls due to read-after-write hazards: " << DecPrinter(mReadAfterWriteStallCount) << endl;
	aStream << " Number of stalls due to write-after-read hazards: " << DecPrinter(mWriteAfterReadStallCount) << endl;
	aStream << " Number of stalls due to write-after-write hazards: " << DecPrinter(mWriteAfterWriteStallCount) << endl;
}

// ---------------------------------------------------------------------------
// MemoryModel_c
// ---------------------------------------------------------------------------

// The memory model keeps no per-tick state; aAmount is ignored.
void MemoryModel_c::Tick(uint32_t aAmount) { }

// Returns the latency of a read from aAddr and updates the tag state and the read counters.
// A hit costs mTagLatency; a miss additionally pays for the line fill and, when a dirty
// line is evicted, for its write-back.
uint32_t MemoryModel_c::ReadLatency(uint32_t aAddr) {
	// Test for disabled caches
	if (mTag.size() == 0) return ReadLatency();
	++mReadCount;
	uint32_t TagAddr = GetTagAddr(aAddr);
	uint32_t TagIdx = GetTagIdx(aAddr);
	TagEntry_s *HitEntry = nullptr;
	TagEntry_s *EntryToAllocate = &mTag[0][TagIdx];
	// Update timestamps
	// NOTE(review): the text between 'Way' and 'EntryToAllocate->TimeStamp' on the next line
	// appears to have been lost (the rest of the loop header and the start of the loop body,
	// presumably including the hit detection that sets HitEntry). It is kept exactly as found
	// and must be restored from the original source before this function can compile.
	for(uint32_t Way=0;Way EntryToAllocate->TimeStamp) && EntryToAllocate->Valid) {
				EntryToAllocate = &Entry;
			}
		} else {
			EntryToAllocate = &Entry;
		}
	}
	// Handle hits
	if (HitEntry != nullptr) {
		CRAY_ASSERT(HitEntry->Valid);
		CRAY_ASSERT(HitEntry->Addr == TagAddr);
		HitEntry->TimeStamp = 0; // Reset timestamp for the way that hit
		mReadDelayCount += mTagLatency;
		return mTagLatency;
	}
	uint32_t Delay = mTagLatency;
	// Handle misses
	if (EntryToAllocate->Valid) {
		++mEvictionCount;
		if (EntryToAllocate->Dirty) {
			// Dirty lines are only expected when write-back is enabled
			CRAY_ASSERT(mWriteBack);
			Delay += mWriteLatency + mLineSize / mBurstReadRate;
		}
		// cout << "Evicting cache line: " << HexPrinter(aAddr) << endl;
	}
	// Claim the selected entry for the new line
	EntryToAllocate->Valid = true;
	EntryToAllocate->Dirty = false;
	EntryToAllocate->Addr = TagAddr;
	EntryToAllocate->TimeStamp = 0;
	++mMissCount;
	Delay += mReadLatency + mLineSize / mBurstReadRate;
	mReadDelayCount += Delay;
	return Delay;
}

// NOTE(review): the rest of this function lies beyond this point; the part below is
// reproduced unchanged apart from line breaks.
uint32_t MemoryModel_c::WriteLatency(uint32_t aAddr) {
	// We implement write-through, so we only invalidate the hitting line
	++mWriteCount;
	if (mTag.size() == 0) {
		mWriteDelayCount += mWriteLatency;
		return mWriteLatency;
	}
	if (!mInvalidateOnWrite & !mAllocateOnWrite & !mWriteBack) {
		mWriteDelayCount += mWriteLatency;
		return mWriteLatency;
	}
	uint32_t TagAddr = GetTagAddr(aAddr);
	uint32_t TagIdx = GetTagIdx(aAddr);
	TagEntry_s *HitEntry = nullptr;
	TagEntry_s
*EntryToAllocate = &mTag[0][TagIdx]; if (mAllocateOnWrite || mWriteBack) { for(uint32_t Way=0;Way EntryToAllocate->TimeStamp) && EntryToAllocate->Valid) { EntryToAllocate = &Entry; } } else { EntryToAllocate = &Entry; } } if (HitEntry != nullptr) { if (mWriteBack) { HitEntry->Dirty = true; uint32_t Delay = mTagLatency; mWriteDelayCount += Delay; return Delay; } else { // In case of write allocate, we have a hit, but still have to do the write uint32_t Delay = mWriteLatency; mWriteDelayCount += Delay; return Delay; } } else { if (EntryToAllocate->Valid) { ++mEvictionCount; // cout << "Evicting cache line: " << HexPrinter(aAddr) << endl; } EntryToAllocate->Valid = true; EntryToAllocate->Addr = TagAddr; EntryToAllocate->TimeStamp = 0; ++mWriteAllocateCount; if (mWriteBack) { EntryToAllocate->Dirty = true; uint32_t Delay = mTagLatency + mReadLatency + mLineSize / mBurstReadRate; mWriteDelayCount += Delay; return Delay; } else { uint32_t Delay = max(mWriteLatency, mTagLatency) + mReadLatency + mLineSize / mBurstReadRate; mWriteDelayCount += Delay; return Delay; } } } if (mInvalidateOnWrite) { for(uint32_t Way=0;Way