// Copyright 2009 The Go Authors. All rights reserved.// Use of this source code is governed by a BSD-style// license that can be found in the LICENSE file.// Malloc profiling.// Patterned after tcmalloc's algorithms; shorter code.package runtimeimport ()// NOTE(rsc): Everything here could use cas if contention became an issue.varproflockmutex// All memory allocations are local and do not escape outside of the profiler.// The profiler is forbidden from referring to garbage-collected memory.const (// profile typesmemProfilebucketType = 1 + iotablockProfilemutexProfile// size of bucket hash tablebuckHashSize = 179999// max depth of stack to record in bucketmaxStack = 32)typebucketTypeint// A bucket holds per-call-stack profiling information.// The representation is a bit sleazy, inherited from C.// This struct defines the bucket header. It is followed in// memory by the stack words and then the actual record// data, either a memRecord or a blockRecord.//// Per-call-stack profiling information.// Lookup by hashing call stack into a linked-list hash table.//// No heap pointers.////go:notinheaptypebucketstruct { next *bucket allnext *bucket typ bucketType// memBucket or blockBucket (includes mutexProfile) hash uintptr size uintptr nstk uintptr}// A memRecord is the bucket data for a bucket of type memProfile,// part of the memory profile.typememRecordstruct {// The following complex 3-stage scheme of stats accumulation // is required to obtain a consistent picture of mallocs and frees // for some point in time. // The problem is that mallocs come in real time, while frees // come only after a GC during concurrent sweeping. So if we would // naively count them, we would get a skew toward mallocs. // // Hence, we delay information to get consistent snapshots as // of mark termination. Allocations count toward the next mark // termination's snapshot, while sweep frees count toward the // previous mark termination's snapshot: // // MT MT MT MT // .·| .·| .·| .·| // .·˙ | .·˙ | .·˙ | .·˙ | // .·˙ | .·˙ | .·˙ | .·˙ | // .·˙ |.·˙ |.·˙ |.·˙ | // // alloc → ▲ ← free // ┠┅┅┅┅┅┅┅┅┅┅┅P // C+2 → C+1 → C // // alloc → ▲ ← free // ┠┅┅┅┅┅┅┅┅┅┅┅P // C+2 → C+1 → C // // Since we can't publish a consistent snapshot until all of // the sweep frees are accounted for, we wait until the next // mark termination ("MT" above) to publish the previous mark // termination's snapshot ("P" above). To do this, allocation // and free events are accounted to *future* heap profile // cycles ("C+n" above) and we only publish a cycle once all // of the events from that cycle must be done. Specifically: // // Mallocs are accounted to cycle C+2. // Explicit frees are accounted to cycle C+2. // GC frees (done during sweeping) are accounted to cycle C+1. // // After mark termination, we increment the global heap // profile cycle counter and accumulate the stats from cycle C // into the active profile.// active is the currently published profile. A profiling // cycle can be accumulated into active once its complete. active memRecordCycle// future records the profile events we're counting for cycles // that have not yet been published. This is ring buffer // indexed by the global heap profile cycle C and stores // cycles C, C+1, and C+2. Unlike active, these counts are // only for a single cycle; they are not cumulative across // cycles. // // We store cycle C here because there's a window between when // C becomes the active cycle and when we've flushed it to // active. future [3]memRecordCycle}// memRecordCycletypememRecordCyclestruct { allocs, frees uintptr alloc_bytes, free_bytes uintptr}// add accumulates b into a. It does not zero b.func ( *memRecordCycle) ( *memRecordCycle) { .allocs += .allocs .frees += .frees .alloc_bytes += .alloc_bytes .free_bytes += .free_bytes}// A blockRecord is the bucket data for a bucket of type blockProfile,// which is used in blocking and mutex profiles.typeblockRecordstruct { count int64 cycles int64}var (mbuckets *bucket// memory profile bucketsbbuckets *bucket// blocking profile bucketsxbuckets *bucket// mutex profile bucketsbuckhash *[179999]*bucketbucketmemuintptrmProfstruct {// All fields in mProf are protected by proflock.// cycle is the global heap profile cycle. This wraps // at mProfCycleWrap. cycle uint32// flushed indicates that future[cycle] in all buckets // has been flushed to the active profile. flushed bool })constmProfCycleWrap = uint32(len(memRecord{}.future)) * (2 << 24)// newBucket allocates a bucket with the given type and number of stack entries.func ( bucketType, int) *bucket { := unsafe.Sizeof(bucket{}) + uintptr()*unsafe.Sizeof(uintptr(0))switch {default:throw("invalid profile bucket type")casememProfile: += unsafe.Sizeof(memRecord{})caseblockProfile, mutexProfile: += unsafe.Sizeof(blockRecord{}) } := (*bucket)(persistentalloc(, 0, &memstats.buckhash_sys))bucketmem += .typ = .nstk = uintptr()return}// stk returns the slice in b holding the stack.func ( *bucket) () []uintptr { := (*[maxStack]uintptr)(add(unsafe.Pointer(), unsafe.Sizeof(*)))return [:.nstk:.nstk]}// mp returns the memRecord associated with the memProfile bucket b.func ( *bucket) () *memRecord {if .typ != memProfile {throw("bad use of bucket.mp") } := add(unsafe.Pointer(), unsafe.Sizeof(*)+.nstk*unsafe.Sizeof(uintptr(0)))return (*memRecord)()}// bp returns the blockRecord associated with the blockProfile bucket b.func ( *bucket) () *blockRecord {if .typ != blockProfile && .typ != mutexProfile {throw("bad use of bucket.bp") } := add(unsafe.Pointer(), unsafe.Sizeof(*)+.nstk*unsafe.Sizeof(uintptr(0)))return (*blockRecord)()}// Return the bucket for stk[0:nstk], allocating new bucket if needed.func ( bucketType, uintptr, []uintptr, bool) *bucket {ifbuckhash == nil {buckhash = (*[buckHashSize]*bucket)(sysAlloc(unsafe.Sizeof(*buckhash), &memstats.buckhash_sys))ifbuckhash == nil {throw("runtime: cannot allocate memory") } }// Hash stack.varuintptrfor , := range { += += << 10 ^= >> 6 }// hash in size += += << 10 ^= >> 6// finalize += << 3 ^= >> 11 := int( % buckHashSize)for := buckhash[]; != nil; = .next {if .typ == && .hash == && .size == && eqslice(.stk(), ) {return } }if ! {returnnil }// Create new bucket. := newBucket(, len())copy(.stk(), ) .hash = .size = .next = buckhash[]buckhash[] = if == memProfile { .allnext = mbucketsmbuckets = } elseif == mutexProfile { .allnext = xbucketsxbuckets = } else { .allnext = bbucketsbbuckets = }return}func (, []uintptr) bool {iflen() != len() {returnfalse }for , := range {if != [] {returnfalse } }returntrue}// mProf_NextCycle publishes the next heap profile cycle and creates a// fresh heap profile cycle. This operation is fast and can be done// during STW. The caller must call mProf_Flush before calling// mProf_NextCycle again.//// This is called by mark termination during STW so allocations and// frees after the world is started again count towards a new heap// profiling cycle.func () {lock(&proflock)// We explicitly wrap mProf.cycle rather than depending on // uint wraparound because the memRecord.future ring does not // itself wrap at a power of two.mProf.cycle = (mProf.cycle + 1) % mProfCycleWrapmProf.flushed = falseunlock(&proflock)}// mProf_Flush flushes the events from the current heap profiling// cycle into the active profile. After this it is safe to start a new// heap profiling cycle with mProf_NextCycle.//// This is called by GC after mark termination starts the world. In// contrast with mProf_NextCycle, this is somewhat expensive, but safe// to do concurrently.func () {lock(&proflock)if !mProf.flushed {mProf_FlushLocked()mProf.flushed = true }unlock(&proflock)}func () { := mProf.cyclefor := mbuckets; != nil; = .allnext { := .mp()// Flush cycle C into the published profile and clear // it for reuse. := &.future[%uint32(len(.future))] .active.add() * = memRecordCycle{} }}// mProf_PostSweep records that all sweep frees for this GC cycle have// completed. This has the effect of publishing the heap profile// snapshot as of the last mark termination without advancing the heap// profile cycle.func () {lock(&proflock)// Flush cycle C+1 to the active profile so everything as of // the last mark termination becomes visible. *Don't* advance // the cycle, since we're still accumulating allocs in cycle // C+2, which have to become C+1 in the next mark termination // and so on. := mProf.cyclefor := mbuckets; != nil; = .allnext { := .mp() := &.future[(+1)%uint32(len(.future))] .active.add() * = memRecordCycle{} }unlock(&proflock)}// Called by malloc to record a profiled block.func ( unsafe.Pointer, uintptr) {var [maxStack]uintptr := callers(4, [:])lock(&proflock) := stkbucket(memProfile, , [:], true) := mProf.cycle := .mp() := &.future[(+2)%uint32(len(.future))] .allocs++ .alloc_bytes += unlock(&proflock)// Setprofilebucket locks a bunch of other mutexes, so we call it outside of proflock. // This reduces potential contention and chances of deadlocks. // Since the object must be alive during call to mProf_Malloc, // it's fine to do this non-atomically.systemstack(func() {setprofilebucket(, ) })}// Called when freeing a profiled block.func ( *bucket, uintptr) {lock(&proflock) := mProf.cycle := .mp() := &.future[(+1)%uint32(len(.future))] .frees++ .free_bytes += unlock(&proflock)}varblockprofilerateuint64// in CPU ticks// SetBlockProfileRate controls the fraction of goroutine blocking events// that are reported in the blocking profile. The profiler aims to sample// an average of one blocking event per rate nanoseconds spent blocked.//// To include every blocking event in the profile, pass rate = 1.// To turn off profiling entirely, pass rate <= 0.func ( int) {varint64if <= 0 { = 0// disable profiling } elseif == 1 { = 1// profile everything } else {// convert ns to cycles, use float64 to prevent overflow during multiplication = int64(float64() * float64(tickspersecond()) / (1000 * 1000 * 1000))if == 0 { = 1 } }atomic.Store64(&blockprofilerate, uint64())}func ( int64, int) {if <= 0 { = 1 }ifblocksampled() {saveblockevent(, +1, blockProfile) }}func ( int64) bool { := int64(atomic.Load64(&blockprofilerate))if <= 0 || ( > && int64(fastrand())% > ) {returnfalse }returntrue}func ( int64, int, bucketType) { := getg()varintvar [maxStack]uintptrif .m.curg == nil || .m.curg == { = callers(, [:]) } else { = gcallers(.m.curg, , [:]) }lock(&proflock) := stkbucket(, 0, [:], true) .bp().count++ .bp().cycles += unlock(&proflock)}varmutexprofilerateuint64// fraction sampled// SetMutexProfileFraction controls the fraction of mutex contention events// that are reported in the mutex profile. On average 1/rate events are// reported. The previous rate is returned.//// To turn off profiling entirely, pass rate 0.// To just read the current rate, pass rate < 0.// (For n>1 the details of sampling may change.)func ( int) int {if < 0 {returnint(mutexprofilerate) } := mutexprofilerateatomic.Store64(&mutexprofilerate, uint64())returnint()}//go:linkname mutexevent sync.eventfunc ( int64, int) {if < 0 { = 0 } := int64(atomic.Load64(&mutexprofilerate))// TODO(pjw): measure impact of always calling fastrand vs using something // like malloc.go:nextSample()if > 0 && int64(fastrand())% == 0 {saveblockevent(, +1, mutexProfile) }}// Go interface to profile data.// A StackRecord describes a single execution stack.typeStackRecordstruct { Stack0 [32]uintptr// stack trace for this record; ends at first 0 entry}// Stack returns the stack trace associated with the record,// a prefix of r.Stack0.func ( *StackRecord) () []uintptr {for , := range .Stack0 {if == 0 {return .Stack0[0:] } }return .Stack0[0:]}// MemProfileRate controls the fraction of memory allocations// that are recorded and reported in the memory profile.// The profiler aims to sample an average of// one allocation per MemProfileRate bytes allocated.//// To include every allocated block in the profile, set MemProfileRate to 1.// To turn off profiling entirely, set MemProfileRate to 0.//// The tools that process the memory profiles assume that the// profile rate is constant across the lifetime of the program// and equal to the current value. Programs that change the// memory profiling rate should do so just once, as early as// possible in the execution of the program (for example,// at the beginning of main).varMemProfileRateint = 512 * 1024// A MemProfileRecord describes the live objects allocated// by a particular call sequence (stack trace).typeMemProfileRecordstruct { AllocBytes, FreeBytes int64// number of bytes allocated, freed AllocObjects, FreeObjects int64// number of objects allocated, freed Stack0 [32]uintptr// stack trace for this record; ends at first 0 entry}// InUseBytes returns the number of bytes in use (AllocBytes - FreeBytes).func ( *MemProfileRecord) () int64 { return .AllocBytes - .FreeBytes }// InUseObjects returns the number of objects in use (AllocObjects - FreeObjects).func ( *MemProfileRecord) () int64 {return .AllocObjects - .FreeObjects}// Stack returns the stack trace associated with the record,// a prefix of r.Stack0.func ( *MemProfileRecord) () []uintptr {for , := range .Stack0 {if == 0 {return .Stack0[0:] } }return .Stack0[0:]}// MemProfile returns a profile of memory allocated and freed per allocation// site.//// MemProfile returns n, the number of records in the current memory profile.// If len(p) >= n, MemProfile copies the profile into p and returns n, true.// If len(p) < n, MemProfile does not change p and returns n, false.//// If inuseZero is true, the profile includes allocation records// where r.AllocBytes > 0 but r.AllocBytes == r.FreeBytes.// These are sites where memory was allocated, but it has all// been released back to the runtime.//// The returned profile may be up to two garbage collection cycles old.// This is to avoid skewing the profile toward allocations; because// allocations happen in real time but frees are delayed until the garbage// collector performs sweeping, the profile only accounts for allocations// that have had a chance to be freed by the garbage collector.//// Most clients should use the runtime/pprof package or// the testing package's -test.memprofile flag instead// of calling MemProfile directly.func ( []MemProfileRecord, bool) ( int, bool) {lock(&proflock)// If we're between mProf_NextCycle and mProf_Flush, take care // of flushing to the active profile so we only have to look // at the active profile below.mProf_FlushLocked() := truefor := mbuckets; != nil; = .allnext { := .mp()if || .active.alloc_bytes != .active.free_bytes { ++ }if .active.allocs != 0 || .active.frees != 0 { = false } }if {// Absolutely no data, suggesting that a garbage collection // has not yet happened. In order to allow profiling when // garbage collection is disabled from the beginning of execution, // accumulate all of the cycles, and recount buckets. = 0for := mbuckets; != nil; = .allnext { := .mp()for := range .future { .active.add(&.future[]) .future[] = memRecordCycle{} }if || .active.alloc_bytes != .active.free_bytes { ++ } } }if <= len() { = true := 0for := mbuckets; != nil; = .allnext { := .mp()if || .active.alloc_bytes != .active.free_bytes {record(&[], ) ++ } } }unlock(&proflock)return}// Write b's data to r.func ( *MemProfileRecord, *bucket) { := .mp() .AllocBytes = int64(.active.alloc_bytes) .FreeBytes = int64(.active.free_bytes) .AllocObjects = int64(.active.allocs) .FreeObjects = int64(.active.frees)ifraceenabled {racewriterangepc(unsafe.Pointer(&.Stack0[0]), unsafe.Sizeof(.Stack0), getcallerpc(), funcPC(MemProfile)) }ifmsanenabled {msanwrite(unsafe.Pointer(&.Stack0[0]), unsafe.Sizeof(.Stack0)) }copy(.Stack0[:], .stk())for := int(.nstk); < len(.Stack0); ++ { .Stack0[] = 0 }}func ( func(*bucket, uintptr, *uintptr, uintptr, uintptr, uintptr)) {lock(&proflock)for := mbuckets; != nil; = .allnext { := .mp() (, .nstk, &.stk()[0], .size, .active.allocs, .active.frees) }unlock(&proflock)}// BlockProfileRecord describes blocking events originated// at a particular call sequence (stack trace).typeBlockProfileRecordstruct { Count int64 Cycles int64StackRecord}// BlockProfile returns n, the number of records in the current blocking profile.// If len(p) >= n, BlockProfile copies the profile into p and returns n, true.// If len(p) < n, BlockProfile does not change p and returns n, false.//// Most clients should use the runtime/pprof package or// the testing package's -test.blockprofile flag instead// of calling BlockProfile directly.func ( []BlockProfileRecord) ( int, bool) {lock(&proflock)for := bbuckets; != nil; = .allnext { ++ }if <= len() { = truefor := bbuckets; != nil; = .allnext { := .bp() := &[0] .Count = .count .Cycles = .cyclesifraceenabled {racewriterangepc(unsafe.Pointer(&.Stack0[0]), unsafe.Sizeof(.Stack0), getcallerpc(), funcPC()) }ifmsanenabled {msanwrite(unsafe.Pointer(&.Stack0[0]), unsafe.Sizeof(.Stack0)) } := copy(.Stack0[:], .stk())for ; < len(.Stack0); ++ { .Stack0[] = 0 } = [1:] } }unlock(&proflock)return}// MutexProfile returns n, the number of records in the current mutex profile.// If len(p) >= n, MutexProfile copies the profile into p and returns n, true.// Otherwise, MutexProfile does not change p, and returns n, false.//// Most clients should use the runtime/pprof package// instead of calling MutexProfile directly.func ( []BlockProfileRecord) ( int, bool) {lock(&proflock)for := xbuckets; != nil; = .allnext { ++ }if <= len() { = truefor := xbuckets; != nil; = .allnext { := .bp() := &[0] .Count = int64(.count) .Cycles = .cycles := copy(.Stack0[:], .stk())for ; < len(.Stack0); ++ { .Stack0[] = 0 } = [1:] } }unlock(&proflock)return}// ThreadCreateProfile returns n, the number of records in the thread creation profile.// If len(p) >= n, ThreadCreateProfile copies the profile into p and returns n, true.// If len(p) < n, ThreadCreateProfile does not change p and returns n, false.//// Most clients should use the runtime/pprof package instead// of calling ThreadCreateProfile directly.func ( []StackRecord) ( int, bool) { := (*m)(atomic.Loadp(unsafe.Pointer(&allm)))for := ; != nil; = .alllink { ++ }if <= len() { = true := 0for := ; != nil; = .alllink { [].Stack0 = .createstack ++ } }return}//go:linkname runtime_goroutineProfileWithLabels runtime/pprof.runtime_goroutineProfileWithLabelsfunc ( []StackRecord, []unsafe.Pointer) ( int, bool) {returngoroutineProfileWithLabels(, )}// labels may be nil. If labels is non-nil, it must have the same length as p.func ( []StackRecord, []unsafe.Pointer) ( int, bool) {if != nil && len() != len() { = nil } := getg() := func( *g) bool {// Checking isSystemGoroutine here makes GoroutineProfile // consistent with both NumGoroutine and Stack.return != && readgstatus() != _Gdead && !isSystemGoroutine(, false) }stopTheWorld("profile") = 1for , := rangeallgs {if () { ++ } }if <= len() { = true , := , // Save current goroutine. := getcallersp() := getcallerpc()systemstack(func() {saveg(, , , &[0]) }) = [1:]// If we have a place to put our goroutine labelmap, insert it there.if != nil { [0] = .labels = [1:] }// Save other goroutines.for , := rangeallgs {if () {iflen() == 0 {// Should be impossible, but better to return a // truncated profile than to crash the entire process.break }saveg(^uintptr(0), ^uintptr(0), , &[0])if != nil { [0] = .labels = [1:] } = [1:] } } }startTheWorld()return , }// GoroutineProfile returns n, the number of records in the active goroutine stack profile.// If len(p) >= n, GoroutineProfile copies the profile into p and returns n, true.// If len(p) < n, GoroutineProfile does not change p and returns n, false.//// Most clients should use the runtime/pprof package instead// of calling GoroutineProfile directly.func ( []StackRecord) ( int, bool) {returngoroutineProfileWithLabels(, nil)}func (, uintptr, *g, *StackRecord) { := gentraceback(, , 0, , 0, &.Stack0[0], len(.Stack0), nil, nil, 0)if < len(.Stack0) { .Stack0[] = 0 }}// Stack formats a stack trace of the calling goroutine into buf// and returns the number of bytes written to buf.// If all is true, Stack formats stack traces of all other goroutines// into buf after the trace for the current goroutine.func ( []byte, bool) int {if {stopTheWorld("stack trace") } := 0iflen() > 0 { := getg() := getcallersp() := getcallerpc()systemstack(func() { := getg()// Force traceback=1 to override GOTRACEBACK setting, // so that Stack's results are consistent. // GOTRACEBACK is only about crash dumps. .m.traceback = 1 .writebuf = [0:0:len()]goroutineheader()traceback(, , 0, )if {tracebackothers() } .m.traceback = 0 = len(.writebuf) .writebuf = nil }) }if {startTheWorld() }return}// Tracing of alloc/free/gc.vartracelockmutexfunc ( unsafe.Pointer, uintptr, *_type) {lock(&tracelock) := getg() .m.traceback = 2if == nil {print("tracealloc(", , ", ", hex(), ")\n") } else {print("tracealloc(", , ", ", hex(), ", ", .string(), ")\n") }if .m.curg == nil || == .m.curg {goroutineheader() := getcallerpc() := getcallersp()systemstack(func() {traceback(, , 0, ) }) } else {goroutineheader(.m.curg)traceback(^uintptr(0), ^uintptr(0), 0, .m.curg) }print("\n") .m.traceback = 0unlock(&tracelock)}func ( unsafe.Pointer, uintptr) {lock(&tracelock) := getg() .m.traceback = 2print("tracefree(", , ", ", hex(), ")\n")goroutineheader() := getcallerpc() := getcallersp()systemstack(func() {traceback(, , 0, ) })print("\n") .m.traceback = 0unlock(&tracelock)}func () {lock(&tracelock) := getg() .m.traceback = 2print("tracegc()\n")// running on m->g0 stack; show all non-g0 goroutinestracebackothers()print("end tracegc\n")print("\n") .m.traceback = 0unlock(&tracelock)}