
Commit 4574a3c

runtime: don't usleep() in runqgrab()
The most recently goready()'d G on each P is given a special position in the
P's runqueue, p.runnext. Other Ps steal p.runnext only as a last resort, and
usleep(3) before doing so: findRunnable() => stealWork() => runqsteal() =>
runqgrab(). As documented in runqgrab(), this is to reduce thrashing of Gs
between Ps in cases where one goroutine wakes another and then "almost
immediately" blocks.

On Linux, usleep() is implemented by invoking the nanosleep system call.
Syscall timeouts in the Linux kernel are subject to timer slack, as documented
by the man page for syscall prctl, section "PR_SET_TIMERSLACK".
Experimentally, short timeouts can expect to expire 50 microseconds late
regardless of other system activity. Thus, on Linux, usleep(3) typically
sleeps for at least 53 microseconds, more than 17x longer than intended.

A P must be in the spinning state in order to attempt work-stealing. While at
least one P is spinning, wakep() will refuse to wake a new spinning P. One P
sleeping in runqgrab() thus prevents further threads from being woken in
response to e.g. goroutine wakeups *globally* (throughout the process). Futex
wake-to-wakeup latency is approximately 20 microseconds, so sleeping for 53
microseconds can significantly increase goroutine wakeup latency by delaying
thread wakeup.

Fix this by timestamping Gs when they are runqput() into p.runnext, and
causing runqgrab() to indicate to findRunnable() that it should loop if
p.runnext is not yet stealable.

Alternative fixes considered:

- osyield() on Linux as we do on a few other platforms. On Linux, osyield()
  is implemented by the sched_yield system call, which IIUC causes the
  calling thread to yield its timeslice to any thread on its runqueue that it
  would not preempt on wakeup, potentially introducing even larger latencies
  on busy systems. See also
  https://www.realworldtech.com/forum/?threadid=189711&curpostid=189752 for a
  case against sched_yield on semantic grounds.

- Replace the usleep() with a spin loop in-place. This tends to waste the
  spinning P's time, since it can't check other runqueues and the number of
  calls to runqgrab() - and therefore sleeps - is linear in the number of Ps.
  Empirically, it introduces regressions not observed in this change.

Unfortunately, this is a load-bearing bug. In programs with goroutines that
frequently wake up goroutines and then immediately block, this bug
significantly reduces overhead from useless thread wakeups in wakep(). In
golang.org/x/benchmarks, this manifests most clearly as regressions in
benchmark dustin_broadcast.

To avoid this regression, we need to intentionally throttle wakep() =>
acquirem(). Thus, this change also introduces a "need-wakep()" prediction
mechanism, which causes goready() and newproc() to call wakep() only if the
calling goroutine is predicted not to immediately block. To handle
mispredictions, sysmon is changed to wakep() if it detects underutilization.
The current prediction algorithm is simple, but appears to be effective; it
can be improved in the future as warranted.
Results from golang.org/x/benchmarks:
(Baseline is go1.20.1; experiment is go1.20.1 plus this change)

shortname: ajstarks_deck_generate
goos: linux
goarch: amd64
pkg: github.com/ajstarks/deck/generate
cpu: Intel(R) Xeon(R) W-2135 CPU @ 3.70GHz
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
Arc-12      3.857µ ± 5%  3.753µ ± 5%  ~ (p=0.424 n=10)
Polygon-12  7.074µ ± 6%  6.969µ ± 4%  ~ (p=0.190 n=10)
geomean     5.224µ       5.114µ       -2.10%

shortname: aws_jsonutil
pkg: github.com/aws/aws-sdk-go/private/protocol/json/jsonutil
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
BuildJSON-12   5.602µ ± 3%  5.600µ ± 2%  ~ (p=0.896 n=10)
StdlibJSON-12  3.843µ ± 2%  3.828µ ± 2%  ~ (p=0.224 n=10)
geomean        4.640µ       4.630µ       -0.22%

shortname: benhoyt_goawk_1_18
pkg: github.com/benhoyt/goawk/interp
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
RecursiveFunc-12        17.79µ ± 3%  17.65µ ± 3%  ~ (p=0.436 n=10)
RegexMatch-12           815.8n ± 4%  823.3n ± 1%  ~ (p=0.353 n=10)
RepeatExecProgram-12    21.30µ ± 6%  21.69µ ± 3%  ~ (p=0.052 n=10)
RepeatNew-12            79.21n ± 4%  79.73n ± 3%  ~ (p=0.529 n=10)
RepeatIOExecProgram-12  41.83µ ± 1%  42.07µ ± 2%  ~ (p=0.796 n=10)
RepeatIONew-12          1.195µ ± 3%  1.196µ ± 2%  ~ (p=1.000 n=10)
geomean                 3.271µ       3.288µ       +0.54%

shortname: bindata
pkg: github.com/kevinburke/go-bindata
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
Bindata-12  316.2m ± 5%  309.7m ± 4%  ~ (p=0.436 n=10)
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ B/s │ B/s vs base │
Bindata-12  20.71Mi ± 5%  21.14Mi ± 4%  ~ (p=0.436 n=10)
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ B/op │ B/op vs base │
Bindata-12  183.0Mi ± 0%  183.0Mi ± 0%  ~ (p=0.353 n=10)
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ allocs/op │ allocs/op vs base │
Bindata-12  5.790k ± 0%  5.789k ± 0%  ~ (p=0.358 n=10)

shortname: bloom_bloom
pkg: github.com/bits-and-blooms/bloom/v3
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
SeparateTestAndAdd-12  414.6n ± 4%  413.9n ± 2%  ~ (p=0.895 n=10)
CombinedTestAndAdd-12  425.8n ± 9%  419.8n ± 8%  ~ (p=0.353 n=10)
geomean                420.2n       416.9n       -0.78%

shortname: capnproto2
pkg: zombiezen.com/go/capnproto2
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
TextMovementBetweenSegments-12  320.5µ ± 5%  318.4µ ± 10%  ~ (p=0.579 n=10)
Growth_MultiSegment-12          13.63m ± 1%  13.87m ± 2%   +1.71% (p=0.029 n=10)
geomean                         2.090m       2.101m        +0.52%
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ B/s │ B/s vs base │
Growth_MultiSegment-12  73.35Mi ± 1%  72.12Mi ± 2%  -1.68% (p=0.027 n=10)
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ B/op │ B/op vs base │
Growth_MultiSegment-12  1.572Mi ± 0%  1.572Mi ± 0%  ~ (p=0.320 n=10)
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ allocs/op │ allocs/op vs base │
Growth_MultiSegment-12  21.00 ± 0%  21.00 ± 0%  ~ (p=1.000 n=10) ¹
¹ all samples are equal

shortname: cespare_mph
pkg: github.com/cespare/mph
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
Build-12  32.72m ± 2%  32.49m ± 1%  ~ (p=0.280 n=10)

shortname: commonmark_markdown
pkg: gitlab.com/golang-commonmark/markdown
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
RenderSpecNoHTML-12        10.09m ± 2%  10.18m ± 3%  ~ (p=0.796 n=10)
RenderSpec-12              10.19m ± 1%  10.11m ± 3%  ~ (p=0.684 n=10)
RenderSpecBlackFriday2-12  6.793m ± 5%  6.946m ± 2%  ~ (p=0.063 n=10)
geomean                    8.872m       8.944m       +0.81%

shortname: dustin_broadcast
pkg: github.com/dustin/go-broadcast
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
DirectSend-12          570.5n ± 7%  355.2n ± 2%  -37.74% (p=0.000 n=10)
ParallelDirectSend-12  549.0n ± 5%  360.9n ± 3%  -34.25% (p=0.000 n=10)
ParallelBrodcast-12    788.7n ± 2%  486.0n ± 4%  -38.37% (p=0.000 n=10)
MuxBrodcast-12         788.6n ± 4%  471.5n ± 6%  -40.21% (p=0.000 n=10)
geomean                664.4n       414.0n       -37.68%

shortname: dustin_humanize
pkg: github.com/dustin/go-humanize
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
ParseBigBytes-12  1.964µ ± 5%  1.941µ ± 3%  ~ (p=0.289 n=10)

shortname: ericlagergren_decimal
pkg: github.com/ericlagergren/decimal/benchmarks
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
Pi/foo=ericlagergren_(Go)/prec=100-12   147.5µ ± 2%  147.5µ ± 1%  ~ (p=0.912 n=10)
Pi/foo=ericlagergren_(GDA)/prec=100-12  329.6µ ± 1%  332.1µ ± 2%  ~ (p=0.063 n=10)
Pi/foo=shopspring/prec=100-12           680.5µ ± 4%  688.6µ ± 2%  ~ (p=0.481 n=10)
Pi/foo=apmckinlay/prec=100-12           2.541µ ± 4%  2.525µ ± 3%  ~ (p=0.218 n=10)
Pi/foo=go-inf/prec=100-12               169.5µ ± 3%  170.7µ ± 3%  ~ (p=0.218 n=10)
Pi/foo=float64/prec=100-12              4.136µ ± 3%  4.162µ ± 6%  ~ (p=0.436 n=10)
geomean                                 62.38µ       62.66µ       +0.45%

shortname: ethereum_bitutil
pkg: github.com/ethereum/go-ethereum/common/bitutil
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
FastTest2KB-12            130.4n ± 1%  131.5n ± 1%  ~ (p=0.093 n=10)
BaseTest2KB-12            624.8n ± 2%  983.0n ± 2%  +57.32% (p=0.000 n=10)
Encoding4KBVerySparse-12  21.48µ ± 3%  22.20µ ± 3%  +3.37% (p=0.005 n=10)
geomean                   1.205µ       1.421µ       +17.94%
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ B/op │ B/op vs base │
Encoding4KBVerySparse-12  9.750Ki ± 0%  9.750Ki ± 0%  ~ (p=1.000 n=10) ¹
¹ all samples are equal
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ allocs/op │ allocs/op vs base │
Encoding4KBVerySparse-12  15.00 ± 0%  15.00 ± 0%  ~ (p=1.000 n=10) ¹
¹ all samples are equal

shortname: ethereum_core
pkg: github.com/ethereum/go-ethereum/core
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
PendingDemotion10000-12        96.72n ± 4%  98.55n ± 2%  ~ (p=0.055 n=10)
FuturePromotion10000-12        2.128n ± 3%  2.093n ± 3%  ~ (p=0.896 n=10)
PoolBatchInsert10000-12        642.6m ± 2%  642.1m ± 5%  ~ (p=0.796 n=10)
PoolBatchLocalInsert10000-12   805.2m ± 2%  826.6m ± 4%  ~ (p=0.105 n=10)
geomean                        101.6µ       102.3µ       +0.69%

shortname: ethereum_corevm
pkg: github.com/ethereum/go-ethereum/core/vm
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
OpDiv128-12  137.4n ± 3%  139.5n ± 1%  +1.56% (p=0.024 n=10)

shortname: ethereum_ecies
pkg: github.com/ethereum/go-ethereum/crypto/ecies
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
GenerateKeyP256-12   15.67µ ± 6%  15.66µ ± 3%  ~ (p=0.971 n=10)
GenSharedKeyP256-12  51.09µ ± 6%  52.09µ ± 4%  ~ (p=0.631 n=10)
GenSharedKeyS256-12  47.24µ ± 2%  46.67µ ± 3%  ~ (p=0.247 n=10)
geomean              33.57µ       33.64µ       +0.21%

shortname: ethereum_ethash
pkg: github.com/ethereum/go-ethereum/consensus/ethash
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
HashimotoLight-12  1.116m ± 5%  1.112m ± 2%  ~ (p=0.684 n=10)

shortname: ethereum_trie
pkg: github.com/ethereum/go-ethereum/trie
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
HashFixedSize/10K-12             9.236m ± 1%  9.106m ± 1%  -1.40% (p=0.019 n=10)
CommitAfterHashFixedSize/10K-12  19.60m ± 1%  19.51m ± 1%  ~ (p=0.796 n=10)
geomean                          13.45m       13.33m       -0.93%
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ B/op │ B/op vs base │
HashFixedSize/10K-12             6.036Mi ± 0%  6.037Mi ± 0%  ~ (p=0.247 n=10)
CommitAfterHashFixedSize/10K-12  8.626Mi ± 0%  8.626Mi ± 0%  ~ (p=0.280 n=10)
geomean                          7.216Mi       7.216Mi       +0.01%
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ allocs/op │ allocs/op vs base │
HashFixedSize/10K-12             77.17k ± 0%  77.17k ± 0%  ~ (p=0.050 n=10)
CommitAfterHashFixedSize/10K-12  79.99k ± 0%  79.99k ± 0%  ~ (p=0.391 n=10)
geomean                          78.56k       78.57k       +0.00%

shortname: gonum_blas_native
pkg: gonum.org/v1/gonum/blas/gonum
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
Dnrm2MediumPosInc-12      1.953µ ± 2%  1.940µ ± 5%  ~ (p=0.989 n=10)
DasumMediumUnitaryInc-12  932.5n ± 1%  931.2n ± 1%  ~ (p=0.753 n=10)
geomean                   1.349µ       1.344µ       -0.40%

shortname: gonum_community
pkg: gonum.org/v1/gonum/graph/community
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
LouvainDirectedMultiplex-12  26.40m ± 1%  26.64m ± 1%  ~ (p=0.165 n=10)

shortname: gonum_lapack_native
pkg: gonum.org/v1/gonum/lapack/gonum
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
Dgeev/Circulant10-12   41.97µ ± 6%  42.90µ ± 4%  ~ (p=0.143 n=10)
Dgeev/Circulant100-12  12.13m ± 4%  12.30m ± 3%  ~ (p=0.796 n=10)
geomean                713.4µ       726.4µ       +1.81%

shortname: gonum_mat
pkg: gonum.org/v1/gonum/mat
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
MulWorkspaceDense1000Hundredth-12  89.78m ± 0%   81.48m ± 1%   -9.24% (p=0.000 n=10)
ScaleVec10000Inc20-12              7.204µ ± 36%  8.450µ ± 35%  ~ (p=0.853 n=10)
geomean                            804.2µ        829.7µ        +3.18%

shortname: gonum_topo
pkg: gonum.org/v1/gonum/graph/topo
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
TarjanSCCGnp_10_tenth-12   7.251µ ± 1%  7.187µ ± 1%  -0.88% (p=0.025 n=10)
TarjanSCCGnp_1000_half-12  74.48m ± 2%  74.37m ± 4%  ~ (p=0.796 n=10)
geomean                    734.8µ       731.1µ       -0.51%

shortname: gonum_traverse
pkg: gonum.org/v1/gonum/graph/traverse
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
WalkAllBreadthFirstGnp_10_tenth-12    3.517µ ± 1%  3.534µ ± 1%  ~ (p=0.343 n=10)
WalkAllBreadthFirstGnp_1000_tenth-12  11.12m ± 6%  11.19m ± 2%  ~ (p=0.631 n=10)
geomean                               197.8µ       198.9µ       +0.54%

shortname: gtank_blake2s
pkg: github.com/gtank/blake2s
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
Hash8K-12  18.96µ ± 4%  18.82µ ± 5%  ~ (p=0.579 n=10)
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ B/s │ B/s vs base │
Hash8K-12  412.2Mi ± 4%  415.2Mi ± 5%  ~ (p=0.579 n=10)

shortname: hugo_hugolib
pkg: github.com/gohugoio/hugo/hugolib
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
MergeByLanguage-12           529.9n ± 1%  531.5n ± 2%  ~ (p=0.305 n=10)
ResourceChainPostProcess-12  62.76m ± 3%  56.23m ± 2%  -10.39% (p=0.000 n=10)
ReplaceShortcodeTokens-12    2.727µ ± 3%  2.701µ ± 7%  ~ (p=0.592 n=10)
geomean                      44.92µ       43.22µ       -3.80%

shortname: k8s_cache
pkg: k8s.io/client-go/tools/cache
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
Listener-12                 1.312µ ± 1%  1.199µ ± 1%  -8.62% (p=0.000 n=10)
ReflectorResyncChanMany-12  785.7n ± 4%  796.3n ± 3%  ~ (p=0.089 n=10)
geomean                     1.015µ       976.9n       -3.76%
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ B/op │ B/op vs base │
Listener-12  16.00 ± 0%  16.00 ± 0%  ~ (p=1.000 n=10) ¹
¹ all samples are equal
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ allocs/op │ allocs/op vs base │
Listener-12  1.000 ± 0%  1.000 ± 0%  ~ (p=1.000 n=10) ¹
¹ all samples are equal

shortname: k8s_workqueue
pkg: k8s.io/client-go/util/workqueue
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
ParallelizeUntil/pieces:1000,workers:10,chunkSize:1-12    244.6µ ± 1%  245.9µ ± 0%  +0.55% (p=0.023 n=10)
ParallelizeUntil/pieces:1000,workers:10,chunkSize:10-12   75.09µ ± 1%  63.54µ ± 1%  -15.37% (p=0.000 n=10)
ParallelizeUntil/pieces:1000,workers:10,chunkSize:100-12  49.47µ ± 2%  42.45µ ± 2%  -14.19% (p=0.000 n=10)
ParallelizeUntil/pieces:999,workers:10,chunkSize:13-12    68.51µ ± 1%  55.07µ ± 1%  -19.63% (p=0.000 n=10)
geomean                                                   88.82µ       77.74µ       -12.47%

shortname: kanzi
pkg: github.com/flanglet/kanzi-go/benchmark
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
BWTS-12  0.4479n ± 6%  0.4385n ± 7%  ~ (p=0.529 n=10)
FPAQ-12  17.03m ± 3%   17.42m ± 3%   ~ (p=0.123 n=10)
LZ-12    1.897m ± 2%   1.887m ± 4%   ~ (p=1.000 n=10)
MTFT-12  771.2µ ± 4%   785.8µ ± 3%   ~ (p=0.247 n=10)
geomean  57.79µ        58.01µ        +0.38%

shortname: minio
pkg: github.com/minio/minio/cmd
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
DecodehealingTracker-12          852.8n ± 5%   866.8n ± 5%   ~ (p=0.190 n=10)
AppendMsgReplicateDecision-12    0.5383n ± 4%  0.7598n ± 3%  +41.13% (p=0.000 n=10)
AppendMsgResyncTargetsInfo-12    4.785n ± 2%   4.639n ± 3%   -3.06% (p=0.003 n=10)
DataUpdateTracker-12             3.122µ ± 2%   1.880µ ± 3%   -39.77% (p=0.000 n=10)
MarshalMsgdataUsageCacheInfo-12  110.9n ± 2%   109.4n ± 3%   ~ (p=0.101 n=10)
geomean                          59.74n        57.50n        -3.75%
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ B/s │ B/s vs base │
DecodehealingTracker-12        347.8Mi ± 5%  342.2Mi ± 6%  ~ (p=0.190 n=10)
AppendMsgReplicateDecision-12  1.730Gi ± 3%  1.226Gi ± 3%  -29.14% (p=0.000 n=10)
AppendMsgResyncTargetsInfo-12  1.946Gi ± 2%  2.008Gi ± 3%  +3.15% (p=0.003 n=10)
DataUpdateTracker-12           312.5Ki ± 3%  517.6Ki ± 2%  +65.62% (p=0.000 n=10)
geomean                        139.1Mi       145.4Mi       +4.47%
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ B/op │ B/op vs base │
DecodehealingTracker-12          0.000 ± 0%  0.000 ± 0%  ~ (p=1.000 n=10) ¹
AppendMsgReplicateDecision-12    0.000 ± 0%  0.000 ± 0%  ~ (p=1.000 n=10) ¹
AppendMsgResyncTargetsInfo-12    0.000 ± 0%  0.000 ± 0%  ~ (p=1.000 n=10) ¹
DataUpdateTracker-12             340.0 ± 0%  339.0 ± 1%  ~ (p=0.737 n=10)
MarshalMsgdataUsageCacheInfo-12  96.00 ± 0%  96.00 ± 0%  ~ (p=1.000 n=10) ¹
geomean                          ²           -0.06%      ²
¹ all samples are equal
² summaries must be >0 to compute geomean
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ allocs/op │ allocs/op vs base │
DecodehealingTracker-12          0.000 ± 0%  0.000 ± 0%  ~ (p=1.000 n=10) ¹
AppendMsgReplicateDecision-12    0.000 ± 0%  0.000 ± 0%  ~ (p=1.000 n=10) ¹
AppendMsgResyncTargetsInfo-12    0.000 ± 0%  0.000 ± 0%  ~ (p=1.000 n=10) ¹
DataUpdateTracker-12             9.000 ± 0%  9.000 ± 0%  ~ (p=1.000 n=10) ¹
MarshalMsgdataUsageCacheInfo-12  1.000 ± 0%  1.000 ± 0%  ~ (p=1.000 n=10) ¹
geomean                          ²           +0.00%      ²
¹ all samples are equal
² summaries must be >0 to compute geomean

shortname: semver
pkg: github.com/Masterminds/semver
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
ValidateVersionTildeFail-12  854.7n ± 2%  842.7n ± 2%  ~ (p=0.123 n=10)

shortname: shopify_sarama
pkg: github.com/Shopify/sarama
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
Broker_Open-12             212.2µ ± 1%  205.9µ ± 2%  -2.95% (p=0.000 n=10)
Broker_No_Metrics_Open-12  132.9µ ± 1%  121.3µ ± 2%  -8.68% (p=0.000 n=10)
geomean                    167.9µ       158.1µ       -5.86%

shortname: spexs2
pkg: github.com/egonelbre/spexs2/_benchmark
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
Run/10k/1-12   23.29 ± 1%  23.11 ± 2%  ~ (p=0.315 n=10)
Run/10k/16-12  5.648 ± 2%  5.462 ± 4%  -3.30% (p=0.004 n=10)
geomean        11.47       11.23       -2.06%

shortname: sweet-biogo-igor goos: goarch: pkg: cpu:
│ ./sweet/results/biogo-igor/baseline.results │ ./sweet/results/biogo-igor/experiment.results │ │ sec/op │ sec/op vs base │
BiogoIgor  13.53 ± 1%  13.62 ± 1%  ~ (p=0.165 n=10)
│ ./sweet/results/biogo-igor/baseline.results │ ./sweet/results/biogo-igor/experiment.results │ │ average-RSS-bytes │ average-RSS-bytes vs base │
BiogoIgor  62.19Mi ± 3%  62.86Mi ± 1%  ~ (p=0.247 n=10)
│ ./sweet/results/biogo-igor/baseline.results │ ./sweet/results/biogo-igor/experiment.results │ │ peak-RSS-bytes │ peak-RSS-bytes vs base │
BiogoIgor  89.57Mi ± 4%  89.03Mi ± 3%  ~ (p=0.516 n=10)
│ ./sweet/results/biogo-igor/baseline.results │ ./sweet/results/biogo-igor/experiment.results │ │ peak-VM-bytes │ peak-VM-bytes vs base │
BiogoIgor  766.4Mi ± 0%  766.4Mi ± 0%  ~ (p=0.954 n=10)

shortname: sweet-biogo-krishna
│ ./sweet/results/biogo-krishna/baseline.results │ ./sweet/results/biogo-krishna/experiment.results │ │ sec/op │ sec/op vs base │
BiogoKrishna  12.70 ± 2%  12.09 ± 3%  -4.86% (p=0.000 n=10)
│ ./sweet/results/biogo-krishna/baseline.results │ ./sweet/results/biogo-krishna/experiment.results │ │ average-RSS-bytes │ average-RSS-bytes vs base │
BiogoKrishna  4.085Gi ± 0%  4.083Gi ± 0%  ~ (p=0.105 n=10)
│ ./sweet/results/biogo-krishna/baseline.results │ ./sweet/results/biogo-krishna/experiment.results │ │ peak-RSS-bytes │ peak-RSS-bytes vs base │
BiogoKrishna  4.174Gi ± 0%  4.173Gi ± 0%  ~ (p=0.853 n=10)
│ ./sweet/results/biogo-krishna/baseline.results │ ./sweet/results/biogo-krishna/experiment.results │ │ peak-VM-bytes │ peak-VM-bytes vs base │
BiogoKrishna  4.877Gi ± 0%  4.877Gi ± 0%  ~ (p=0.591 n=10)

shortname: sweet-bleve-index
│ ./sweet/results/bleve-index/baseline.results │ ./sweet/results/bleve-index/experiment.results │ │ sec/op │ sec/op vs base │
BleveIndexBatch100  4.675 ± 1%  4.669 ± 1%  ~ (p=0.739 n=10)
│ ./sweet/results/bleve-index/baseline.results │ ./sweet/results/bleve-index/experiment.results │ │ average-RSS-bytes │ average-RSS-bytes vs base │
BleveIndexBatch100  185.5Mi ± 1%  185.9Mi ± 1%  ~ (p=0.796 n=10)
│ ./sweet/results/bleve-index/baseline.results │ ./sweet/results/bleve-index/experiment.results │ │ peak-RSS-bytes │ peak-RSS-bytes vs base │
BleveIndexBatch100  267.5Mi ± 6%  265.0Mi ± 2%  ~ (p=0.739 n=10)
│ ./sweet/results/bleve-index/baseline.results │ ./sweet/results/bleve-index/experiment.results │ │ peak-VM-bytes │ peak-VM-bytes vs base │
BleveIndexBatch100  1.945Gi ± 4%  1.945Gi ± 0%  ~ (p=0.725 n=10)

shortname: sweet-go-build
│ ./sweet/results/go-build/baseline.results │ ./sweet/results/go-build/experiment.results │ │ sec/op │ sec/op vs base │
GoBuildKubelet       51.32 ± 0%  51.38 ± 3%  ~ (p=0.105 n=10)
GoBuildKubeletLink   7.669 ± 1%  7.663 ± 2%  ~ (p=0.579 n=10)
GoBuildIstioctl      46.02 ± 0%  46.07 ± 0%  ~ (p=0.739 n=10)
GoBuildIstioctlLink  8.174 ± 1%  8.143 ± 2%  ~ (p=0.436 n=10)
GoBuildFrontend      16.17 ± 1%  16.10 ± 1%  ~ (p=0.143 n=10)
GoBuildFrontendLink  1.399 ± 3%  1.377 ± 3%  ~ (p=0.218 n=10)
geomean              12.23       12.18       -0.39%

shortname: sweet-gopher-lua
│ ./sweet/results/gopher-lua/baseline.results │ ./sweet/results/gopher-lua/experiment.results │ │ sec/op │ sec/op vs base │
GopherLuaKNucleotide  22.71 ± 1%  22.86 ± 1%  ~ (p=0.218 n=10)
│ ./sweet/results/gopher-lua/baseline.results │ ./sweet/results/gopher-lua/experiment.results │ │ average-RSS-bytes │ average-RSS-bytes vs base │
GopherLuaKNucleotide  36.64Mi ± 2%  36.40Mi ± 1%  ~ (p=0.631 n=10)
│ ./sweet/results/gopher-lua/baseline.results │ ./sweet/results/gopher-lua/experiment.results │ │ peak-RSS-bytes │ peak-RSS-bytes vs base │
GopherLuaKNucleotide  43.28Mi ± 5%  41.55Mi ± 7%  ~ (p=0.089 n=10)
│ ./sweet/results/gopher-lua/baseline.results │ ./sweet/results/gopher-lua/experiment.results │ │ peak-VM-bytes │ peak-VM-bytes vs base │
GopherLuaKNucleotide  699.6Mi ± 0%  699.9Mi ± 0%  +0.04% (p=0.006 n=10)

shortname: sweet-markdown
│ ./sweet/results/markdown/baseline.results │ ./sweet/results/markdown/experiment.results │ │ sec/op │ sec/op vs base │
MarkdownRenderXHTML  260.6m ± 4%  256.4m ± 4%  ~ (p=0.796 n=10)
│ ./sweet/results/markdown/baseline.results │ ./sweet/results/markdown/experiment.results │ │ average-RSS-bytes │ average-RSS-bytes vs base │
MarkdownRenderXHTML  20.47Mi ± 1%  20.71Mi ± 2%  ~ (p=0.393 n=10)
│ ./sweet/results/markdown/baseline.results │ ./sweet/results/markdown/experiment.results │ │ peak-RSS-bytes │ peak-RSS-bytes vs base │
MarkdownRenderXHTML  20.88Mi ± 11%  21.73Mi ± 6%  ~ (p=0.470 n=10)
│ ./sweet/results/markdown/baseline.results │ ./sweet/results/markdown/experiment.results │ │ peak-VM-bytes │ peak-VM-bytes vs base │
MarkdownRenderXHTML  699.2Mi ± 0%  699.3Mi ± 0%  ~ (p=0.464 n=10)

shortname: sweet-tile38
│ ./sweet/results/tile38/baseline.results │ ./sweet/results/tile38/experiment.results │ │ sec/op │ sec/op vs base │
Tile38WithinCircle100kmRequest      529.1µ ± 1%  530.3µ ± 1%  ~ (p=0.143 n=10)
Tile38IntersectsCircle100kmRequest  629.6µ ± 1%  630.8µ ± 1%  ~ (p=0.971 n=10)
Tile38KNearestLimit100Request       446.4µ ± 1%  453.7µ ± 1%  +1.62% (p=0.000 n=10)
geomean                             529.8µ       533.4µ       +0.67%
│ ./sweet/results/tile38/baseline.results │ ./sweet/results/tile38/experiment.results │ │ average-RSS-bytes │ average-RSS-bytes vs base │
Tile38WithinCircle100kmRequest      5.054Gi ± 1%  5.057Gi ± 1%  ~ (p=0.796 n=10)
Tile38IntersectsCircle100kmRequest  5.381Gi ± 0%  5.431Gi ± 1%  +0.94% (p=0.019 n=10)
Tile38KNearestLimit100Request       6.801Gi ± 0%  6.802Gi ± 0%  ~ (p=0.684 n=10)
geomean                             5.697Gi       5.717Gi       +0.34%
│ ./sweet/results/tile38/baseline.results │ ./sweet/results/tile38/experiment.results │ │ peak-RSS-bytes │ peak-RSS-bytes vs base │
Tile38WithinCircle100kmRequest      5.380Gi ± 1%  5.381Gi ± 1%  ~ (p=0.912 n=10)
Tile38IntersectsCircle100kmRequest  5.669Gi ± 1%  5.756Gi ± 1%  +1.53% (p=0.019 n=10)
Tile38KNearestLimit100Request       7.013Gi ± 0%  7.011Gi ± 0%  ~ (p=0.796 n=10)
geomean                             5.980Gi       6.010Gi       +0.50%
│ ./sweet/results/tile38/baseline.results │ ./sweet/results/tile38/experiment.results │ │ peak-VM-bytes │ peak-VM-bytes vs base │
Tile38WithinCircle100kmRequest      6.047Gi ± 1%  6.047Gi ± 1%  ~ (p=0.725 n=10)
Tile38IntersectsCircle100kmRequest  6.305Gi ± 1%  6.402Gi ± 2%  +1.53% (p=0.035 n=10)
Tile38KNearestLimit100Request       7.685Gi ± 0%  7.685Gi ± 0%  ~ (p=0.955 n=10)
geomean                             6.642Gi       6.676Gi       +0.51%
│ ./sweet/results/tile38/baseline.results │ ./sweet/results/tile38/experiment.results │ │ p50-latency-sec │ p50-latency-sec vs base │
Tile38WithinCircle100kmRequest      88.81µ ± 1%  89.36µ ± 1%  +0.61% (p=0.043 n=10)
Tile38IntersectsCircle100kmRequest  151.5µ ± 1%  152.0µ ± 1%  ~ (p=0.089 n=10)
Tile38KNearestLimit100Request       259.0µ ± 0%  259.1µ ± 0%  ~ (p=0.853 n=10)
geomean                             151.6µ       152.1µ       +0.33%
│ ./sweet/results/tile38/baseline.results │ ./sweet/results/tile38/experiment.results │ │ p90-latency-sec │ p90-latency-sec vs base │
Tile38WithinCircle100kmRequest      712.5µ ± 0%  713.9µ ± 1%  ~ (p=0.190 n=10)
Tile38IntersectsCircle100kmRequest  960.6µ ± 1%  958.2µ ± 1%  ~ (p=0.739 n=10)
Tile38KNearestLimit100Request       1.007m ± 1%  1.032m ± 1%  +2.50% (p=0.000 n=10)
geomean                             883.4µ       890.5µ       +0.80%
│ ./sweet/results/tile38/baseline.results │ ./sweet/results/tile38/experiment.results │ │ p99-latency-sec │ p99-latency-sec vs base │
Tile38WithinCircle100kmRequest      7.061m ± 1%  7.085m ± 1%  ~ (p=0.481 n=10)
Tile38IntersectsCircle100kmRequest  7.228m ± 1%  7.187m ± 1%  ~ (p=0.143 n=10)
Tile38KNearestLimit100Request       2.085m ± 0%  2.131m ± 1%  +2.22% (p=0.000 n=10)
geomean                             4.738m       4.770m       +0.66%
│ ./sweet/results/tile38/baseline.results │ ./sweet/results/tile38/experiment.results │ │ ops/s │ ops/s vs base │
Tile38WithinCircle100kmRequest      17.01k ± 1%  16.97k ± 1%  ~ (p=0.143 n=10)
Tile38IntersectsCircle100kmRequest  14.29k ± 1%  14.27k ± 1%  ~ (p=0.988 n=10)
Tile38KNearestLimit100Request       20.16k ± 1%  19.84k ± 1%  -1.59% (p=0.000 n=10)
geomean                             16.99k       16.87k       -0.67%

shortname: uber_tally
goos: linux
goarch: amd64
pkg: github.com/uber-go/tally
cpu: Intel(R) Xeon(R) W-2135 CPU @ 3.70GHz
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
ScopeTaggedNoCachedSubscopes-12  2.867µ ± 4%  2.921µ ± 4%  ~ (p=0.579 n=10)
HistogramAllocation-12           1.519µ ± 3%  1.507µ ± 7%  ~ (p=0.631 n=10)
geomean                          2.087µ       2.098µ       +0.53%
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ B/op │ B/op vs base │
HistogramAllocation-12  1.124Ki ± 1%  1.125Ki ± 4%  ~ (p=0.271 n=10)
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ allocs/op │ allocs/op vs base │
HistogramAllocation-12  20.00 ± 0%  20.00 ± 0%  ~ (p=1.000 n=10) ¹
¹ all samples are equal

shortname: uber_zap
pkg: go.uber.org/zap/zapcore
│ ./bent-bench/20230303T173250.baseline.stdout │ ./bent-bench/20230303T173250.experiment.stdout │ │ sec/op │ sec/op vs base │
BufferedWriteSyncer/write_file_with_buffer-12  296.1n ± 12%  205.9n ± 10%  -30.46% (p=0.000 n=10)
MultiWriteSyncer/2_discarder-12                7.528n ± 4%   7.014n ± 2%   -6.83% (p=0.000 n=10)
MultiWriteSyncer/4_discarder-12                9.065n ± 1%   8.908n ± 1%   -1.73% (p=0.002 n=10)
MultiWriteSyncer/4_discarder_with_buffer-12    225.2n ± 2%   147.6n ± 2%   -34.48% (p=0.000 n=10)
WriteSyncer/write_file_with_no_buffer-12       4.785µ ± 1%   4.933µ ± 3%   +3.08% (p=0.001 n=10)
ZapConsole-12                                  702.5n ± 1%   649.1n ± 1%   -7.62% (p=0.000 n=10)
JSONLogMarshalerFunc-12                        1.219µ ± 2%   1.226µ ± 3%   ~ (p=0.781 n=10)
ZapJSON-12                                     555.4n ± 1%   480.9n ± 3%   -13.40% (p=0.000 n=10)
StandardJSON-12                                814.1n ± 1%   809.0n ± 0%   ~ (p=0.101 n=10)
Sampler_Check/7_keys-12                        10.55n ± 2%   10.61n ± 1%   ~ (p=0.594 n=10)
Sampler_Check/50_keys-12                       11.01n ± 0%   10.98n ± 1%   ~ (p=0.286 n=10)
Sampler_Check/100_keys-12                      10.71n ± 0%   10.71n ± 0%   ~ (p=0.563 n=10)
Sampler_CheckWithHook/7_keys-12                20.20n ± 2%   20.42n ± 2%   ~ (p=0.446 n=10)
Sampler_CheckWithHook/50_keys-12               20.72n ± 2%   21.02n ± 1%   ~ (p=0.078 n=10)
Sampler_CheckWithHook/100_keys-12              20.15n ± 2%   20.68n ± 3%   +2.63% (p=0.037 n=10)
TeeCheck-12                                    140.8n ± 2%   140.5n ± 2%   ~ (p=0.754 n=10)
geomean                                        87.80n        82.39n        -6.15%

The only large regression (in ethereum_bitutil's BaseTest2KB) appears to be
spurious, as the test does not involve any goroutines (or B.RunParallel()),
which profiling confirms.

Updates golang/go#18237
Related to golang/go#32113
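[Editor's note] As a self-contained illustration of the timer-slack behaviour described in the commit message (not part of this change; the file name, the 3µs target, and the iteration count are arbitrary choices), the Linux-only sketch below calls nanosleep directly, the same system call that the runtime's usleep() uses on Linux, and reports how late short sleeps actually fire. With the default 50µs timer slack, the observed mean is expected to be roughly 50µs above the requested 3µs.

// timerslack_demo.go (Linux only): measure how late short nanosleep()s fire.
// Illustrative sketch, not part of the commit above.
package main

import (
	"fmt"
	"syscall"
	"time"
)

func main() {
	const target = 3 * time.Microsecond // same order as the usleep(3) removed by this commit
	const iters = 1000

	var total time.Duration
	for i := 0; i < iters; i++ {
		ts := syscall.NsecToTimespec(int64(target))
		start := time.Now()
		// runtime usleep() on Linux boils down to the nanosleep system call,
		// whose timeout is subject to the calling thread's timer slack
		// (see prctl(2), PR_SET_TIMERSLACK). EINTR is ignored for brevity.
		_ = syscall.Nanosleep(&ts, nil)
		total += time.Since(start)
	}
	fmt.Printf("requested %v per sleep, observed mean %v over %d sleeps\n",
		target, total/time.Duration(iters), iters)
}

Lowering the thread's timer slack with prctl(PR_SET_TIMERSLACK) should shrink the overshoot, which is one way to confirm that the slack, rather than scheduling noise, accounts for most of the delay.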
1 parent cd6d225 commit 4574a3c

5 files changed: +139 -46 lines changed

src/runtime/mgc.go

Lines changed: 1 addition & 1 deletion
@@ -1566,7 +1566,7 @@ func gcSweep(mode gcMode) {
 	lock(&sweep.lock)
 	if sweep.parked {
 		sweep.parked = false
-		ready(sweep.g, 0, true)
+		ready(sweep.g, 0, true, 0, true)
 	}
 	unlock(&sweep.lock)
 }

src/runtime/mgcmark.go

Lines changed: 1 addition & 1 deletion
@@ -690,7 +690,7 @@ func gcFlushBgCredit(scanWork int64) {
 			// scheduler priority to get itself always run
 			// before other goroutines and always in the
 			// fresh quantum started by GC.
-			ready(gp, 0, false)
+			ready(gp, 0, false, 0, true)
 		} else {
 			// Partially satisfy this assist.
 			gp.gcAssistBytes += scanBytes

src/runtime/preempt.go

Lines changed: 1 addition & 1 deletion
@@ -274,7 +274,7 @@ func resumeG(state suspendGState) {
 
 	if state.stopped {
 		// We stopped it, so we need to re-schedule it.
-		ready(gp, 0, true)
+		ready(gp, 0, true, 0, true)
 	}
 }
 

src/runtime/proc.go

Lines changed: 131 additions & 43 deletions
@@ -388,8 +388,18 @@ func goparkunlock(lock *mutex, reason waitReason, traceEv byte, traceskip int) {
 }
 
 func goready(gp *g, traceskip int) {
+	callergp := getg()
+	now := nanotime()
+	needWakep := readyUpdateAndPredictNeedWakep(callergp, now)
+	systemstack(func() {
+		ready(gp, traceskip, true, now, needWakep)
+	})
+}
+
+func goreadyNoWakep(gp *g, traceskip int) {
+	now := nanotime()
 	systemstack(func() {
-		ready(gp, traceskip, true)
+		ready(gp, traceskip, true, now, false /* needWakep */)
 	})
 }
 
@@ -858,8 +868,42 @@ func fastrandinit() {
 	getRandomData(s)
 }
 
+func parkUpdateNeedWakep(gp *g) {
+	gp.firstReady = 0
+	if gp.lastReady != 0 {
+		gp.runnextSwitchHistory = nanotime()-gp.lastReady <= runnextSwitchNS
+		gp.lastReady = 0
+	}
+}
+
+// Returns true if gp, calling goready() or newproc() at now, is predicted
+// *not* to park within the next runnextSwitchNS, such that a spinning P is
+// needed to run the new G.
+// Incorrectly returning true (causing a spinning P to be uselessly woken)
+// wastes cycles but is harmless.
+// Incorrectly returning false is handled by sysmon (in retake()).
+func readyUpdateAndPredictNeedWakep(gp *g, now int64) bool {
+	pp := gp.m.p.ptr()
+
+	// Update predictor state.
+	gp.lastReady = now
+	if now-gp.firstReady > runnextSwitchNS {
+		if gp.firstReady != 0 {
+			gp.runnextSwitchHistory = false
+		}
+		gp.firstReady = now
+	}
+
+	// If the caller's runqueue is non-empty, predict that we need wakep();
+	// even if gp parks, there's no guarantee that following Gs will.
+	if !runqempty(pp) {
+		return true
+	}
+	return !gp.runnextSwitchHistory
+}
+
 // Mark gp ready to run.
-func ready(gp *g, traceskip int, next bool) {
+func ready(gp *g, traceskip int, next bool, now int64, needWakep bool) {
 	if trace.enabled {
 		traceGoUnpark(gp, traceskip)
 	}
@@ -875,8 +919,10 @@ func ready(gp *g, traceskip int, next bool) {
 
 	// status is Gwaiting or Gscanwaiting, make Grunnable and put on runq
 	casgstatus(gp, _Gwaiting, _Grunnable)
-	runqput(mp.p.ptr(), gp, next)
-	wakep()
+	runqput(mp.p.ptr(), gp, next, now)
+	if needWakep {
+		wakep()
+	}
 	releasem(mp)
 }
 
@@ -2715,7 +2761,7 @@ top:
 	// Wake up the finalizer G.
 	if fingStatus.Load()&(fingWait|fingWake) == fingWait|fingWake {
 		if gp := wakefing(); gp != nil {
-			ready(gp, 0, true)
+			ready(gp, 0, true, now, true)
 		}
 	}
 	if *cgo_yield != nil {
@@ -3027,7 +3073,7 @@ func pollWork() bool {
 func stealWork(now int64) (gp *g, inheritTime bool, rnow, pollUntil int64, newWork bool) {
 	pp := getg().m.p.ptr()
 
-	ranTimer := false
+	ranTimerOrRunnextPending := false
 
 	const stealTries = 4
 	for i := 0; i < stealTries; i++ {
@@ -3072,16 +3118,21 @@ func stealWork(now int64) (gp *g, inheritTime bool, rnow, pollUntil int64, newWo
 					// stolen G's. So check now if there
 					// is a local G to run.
 					if gp, inheritTime := runqget(pp); gp != nil {
-						return gp, inheritTime, now, pollUntil, ranTimer
+						return gp, inheritTime, now, pollUntil, ranTimerOrRunnextPending
 					}
-					ranTimer = true
+					ranTimerOrRunnextPending = true
 				}
 			}
 
 			// Don't bother to attempt to steal if p2 is idle.
 			if !idlepMask.read(enum.position()) {
-				if gp := runqsteal(pp, p2, stealTimersOrRunNextG); gp != nil {
-					return gp, false, now, pollUntil, ranTimer
+				tnow, gp, runnextPending := runqsteal(pp, p2, stealTimersOrRunNextG, now)
+				now = tnow
+				if runnextPending {
+					ranTimerOrRunnextPending = true
+				}
+				if gp != nil {
+					return gp, false, now, pollUntil, ranTimerOrRunnextPending
 				}
 			}
 		}
@@ -3090,7 +3141,7 @@ func stealWork(now int64) (gp *g, inheritTime bool, rnow, pollUntil int64, newWo
 	// No goroutines found to steal. Regardless, running a timer may have
 	// made some goroutine ready that we missed. Indicate the next timer to
 	// wait for.
-	return nil, false, now, pollUntil, ranTimer
+	return nil, false, now, pollUntil, ranTimerOrRunnextPending
 }
 
 // Check all Ps for a runnable G to steal.
@@ -3474,6 +3525,8 @@ func parkunlock_c(gp *g, lock unsafe.Pointer) bool {
 
 // park continuation on g0.
 func park_m(gp *g) {
+	parkUpdateNeedWakep(gp)
+
 	mp := getg().m
 
 	if trace.enabled {
@@ -3597,7 +3650,7 @@ func goyield_m(gp *g) {
 	pp := gp.m.p.ptr()
 	casgstatus(gp, _Grunning, _Grunnable)
 	dropg()
-	runqput(pp, gp, false)
+	runqput(pp, gp, false, 0)
 	schedule()
 }
 
@@ -4232,11 +4285,13 @@ func newproc(fn *funcval) {
 	pc := getcallerpc()
 	systemstack(func() {
 		newg := newproc1(fn, gp, pc)
+		now := nanotime()
+		needWakep := readyUpdateAndPredictNeedWakep(gp, now)
 
 		pp := getg().m.p.ptr()
-		runqput(pp, newg, true)
+		runqput(pp, newg, true, now)
 
-		if mainStarted {
+		if mainStarted && needWakep {
 			wakep()
 		}
 	})
@@ -5389,6 +5444,7 @@ func sysmon() {
 		}
 		// retake P's blocked in syscalls
 		// and preempt long running G's
+		// and start P's if we mispredicted that wakep() was unnecessary
 		if retake(now) != 0 {
 			idle = 0
 		} else {
@@ -5424,6 +5480,7 @@ const forcePreemptNS = 10 * 1000 * 1000 // 10ms
 
 func retake(now int64) uint32 {
 	n := 0
+	needWakep := false
 	// Prevent allp slice changes. This lock will be completely
 	// uncontended unless we're already stopping the world.
 	lock(&allpLock)
@@ -5437,6 +5494,9 @@ func retake(now int64) uint32 {
 			// allp but not yet created new Ps.
 			continue
 		}
+		if runqstealable(pp, now) && sched.nmspinning.Load() == 0 {
+			needWakep = true
+		}
 		pd := &pp.sysmontick
 		s := pp.status
 		sysretake := false
@@ -5488,6 +5548,10 @@ func retake(now int64) uint32 {
 		}
 	}
 	unlock(&allpLock)
+	if needWakep {
+		wakep()
+		n++
+	}
 	return uint32(n)
 }
 
@@ -5766,7 +5830,7 @@ func globrunqget(pp *p, max int32) *g {
 	n--
 	for ; n > 0; n-- {
 		gp1 := sched.runq.pop()
-		runqput(pp, gp1, false)
+		runqput(pp, gp1, false, 0)
 	}
 	return gp
 }
@@ -5933,6 +5997,25 @@ func runqempty(pp *p) bool {
 	}
 }
 
+// runqstealable is like !runqempty, but returns false if pp has a G in
+// p.runnext that can't be stolen yet.
+func runqstealable(pp *p, now int64) bool {
+	for {
+		head := atomic.Load(&pp.runqhead)
+		tail := atomic.Load(&pp.runqtail)
+		runnext := atomic.Loaduintptr((*uintptr)(unsafe.Pointer(&pp.runnext)))
+		if tail == atomic.Load(&pp.runqtail) {
+			if head != tail {
+				return true
+			}
+			if runnext == 0 {
+				return false
+			}
+			return ((guintptr)(runnext)).ptr().runnextSince+runnextSwitchNS < now
+		}
+	}
+}
+
 // To shake out latent assumptions about scheduling order,
 // we introduce some randomness into scheduling decisions
 // when running with the race detector.
@@ -5949,12 +6032,16 @@ const randomizeScheduler = raceenabled
 // If next is true, runqput puts g in the pp.runnext slot.
 // If the run queue is full, runnext puts g on the global queue.
 // Executed only by the owner P.
-func runqput(pp *p, gp *g, next bool) {
+func runqput(pp *p, gp *g, next bool, now int64) {
 	if randomizeScheduler && next && fastrandn(2) == 0 {
 		next = false
 	}
 
 	if next {
+		if now == 0 {
+			now = nanotime()
+		}
+		gp.runnextSince = now
 	retryNext:
 		oldnext := pp.runnext
 		if !pp.runnext.cas(oldnext, guintptr(unsafe.Pointer(gp))) {
@@ -6123,11 +6210,22 @@ retry:
 	return
 }
 
+// Don't steal p.runnext if it's been made runnable within the last
+// runnextSwitchNS and the P is running.
+// The important use case here is when the G running on the P ready()s another G
+// and then almost immediately blocks.
+// Giving the P a chance to schedule runnext avoids thrashing Gs between
+// different Ps.
+// On most platforms, sleep timeout granularity is coarser than
+// runnextSwitchNS, so sleeping will result in significant overshoot; instead,
+// stealWork() instructs findRunnable() to spin-wait.
+const runnextSwitchNS = 5e3
+
 // Grabs a batch of goroutines from pp's runnable queue into batch.
 // Batch is a ring buffer starting at batchHead.
 // Returns number of grabbed goroutines.
 // Can be executed by any P.
-func runqgrab(pp *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool) uint32 {
+func runqgrab(pp *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool, now int64) (rnow int64, n uint32, runnextPending bool) {
 	for {
 		h := atomic.LoadAcq(&pp.runqhead) // load-acquire, synchronize with other consumers
 		t := atomic.LoadAcq(&pp.runqtail) // load-acquire, synchronize with the producer
@@ -6138,33 +6236,22 @@ func runqgrab(pp *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool)
 				// Try to steal from pp.runnext.
 				if next := pp.runnext; next != 0 {
 					if pp.status == _Prunning {
-						// Sleep to ensure that pp isn't about to run the g
-						// we are about to steal.
-						// The important use case here is when the g running
-						// on pp ready()s another g and then almost
-						// immediately blocks. Instead of stealing runnext
-						// in this window, back off to give pp a chance to
-						// schedule runnext. This will avoid thrashing gs
-						// between different Ps.
-						// A sync chan send/recv takes ~50ns as of time of
-						// writing, so 3us gives ~50x overshoot.
-						if GOOS != "windows" && GOOS != "openbsd" && GOOS != "netbsd" {
-							usleep(3)
-						} else {
-							// On some platforms system timer granularity is
-							// 1-15ms, which is way too much for this
-							// optimization. So just yield.
-							osyield()
+						// Enforce runnextSwitchNS.
+						if now == 0 {
+							now = nanotime()
+						}
+						if now <= next.ptr().runnextSince+runnextSwitchNS {
+							return now, 0, true
 						}
 					}
 					if !pp.runnext.cas(next, 0) {
 						continue
 					}
 					batch[batchHead%uint32(len(batch))] = next
-					return 1
+					return now, 1, false
 				}
 			}
-			return 0
+			return now, 0, false
 		}
 		if n > uint32(len(pp.runq)/2) { // read inconsistent h and t
 			continue
@@ -6174,31 +6261,32 @@ func runqgrab(pp *p, batch *[256]guintptr, batchHead uint32, stealRunNextG bool)
 			batch[(batchHead+i)%uint32(len(batch))] = g
 		}
 		if atomic.CasRel(&pp.runqhead, h, h+n) { // cas-release, commits consume
-			return n
+			return now, n, false
 		}
 	}
 }
 
 // Steal half of elements from local runnable queue of p2
 // and put onto local runnable queue of p.
 // Returns one of the stolen elements (or nil if failed).
-func runqsteal(pp, p2 *p, stealRunNextG bool) *g {
+func runqsteal(pp, p2 *p, stealRunNextG bool, now int64) (rnow int64, gp *g, runnextPending bool) {
 	t := pp.runqtail
-	n := runqgrab(p2, &pp.runq, t, stealRunNextG)
+	tnow, n, runnextPending := runqgrab(p2, &pp.runq, t, stealRunNextG, now)
+	now = tnow
 	if n == 0 {
-		return nil
+		return now, nil, runnextPending
	}
 	n--
-	gp := pp.runq[(t+n)%uint32(len(pp.runq))].ptr()
+	gp = pp.runq[(t+n)%uint32(len(pp.runq))].ptr()
 	if n == 0 {
-		return gp
+		return now, gp, runnextPending
 	}
 	h := atomic.LoadAcq(&pp.runqhead) // load-acquire, synchronize with consumers
 	if t-h+n >= uint32(len(pp.runq)) {
 		throw("runqsteal: runq overflow")
 	}
 	atomic.StoreRel(&pp.runqtail, t+n) // store-release, makes the item available for consumption
-	return gp
+	return now, gp, runnextPending
 }
 
 // A gQueue is a dequeue of Gs linked through g.schedlink. A G can only
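[Editor's note] For readers who want to exercise the wake-then-block pattern that the runnext back-off in runqgrab() protects (and that benchmarks such as dustin_broadcast stress), here is a minimal, self-contained benchmark sketch. It is an illustration only, not code from this change; the package and benchmark names are arbitrary.

// pingpong_test.go: a minimal benchmark of the wake-then-block pattern
// discussed in the commit message. Illustrative only.
package pingpong

import "testing"

// BenchmarkPingPong bounces a token between two goroutines over unbuffered
// channels. Each send readies the peer and then the sender almost
// immediately blocks on the reply, which is exactly the case where stealing
// p.runnext too eagerly would thrash the two goroutines across Ps.
func BenchmarkPingPong(b *testing.B) {
	ping := make(chan struct{})
	pong := make(chan struct{})
	go func() {
		for range ping {
			pong <- struct{}{}
		}
		close(pong)
	}()
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		ping <- struct{}{}
		<-pong
	}
	b.StopTimer()
	close(ping)
	<-pong
}

Running it with different -cpu values (for example, go test -bench=PingPong -cpu=2,4) exercises the cross-P stealing path that the commit message discusses.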
